From 6958fd5aaf0bfe6a6128c5c0f3b77eb7328fe8a2 Mon Sep 17 00:00:00 2001 From: faycarsons Date: Fri, 23 Aug 2024 18:18:04 -0400 Subject: [PATCH 001/150] convert lexer to use menhir tokens --- dune-project | 10 +- src/parser/ast.ml | 28 +++- src/parser/dune | 3 + src/parser/lexer.mli | 5 +- src/parser/lexer.mll | 297 ++++++++++++++++------------------- src/parser/odoc_parser.ml | 19 +-- src/parser/parser.mly | 57 +++++++ src/parser/test/dune | 1 + src/parser/test/serialize.ml | 169 ++++++++++++++++++++ src/parser/test/test.ml | 179 +-------------------- src/parser/token.ml | 226 ++++++++++++-------------- 11 files changed, 504 insertions(+), 490 deletions(-) create mode 100644 src/parser/parser.mly create mode 100644 src/parser/test/serialize.ml diff --git a/dune-project b/dune-project index 12643f8d30..b079985c00 100644 --- a/dune-project +++ b/dune-project @@ -4,6 +4,8 @@ (documentation "https://ocaml.org/p/odoc") +(using menhir 2.1) + (source (github ocaml/odoc)) @@ -11,20 +13,20 @@ (authors "Anton Bachin " - "Daniel B\195\188nzli " + "Daniel Bünzli " "David Sheets " "Jon Ludlam " "Jules Aguillon " "Leo White " "Lubega Simon " - "Paul-Elliot Angl\195\168s d'Auriac " + "Paul-Elliot Anglès d'Auriac " "Thomas Refis ") (maintainers - "Daniel B\195\188nzli " + "Daniel Bünzli " "Jon Ludlam " "Jules Aguillon " - "Paul-Elliot Angl\195\168s d'Auriac ") + "Paul-Elliot Anglès d'Auriac ") (cram enable) diff --git a/src/parser/ast.ml b/src/parser/ast.ml index 8662f68e8a..7bff58a6d7 100644 --- a/src/parser/ast.ml +++ b/src/parser/ast.ml @@ -1,5 +1,27 @@ (** Abstract syntax tree representing ocamldoc comments *) +type parser_tag = + Author + | Deprecated + | Param of string + | Raise of string + | Return + | See of [ `Url | `File | `Document ] * string + | Since of string + | Before of string + | Version of string + | Canonical of string + | Inline + | Open + | Closed + | Hidden + +type list_kind = [ `Ordered | `Unordered ] +type list_syntax = [ `Light | `Heavy ] +type list_item = [ `Li | `Dash ] + +type table_cell_kind = [ `Header | `Data ] + (** This is a syntactic representation of ocamldoc comments. See {{:https://ocaml.org/releases/4.12/htmlman/ocamldoc.html}The manual} for a detailed description of the syntax understood. Note that there is no attempt at semantic @@ -14,6 +36,7 @@ type alignment = [ `Left | `Center | `Right ] type reference_kind = [ `Simple | `With_text ] (** References in doc comments can be of two kinds: [{!simple}] or [{{!ref}With text}]. *) + type inline_element = [ `Space of string | `Word of string @@ -56,8 +79,8 @@ and nestable_block_element = | `Verbatim of string | `Modules of string with_location list | `List of - [ `Unordered | `Ordered ] - * [ `Light | `Heavy ] + list_kind + * list_syntax * nestable_block_element with_location list list | `Table of table | `Math_block of string (** @since 2.0.0 *) @@ -69,6 +92,7 @@ and nestable_block_element = {{:https://ocaml.org/releases/4.12/htmlman/ocamldoc.html#sss:ocamldoc-list}manual}). *) + and table = nestable_block_element abstract_table * [ `Light | `Heavy ] type internal_tag = diff --git a/src/parser/dune b/src/parser/dune index e7a3d1ce4c..9b9e8e1a7b 100644 --- a/src/parser/dune +++ b/src/parser/dune @@ -1,5 +1,8 @@ (ocamllex lexer) +(menhir + (modules parser)) + (library (name odoc_parser) (public_name odoc-parser) diff --git a/src/parser/lexer.mli b/src/parser/lexer.mli index ce053b495e..ff39d6c13b 100644 --- a/src/parser/lexer.mli +++ b/src/parser/lexer.mli @@ -3,8 +3,7 @@ type input = { file : string; offset_to_location : int -> Loc.point; - warnings : Warning.t list ref; - lexbuf : Lexing.lexbuf; + mutable warnings : Warning.t list; } -val token : input -> Lexing.lexbuf -> Token.t Loc.with_location +val token : input -> Lexing.lexbuf -> Parser.token diff --git a/src/parser/lexer.mll b/src/parser/lexer.mll index 864c32c25b..1b6895df73 100644 --- a/src/parser/lexer.mll +++ b/src/parser/lexer.mll @@ -1,5 +1,7 @@ { + open Parser + let unescape_word : string -> string = fun s -> (* The common case is that there are no escape sequences. *) match String.index s '\\' with @@ -32,8 +34,8 @@ type math_kind = let math_constr kind x = match kind with - | Inline -> `Math_span x - | Block -> `Math_block x + | Inline -> Math_span x + | Block -> Math_block x (* This is used for code and verbatim blocks. It can be done with a regular expression, but the regexp gets quite ugly, so a function is easier to @@ -139,16 +141,15 @@ let trim_leading_whitespace : first_line_offset:int -> string -> string = type input = { file : string; offset_to_location : int -> Loc.point; - warnings : Warning.t list ref; - lexbuf : Lexing.lexbuf; + mutable warnings : Warning.t list; } let with_location_adjustments - k input ?start_offset ?adjust_start_by ?end_offset ?adjust_end_by value = + k lexbuf input ?start_offset ?adjust_start_by ?end_offset ?adjust_end_by value = let start = match start_offset with - | None -> Lexing.lexeme_start input.lexbuf + | None -> Lexing.lexeme_start lexbuf | Some s -> s in let start = @@ -158,7 +159,7 @@ let with_location_adjustments in let end_ = match end_offset with - | None -> Lexing.lexeme_end input.lexbuf + | None -> Lexing.lexeme_end lexbuf | Some e -> e in let end_ = @@ -179,44 +180,23 @@ let emit = let warning = with_location_adjustments (fun input location error -> - input.warnings := (error location) :: !(input.warnings)) + input.warnings <- (error location) :: input.warnings) let reference_token media start target input lexbuf = match start with - | "{!" -> `Simple_reference target - | "{{!" -> `Begin_reference_with_replacement_text target - | "{:" -> `Simple_link (target) - | "{{:" -> `Begin_link_with_replacement_text (target) - - | "{image!" -> `Simple_media (`Reference target, `Image) - | "{image:" -> `Simple_media (`Link target, `Image) - | "{audio!" -> `Simple_media (`Reference target, `Audio) - | "{audio:" -> `Simple_media (`Link target, `Audio) - | "{video!" -> `Simple_media (`Reference target, `Video) - | "{video:" -> `Simple_media (`Link target, `Video) + | "{!" -> Simple_ref target + | "{{!" -> Ref_with_replacement target + | "{:" -> Simple_link target + | "{{:" -> Link_with_replacement target + | _ -> assert false - | _ -> - let target, kind = - match start with - | "{{image!" -> `Reference target, `Image - | "{{image:" -> `Link target, `Image - | "{{audio!" -> `Reference target, `Audio - | "{{audio:" -> `Link target, `Audio - | "{{video!" -> `Reference target, `Video - | "{{video:" -> `Link target, `Video - | _ -> assert false - in - let token_descr = Token.describe (`Media_with_replacement_text (target, kind, "")) in - let content = media token_descr (Buffer.create 1024) 0 (Lexing.lexeme_start lexbuf) input lexbuf in - `Media_with_replacement_text (target, kind, content) - -let trim_leading_space_or_accept_whitespace input start_offset text = +let trim_leading_space_or_accept_whitespace lexbuf input start_offset text = match text.[0] with | ' ' -> String.sub text 1 (String.length text - 1) | '\t' | '\r' | '\n' -> text | exception Invalid_argument _ -> "" | _ -> - warning + warning lexbuf input ~start_offset ~end_offset:(start_offset + 2) @@ -230,13 +210,13 @@ let trim_trailing_space_or_accept_whitespace text = | _ -> text | exception Invalid_argument _ -> text -let emit_verbatim input start_offset buffer = - let t = Buffer.contents buffer in - let t = trim_trailing_space_or_accept_whitespace t in - let t = trim_leading_space_or_accept_whitespace input start_offset t in - let t = trim_leading_blank_lines t in - let t = trim_trailing_blank_lines t in - emit input (`Verbatim t) ~start_offset +let emit_verbatim lexbuf input start_offset buffer = + let t = Buffer.contents buffer + |> trim_trailing_space_or_accept_whitespace + |> trim_leading_space_or_accept_whitespace lexbuf input start_offset + |> trim_leading_blank_lines + |> trim_trailing_blank_lines in + emit lexbuf input (Verbatim t) ~start_offset (* The locations have to be treated carefully in this function. We need to ensure that the []`Code_block] location matches the entirety of the block including the terminator, @@ -244,24 +224,25 @@ let emit_verbatim input start_offset buffer = Note that the location reflects the content _without_ stripping of whitespace, whereas the value of the content in the tree has whitespace stripped from the beginning, and trailing empty lines removed. *) -let emit_code_block ~start_offset content_offset input metadata delim terminator c has_results = - let c = Buffer.contents c |> trim_trailing_blank_lines in +let emit_code_block ~start_offset ~content_offset ~lexbuf input meta delimiter terminator buffer output = let content_location = input.offset_to_location content_offset in - let c = - with_location_adjustments - (fun _ _location c -> - let first_line_offset = content_location.column in - trim_leading_whitespace ~first_line_offset c) - input c - in - let c = trim_leading_blank_lines c in - let c = with_location_adjustments ~adjust_end_by:terminator ~start_offset:content_offset (fun _ -> Loc.at) input c in - emit ~start_offset input (`Code_block (metadata, delim, c, has_results)) - -let heading_level input level = + let content = + Buffer.contents buffer + |> trim_trailing_blank_lines + |> with_location_adjustments + (fun _ _location c -> + let first_line_offset = content_location.column in + trim_leading_whitespace ~first_line_offset c) + lexbuf + input + |> trim_leading_blank_lines + |> with_location_adjustments ~adjust_end_by:terminator ~start_offset:content_offset (fun _ -> Loc.at) lexbuf input in + emit ~start_offset lexbuf input (Code_block { meta; delimiter; content; output }) + +let heading_level lexbuf input level = if String.length level >= 2 && level.[0] = '0' then begin warning - input ~start_offset:1 (Parse_error.leading_zero_in_heading_level level) + lexbuf input ~start_offset:1 (Parse_error.leading_zero_in_heading_level level) end; int_of_string level @@ -321,6 +302,7 @@ rule reference_paren_content input start ref_offset start_offset depth_paren (depth_paren - 1) buffer lexbuf } | eof { warning + lexbuf input ~start_offset (Parse_error.unclosed_bracket ~bracket:"(") ; @@ -349,6 +331,7 @@ and reference_content input start start_offset buffer = parse } | eof { warning + lexbuf input ~start_offset (Parse_error.unclosed_bracket ~bracket:start) ; @@ -360,61 +343,61 @@ and reference_content input start start_offset buffer = parse and token input = parse | horizontal_space* eof - { emit input `End } + { emit lexbuf input END } | ((horizontal_space* newline as prefix) horizontal_space* ((newline horizontal_space*)+ as suffix) as ws) - { emit input (`Blank_line ws) ~adjust_start_by:prefix ~adjust_end_by:suffix } + { emit lexbuf input (Blank_line ws) ~adjust_start_by:prefix ~adjust_end_by:suffix } | (horizontal_space* newline horizontal_space* as ws) - { emit input (`Single_newline ws) } + { emit lexbuf input (Single_newline ws) } | (horizontal_space+ as ws) - { emit input (`Space ws) } + { emit lexbuf input (Space ws) } | (horizontal_space* (newline horizontal_space*)? as p) '}' - { emit input `Right_brace ~adjust_start_by:p } + { emit lexbuf input RIGHT_BRACE ~adjust_start_by:p } | '|' - { emit input `Bar } + { emit lexbuf input BAR } | word_char (word_char | bullet_char | '@')* | bullet_char (word_char | bullet_char | '@')+ as w - { emit input (`Word (unescape_word w)) } + { emit lexbuf input (Word (unescape_word w)) } | '[' { code_span (Buffer.create 1024) 0 (Lexing.lexeme_start lexbuf) input lexbuf } | '-' - { emit input `Minus } + { emit lexbuf input MINUS } | '+' - { emit input `Plus } + { emit lexbuf input PLUS } | "{b" - { emit input (`Begin_style `Bold) } + { emit lexbuf input (Begin_style `Bold) } | "{i" - { emit input (`Begin_style `Italic) } + { emit lexbuf input (Begin_style `Italic) } | "{e" - { emit input (`Begin_style `Emphasis) } + { emit lexbuf input (Begin_style `Emphasis) } | "{L" - { emit input (`Begin_paragraph_style `Left) } + { emit lexbuf input (Begin_paragraph_style `Left) } | "{C" - { emit input (`Begin_paragraph_style `Center) } + { emit lexbuf input (Begin_paragraph_style `Center) } | "{R" - { emit input (`Begin_paragraph_style `Right) } + { emit lexbuf input (Begin_paragraph_style `Right) } | "{^" - { emit input (`Begin_style `Superscript) } + { emit lexbuf input (Begin_style `Superscript) } | "{_" - { emit input (`Begin_style `Subscript) } + { emit lexbuf input (Begin_style `Subscript) } | "{math" space_char { math Block (Buffer.create 1024) 0 (Lexing.lexeme_start lexbuf) input lexbuf } @@ -424,7 +407,7 @@ and token input = parse | "{!modules:" ([^ '}']* as modules) '}' - { emit input (`Modules modules) } + { emit lexbuf input (Modules modules) } | (media_start as start) { @@ -432,47 +415,37 @@ and token input = parse let target = reference_content input start start_offset (Buffer.create 16) lexbuf in - let token = reference_token media start target input lexbuf in - emit ~start_offset input token } + let token = (reference_token start target) in + emit ~start_offset lexbuf input token } | "{[" - { code_block false (Lexing.lexeme_start lexbuf) (Lexing.lexeme_end lexbuf) None (Buffer.create 256) "" input lexbuf } + { code_block (Lexing.lexeme_start lexbuf) (Lexing.lexeme_end lexbuf) None (Buffer.create 256) "" input lexbuf } - | (("{" (delim_char* as delim) "@" horizontal_space*) as prefix) (language_tag_char+ as lang_tag_) + | (("{" (delim_char* as delimiter) "@" horizontal_space*) as prefix) (language_tag_char+ as lang_tag_) { let start_offset = Lexing.lexeme_start lexbuf in let lang_tag = - with_location_adjustments ~adjust_start_by:prefix (fun _ -> Loc.at) input lang_tag_ + with_location_adjustments ~adjust_start_by:prefix (fun _ -> Loc.at) lexbuf input lang_tag_ in let emit_truncated_code_block () = - let empty_content = with_location_adjustments (fun _ -> Loc.at) input "" in - emit ~start_offset input (`Code_block (Some (lang_tag, None), delim, empty_content, false)) - in - (* Disallow result block sections for code blocks without a delimiter. - This avoids the surprising parsing of '][' ending the code block. *) - let allow_result_block = delim <> "" in - let code_block_with_metadata metadata = - let content_offset = Lexing.lexeme_end lexbuf in - let metadata = Some (lang_tag, metadata) in - let prefix = Buffer.create 256 in - code_block allow_result_block start_offset content_offset metadata - prefix delim input lexbuf + let empty_content = with_location_adjustments (fun _ -> Loc.at) lexbuf input "" in + emit ~start_offset lexbuf input (Code_block { meta = Some { language = lang_tag; tags = None }; delimiter = Some delimiter; content = empty_content; output = None}) in match code_block_metadata_tail input lexbuf with - | `Ok metadata -> code_block_with_metadata metadata - | `Eof -> - warning input ~start_offset Parse_error.truncated_code_block_meta; + | Ok metadata -> code_block start_offset (Lexing.lexeme_end lexbuf) (Some (lang_tag, metadata)) (Buffer.create 256) delimiter input lexbuf + | Error `Eof -> + warning lexbuf input ~start_offset Parse_error.truncated_code_block_meta; emit_truncated_code_block () - | `Invalid_char c -> - warning input ~start_offset + | Error (`Invalid_char c) -> + warning lexbuf input ~start_offset (Parse_error.language_tag_invalid_char lang_tag_ c); - code_block_with_metadata None + code_block start_offset (Lexing.lexeme_end lexbuf) (Some (lang_tag, None)) (Buffer.create 256) delimiter input lexbuf } | "{@" horizontal_space* '[' { - warning input Parse_error.no_language_tag_in_meta; - code_block false (Lexing.lexeme_start lexbuf) (Lexing.lexeme_end lexbuf) None (Buffer.create 256) "" input lexbuf + warning lexbuf input Parse_error.no_language_tag_in_meta; + code_block (Lexing.lexeme_start lexbuf) (Lexing.lexeme_end lexbuf) None (Buffer.create 256) "" input lexbuf } | "{v" @@ -481,97 +454,98 @@ and token input = parse | "{%" ((raw_markup_target as target) ':')? (raw_markup as s) ("%}" | eof as e) - { let token = `Raw_markup (target, s) in + { let token = Raw_markup (target, s) in if e <> "%}" then warning + lexbuf input ~start_offset:(Lexing.lexeme_end lexbuf) (Parse_error.not_allowed - ~what:(Token.describe `End) + ~what:(Token.describe END) ~in_what:(Token.describe token)); - emit input token } + emit lexbuf input token } | "{ul" - { emit input (`Begin_list `Unordered) } + { emit input (List `Unordered) } | "{ol" - { emit input (`Begin_list `Ordered) } + { emit input (List `Ordered) } | "{li" - { emit input (`Begin_list_item `Li) } + { emit input (List_item `Li) } | "{-" - { emit input (`Begin_list_item `Dash) } + { emit input (List_item `Dash) } | "{table" - { emit input (`Begin_table_heavy) } + { emit input Table_heavy } | "{t" - { emit input (`Begin_table_light) } + { emit input Table_light } | "{tr" - { emit input `Begin_table_row } + { emit input Table_row } | "{th" - { emit input (`Begin_table_cell `Header) } + { emit input (Table_cell `Header) } | "{td" - { emit input (`Begin_table_cell `Data) } + { emit input (Table_cell `Data) } | '{' (['0'-'9']+ as level) ':' (([^ '}'] # space_char)* as label) { emit - input (`Begin_section_heading (heading_level input level, Some label)) } + input (Section_heading (heading_level input level, Some label)) } | '{' (['0'-'9']+ as level) - { emit input (`Begin_section_heading (heading_level input level, None)) } + { emit input (Section_heading (heading_level input level, None)) } | "@author" ((horizontal_space+ [^ '\r' '\n']*)? as author) - { emit input (`Tag (`Author author)) } + { emit input (Tag (`Author author)) } | "@deprecated" - { emit input (`Tag `Deprecated) } + { emit input (Tag `Deprecated) } | "@param" horizontal_space+ ((_ # space_char)+ as name) - { emit input (`Tag (`Param name)) } + { emit input (Tag (`Param name)) } | ("@raise" | "@raises") horizontal_space+ ((_ # space_char)+ as name) - { emit input (`Tag (`Raise name)) } + { emit input (Tag (`Raise name)) } | ("@return" | "@returns") - { emit input (`Tag `Return) } + { emit input (Tag `Return) } | "@see" horizontal_space* '<' ([^ '>']* as url) '>' - { emit input (`Tag (`See (`Url, url))) } + { emit input (Tag (`See (`Url, url))) } | "@see" horizontal_space* '\'' ([^ '\'']* as filename) '\'' - { emit input (`Tag (`See (`File, filename))) } + { emit input (Tag (`See (`File, filename))) } | "@see" horizontal_space* '"' ([^ '"']* as name) '"' - { emit input (`Tag (`See (`Document, name))) } + { emit input (Tag (`See (`Document, name))) } | "@since" ((horizontal_space+ [^ '\r' '\n']*)? as version) - { emit input (`Tag (`Since version)) } + { emit input (Tag (`Since version)) } | "@before" horizontal_space+ ((_ # space_char)+ as version) - { emit input (`Tag (`Before version)) } + { emit input (Tag (`Before version)) } | "@version" ((horizontal_space+ [^ '\r' '\n']*)? as version) - { emit input (`Tag (`Version version)) } + { emit input (Tag (`Version version)) } | "@canonical" ((horizontal_space+ [^ '\r' '\n']*)? as identifier) - { emit input (`Tag (`Canonical identifier)) } + { emit input (Tag (`Canonical identifier)) } | "@inline" - { emit input (`Tag `Inline) } + { emit input (Tag `Inline) } | "@open" - { emit input (`Tag `Open) } + { emit input (Tag `Open) } | "@closed" - { emit input (`Tag `Closed) } + { emit input (Tag `Closed) } | "@hidden" - { emit input (`Tag `Hidden) } + { emit input (Tag `Hidden) } | "]}" { emit input `Right_code_delimiter} @@ -591,27 +565,27 @@ and token input = parse | "@param" { warning input Parse_error.truncated_param; - emit input (`Tag (`Param "")) } + emit input (Tag (Param "")) } | ("@raise" | "@raises") as tag { warning input (Parse_error.truncated_raise tag); - emit input (`Tag (`Raise "")) } + emit input (Tag (Raise "")) } | "@before" { warning input Parse_error.truncated_before; - emit input (`Tag (`Before "")) } + emit input (Tag (Before "")) } | "@see" { warning input Parse_error.truncated_see; - emit input (`Word "@see") } + emit input (Word "@see") } | '@' ['a'-'z' 'A'-'Z']+ as tag { warning input (Parse_error.unknown_tag tag); - emit input (`Word tag) } + emit input (Word tag) } | '@' { warning input Parse_error.stray_at; - emit input (`Word "@") } + emit input (Word "@") } | '\r' { warning input Parse_error.stray_cr; @@ -629,7 +603,7 @@ and token input = parse and code_span buffer nesting_level start_offset input = parse | ']' { if nesting_level = 0 then - emit input (`Code_span (Buffer.contents buffer)) ~start_offset + emit input (Code_span (Buffer.contents buffer)) ~start_offset else begin Buffer.add_char buffer ']'; code_span buffer (nesting_level - 1) start_offset input lexbuf @@ -647,8 +621,8 @@ and code_span buffer nesting_level start_offset input = parse { warning input (Parse_error.not_allowed - ~what:(Token.describe (`Blank_line "\n\n")) - ~in_what:(Token.describe (`Code_span ""))); + ~what:(Token.describe (Blank_line "\n\n")) + ~in_what:(Token.describe (Code_span ""))); Buffer.add_char buffer ' '; code_span buffer nesting_level start_offset input lexbuf } | newline horizontal_space* @@ -659,9 +633,9 @@ and code_span buffer nesting_level start_offset input = parse { warning input (Parse_error.not_allowed - ~what:(Token.describe `End) - ~in_what:(Token.describe (`Code_span ""))); - emit input (`Code_span (Buffer.contents buffer)) ~start_offset } + ~what:(Token.describe END) + ~in_what:(Token.describe (Code_span ""))); + emit input (Code_span (Buffer.contents buffer)) ~start_offset } | _ as c { Buffer.add_char buffer c; @@ -689,7 +663,7 @@ and math kind buffer nesting_level start_offset input = parse warning input (Parse_error.not_allowed - ~what:(Token.describe (`Blank_line "\n")) + ~what:(Token.describe (Blank_line "\n")) ~in_what:(Token.describe (math_constr kind ""))); Buffer.add_char buffer '\n'; math kind buffer nesting_level start_offset input lexbuf @@ -701,7 +675,7 @@ and math kind buffer nesting_level start_offset input = parse { warning input (Parse_error.not_allowed - ~what:(Token.describe `End) + ~what:(Token.describe END) ~in_what:(Token.describe (math_constr kind ""))); emit input (math_constr kind (Buffer.contents buffer)) ~start_offset } | _ as c @@ -753,8 +727,8 @@ and verbatim buffer last_false_terminator start_offset input = parse warning input (Parse_error.not_allowed - ~what:(Token.describe `End) - ~in_what:(Token.describe (`Verbatim ""))) + ~what:(Token.describe END) + ~in_what:(Token.describe (Verbatim ""))) | Some location -> warning input @@ -778,7 +752,7 @@ and bad_markup_recovery start_offset input = parse input ~start_offset (Parse_error.bad_markup ("{" ^ rest) ~suggestion); - emit input (`Code_span text) ~start_offset} + emit input (Code_span text) ~start_offset} (* The second field of the metadata. This rule keeps whitespaces and newlines in the 'metadata' field except the @@ -791,33 +765,29 @@ and code_block_metadata_tail input = parse let meta = with_location_adjustments ~adjust_start_by:prefix ~adjust_end_by:suffix (fun _ -> Loc.at) input meta in - `Ok (Some meta) + Ok (Some meta) } | (newline | horizontal_space)* '[' - { `Ok None } + { Ok None } | _ as c - { `Invalid_char c } + { Error (`Invalid_char c) } | eof - { `Eof } + { Error `Eof } -and code_block allow_result_block start_offset content_offset metadata prefix delim input = parse +and code_block start_offset content_offset metadata prefix delim input = parse | ("]" (delim_char* as delim') "[") as terminator - { if delim = delim' && allow_result_block + { if delim = delim' then emit_code_block ~start_offset content_offset input metadata delim terminator prefix true - else ( - Buffer.add_string prefix terminator; - code_block allow_result_block start_offset content_offset metadata - prefix delim input lexbuf - ) - } + else + (Buffer.add_string prefix terminator; + code_block start_offset content_offset metadata prefix delim input lexbuf) } | ("]" (delim_char* as delim') "}") as terminator { if delim = delim' then emit_code_block ~start_offset content_offset input metadata delim terminator prefix false else ( Buffer.add_string prefix terminator; - code_block allow_result_block start_offset content_offset metadata - prefix delim input lexbuf + code_block start_offset content_offset metadata prefix delim input lexbuf ) } | eof @@ -828,6 +798,5 @@ and code_block allow_result_block start_offset content_offset metadata prefix de | (_ as c) { Buffer.add_char prefix c; - code_block allow_result_block start_offset content_offset metadata - prefix delim input lexbuf + code_block start_offset content_offset metadata prefix delim input lexbuf } diff --git a/src/parser/odoc_parser.ml b/src/parser/odoc_parser.ml index 2d9fdd5de1..316dc577cc 100644 --- a/src/parser/odoc_parser.ml +++ b/src/parser/odoc_parser.ml @@ -101,20 +101,15 @@ let position_of_point : t -> Loc.point -> Lexing.position = (* The main entry point for this module *) let parse_comment ~location ~text = - let warnings = ref [] in let reversed_newlines = reversed_newlines ~input:text in - let token_stream = - let lexbuf = Lexing.from_string text in - let offset_to_location = - offset_to_location ~reversed_newlines ~comment_location:location - in - let input : Lexer.input = - { file = location.Lexing.pos_fname; offset_to_location; warnings; lexbuf } - in - Stream.from (fun _token_index -> Some (Lexer.token input lexbuf)) + let lexbuf = Lexing.from_string text in + let lexer_state = + Lexer.{ warnings = [] + ; offset_to_location = offset_to_location ~reversed_newlines ~comment_location:location + ; file = "foo.ml" } in - let ast, warnings = Syntax.parse warnings token_stream in - { ast; warnings; reversed_newlines; original_pos = location } + let ast = Parser.main (Lexer.token lexer_state) lexbuf in + { ast; warnings = lexer_state.warnings; reversed_newlines; original_pos = location } (* Accessor functions, as [t] is opaque *) let warnings t = t.warnings diff --git a/src/parser/parser.mly b/src/parser/parser.mly new file mode 100644 index 0000000000..642a3dd777 --- /dev/null +++ b/src/parser/parser.mly @@ -0,0 +1,57 @@ +%{ +%} + +%token SPACE NEWLINE +%token RIGHT_BRACE +%token RIGHT_CODE_DELIMITER +%token COMMENT + +%token Blank_line +%token Single_newline +%token Space + + +%token Word + +%token MINUS PLUS BAR + +%token Style +%token Paragraph_style + +%token Modules + +%token Math_span +%token Math_block + +%token Raw_markup + +%token Tag + +%token Code_block +%token Code_span + +%token List +%token List_item + +%token Table_light +%token Table_heavy +%token Table_row +%token Table_cell + +%token Section_heading + +%token Simple_ref +%token Ref_with_replacement +%token Simple_link +%token Link_with_replacement + +%token Verbatim + +%token END + +%start main +%% + +let main := + | SPACE; { [] } + | NEWLINE; { [] } diff --git a/src/parser/test/dune b/src/parser/test/dune index c6769550dd..6f2295b846 100644 --- a/src/parser/test/dune +++ b/src/parser/test/dune @@ -7,4 +7,5 @@ (backend landmarks --auto)) (preprocess (pps ppx_expect)) + (modules serialize) (libraries sexplib0 odoc-parser)) diff --git a/src/parser/test/serialize.ml b/src/parser/test/serialize.ml new file mode 100644 index 0000000000..4d7e0ebf7d --- /dev/null +++ b/src/parser/test/serialize.ml @@ -0,0 +1,169 @@ + open Odoc_parser + +type sexp = Sexplib0.Sexp.t = Atom of string | List of sexp list + +module Location_to_sexp = struct + let point : Loc.point -> sexp = + fun { line; column } -> + List [ Atom (string_of_int line); Atom (string_of_int column) ] + + let span : Loc.span -> sexp = + fun { file; start; end_ } -> List [ Atom file; point start; point end_ ] + + let at : ('a -> sexp) -> 'a Loc.with_location -> sexp = + fun f { location; value } -> List [ span location; f value ] +end + +module Ast_to_sexp = struct + (* let at = Location_to_sexp.at *) + type at = { at : 'a. ('a -> sexp) -> 'a Loc.with_location -> sexp } + + let loc_at = { at = Location_to_sexp.at } + let str s = Atom s + let opt f s = match s with Some s -> List [ f s ] | None -> List [] + + let style : Ast.style -> sexp = function + | `Bold -> Atom "bold" + | `Italic -> Atom "italic" + | `Emphasis -> Atom "emphasis" + | `Superscript -> Atom "superscript" + | `Subscript -> Atom "subscript" + + let alignment : Ast.alignment option -> sexp = function + | Some `Left -> Atom "left" + | Some `Center -> Atom "center" + | Some `Right -> Atom "right" + | None -> Atom "default" + + let reference_kind : Ast.reference_kind -> sexp = function + | `Simple -> Atom "simple" + | `With_text -> Atom "with_text" + + let rec inline_element at : Ast.inline_element -> sexp = function + | `Space _ -> Atom "space" + | `Word w -> List [ Atom "word"; Atom w ] + | `Code_span c -> List [ Atom "code_span"; Atom c ] + | `Raw_markup (target, s) -> + List [ Atom "raw_markup"; opt str target; Atom s ] + | `Math_span s -> List [ Atom "math_span"; Atom s ] + | `Styled (s, es) -> + List [ style s; List (List.map (at.at (inline_element at)) es) ] + | `Reference (kind, r, es) -> + List + [ + reference_kind kind; + at.at str r; + List (List.map (at.at (inline_element at)) es); + ] + | `Link (u, es) -> + List [ str u; List (List.map (at.at (inline_element at)) es) ] + + let code_block_lang at { Ast.language; tags } = + List [ at.at str language; opt (at.at str) tags ] + + let rec nestable_block_element at : Ast.nestable_block_element -> sexp = + function + | `Paragraph es -> + List + [ Atom "paragraph"; List (List.map (at.at (inline_element at)) es) ] + | `Math_block s -> List [ Atom "math_block"; Atom s ] + | `Code_block { Ast.meta = None; content; output = None; _ } -> + List [ Atom "code_block"; at.at str content ] + | `Code_block { meta = Some meta; content; output = None; _ } -> + List [ Atom "code_block"; code_block_lang at meta; at.at str content ] + | `Code_block { meta = Some meta; content; output = Some output; _ } -> + List + [ + Atom "code_block"; + code_block_lang at meta; + at.at str content; + List + (List.map (fun v -> nestable_block_element at v.Loc.value) output); + ] + | `Code_block { meta = None; content = _; output = Some _output; _ } -> + List [ Atom "code_block_err" ] + | `Verbatim t -> List [ Atom "verbatim"; Atom t ] + | `Modules ps -> List [ Atom "modules"; List (List.map (at.at str) ps) ] + | `List (kind, weight, items) -> + let kind = + match kind with `Unordered -> "unordered" | `Ordered -> "ordered" + in + let weight = + match weight with `Light -> "light" | `Heavy -> "heavy" + in + let items = + items + |> List.map (fun item -> + List (List.map (at.at (nestable_block_element at)) item)) + |> fun items -> List items + in + List [ Atom kind; Atom weight; items ] + | `Table ((grid, align), s) -> + let syntax = function `Light -> "light" | `Heavy -> "heavy" in + let kind = function `Header -> "header" | `Data -> "data" in + let map name x f = List [ Atom name; List (List.map f x) ] in + let alignment = + match align with + | None -> List [ Atom "align"; Atom "no alignment" ] + | Some align -> map "align" align @@ alignment + in + List + [ + Atom "table"; + List [ Atom "syntax"; Atom (syntax s) ]; + ( map "grid" grid @@ fun row -> + map "row" row @@ fun (cell, k) -> + map (kind k) cell @@ at.at (nestable_block_element at) ); + alignment; + ] + + let tag at : Ast.tag -> sexp = function + | `Author s -> List [ Atom "@author"; Atom s ] + | `Deprecated es -> + List + (Atom "@deprecated" :: List.map (at.at (nestable_block_element at)) es) + | `Param (s, es) -> + List + ([ Atom "@param"; Atom s ] + @ List.map (at.at (nestable_block_element at)) es) + | `Raise (s, es) -> + List + ([ Atom "@raise"; Atom s ] + @ List.map (at.at (nestable_block_element at)) es) + | `Return es -> + List (Atom "@return" :: List.map (at.at (nestable_block_element at)) es) + | `See (kind, s, es) -> + let kind = + match kind with + | `Url -> "url" + | `File -> "file" + | `Document -> "document" + in + List + ([ Atom "@see"; Atom kind; Atom s ] + @ List.map (at.at (nestable_block_element at)) es) + | `Since s -> List [ Atom "@since"; Atom s ] + | `Before (s, es) -> + List + ([ Atom "@before"; Atom s ] + @ List.map (at.at (nestable_block_element at)) es) + | `Version s -> List [ Atom "@version"; Atom s ] + | `Canonical p -> List [ Atom "@canonical"; at.at str p ] + | `Inline -> Atom "@inline" + | `Open -> Atom "@open" + | `Closed -> Atom "@closed" + | `Hidden -> Atom "@hidden" + + let block_element at : Ast.block_element -> sexp = function + | #Ast.nestable_block_element as e -> nestable_block_element at e + | `Heading (level, label, es) -> + let label = List [ Atom "label"; opt str label ] in + let level = string_of_int level in + List + [ Atom level; label; List (List.map (at.at (inline_element at)) es) ] + | `Tag t -> tag at t + + let docs at : Ast.t -> sexp = + fun f -> List (List.map (at.at (block_element at)) f) +end + diff --git a/src/parser/test/test.ml b/src/parser/test/test.ml index 175828dc67..74b068a1d8 100644 --- a/src/parser/test/test.ml +++ b/src/parser/test/test.ml @@ -1,182 +1,5 @@ open Odoc_parser - -type sexp = Sexplib0.Sexp.t = Atom of string | List of sexp list - -module Location_to_sexp = struct - let point : Loc.point -> sexp = - fun { line; column } -> - List [ Atom (string_of_int line); Atom (string_of_int column) ] - - let span : Loc.span -> sexp = - fun { file; start; end_ } -> List [ Atom file; point start; point end_ ] - - let at : ('a -> sexp) -> 'a Loc.with_location -> sexp = - fun f { location; value } -> List [ span location; f value ] -end - -module Ast_to_sexp = struct - (* let at = Location_to_sexp.at *) - type at = { at : 'a. ('a -> sexp) -> 'a Loc.with_location -> sexp } - - let loc_at = { at = Location_to_sexp.at } - let str s = Atom s - let opt f s = match s with Some s -> List [ f s ] | None -> List [] - - let style : Ast.style -> sexp = function - | `Bold -> Atom "bold" - | `Italic -> Atom "italic" - | `Emphasis -> Atom "emphasis" - | `Superscript -> Atom "superscript" - | `Subscript -> Atom "subscript" - - let alignment : Ast.alignment option -> sexp = function - | Some `Left -> Atom "left" - | Some `Center -> Atom "center" - | Some `Right -> Atom "right" - | None -> Atom "default" - - let reference_kind : Ast.reference_kind -> sexp = function - | `Simple -> Atom "simple" - | `With_text -> Atom "with_text" - - let media : Ast.media -> sexp = function - | `Image -> Atom "image" - | `Video -> Atom "video" - | `Audio -> Atom "audio" - - let rec inline_element at : Ast.inline_element -> sexp = function - | `Space _ -> Atom "space" - | `Word w -> List [ Atom "word"; Atom w ] - | `Code_span c -> List [ Atom "code_span"; Atom c ] - | `Raw_markup (target, s) -> - List [ Atom "raw_markup"; opt str target; Atom s ] - | `Math_span s -> List [ Atom "math_span"; Atom s ] - | `Styled (s, es) -> - List [ style s; List (List.map (at.at (inline_element at)) es) ] - | `Reference (kind, r, es) -> - List - [ - reference_kind kind; - at.at str r; - List (List.map (at.at (inline_element at)) es); - ] - | `Link (u, es) -> - List [ str u; List (List.map (at.at (inline_element at)) es) ] - - let code_block_lang at { Ast.language; tags } = - List [ at.at str language; opt (at.at str) tags ] - - let media_href = function - | `Reference href -> List [ Atom "Reference"; Atom href ] - | `Link href -> List [ Atom "Link"; Atom href ] - - let rec nestable_block_element at : Ast.nestable_block_element -> sexp = - function - | `Paragraph es -> - List - [ Atom "paragraph"; List (List.map (at.at (inline_element at)) es) ] - | `Math_block s -> List [ Atom "math_block"; Atom s ] - | `Code_block { Ast.meta = None; content; output = None; _ } -> - List [ Atom "code_block"; at.at str content ] - | `Code_block { meta = Some meta; content; output = None; _ } -> - List [ Atom "code_block"; code_block_lang at meta; at.at str content ] - | `Code_block { meta = Some meta; content; output = Some output; _ } -> - List - [ - Atom "code_block"; - code_block_lang at meta; - at.at str content; - List - (List.map (fun v -> nestable_block_element at v.Loc.value) output); - ] - | `Code_block { meta = None; content = _; output = Some _output; _ } -> - List [ Atom "code_block_err" ] - | `Verbatim t -> List [ Atom "verbatim"; Atom t ] - | `Modules ps -> List [ Atom "modules"; List (List.map (at.at str) ps) ] - | `List (kind, weight, items) -> - let kind = - match kind with `Unordered -> "unordered" | `Ordered -> "ordered" - in - let weight = - match weight with `Light -> "light" | `Heavy -> "heavy" - in - let items = - items - |> List.map (fun item -> - List (List.map (at.at (nestable_block_element at)) item)) - |> fun items -> List items - in - List [ Atom kind; Atom weight; items ] - | `Table ((grid, align), s) -> - let syntax = function `Light -> "light" | `Heavy -> "heavy" in - let kind = function `Header -> "header" | `Data -> "data" in - let map name x f = List [ Atom name; List (List.map f x) ] in - let alignment = - match align with - | None -> List [ Atom "align"; Atom "no alignment" ] - | Some align -> map "align" align @@ alignment - in - List - [ - Atom "table"; - List [ Atom "syntax"; Atom (syntax s) ]; - ( map "grid" grid @@ fun row -> - map "row" row @@ fun (cell, k) -> - map (kind k) cell @@ at.at (nestable_block_element at) ); - alignment; - ] - | `Media (kind, href, c, m) -> - List [ reference_kind kind; at.at media_href href; Atom c; media m ] - - let tag at : Ast.tag -> sexp = function - | `Author s -> List [ Atom "@author"; Atom s ] - | `Deprecated es -> - List - (Atom "@deprecated" :: List.map (at.at (nestable_block_element at)) es) - | `Param (s, es) -> - List - ([ Atom "@param"; Atom s ] - @ List.map (at.at (nestable_block_element at)) es) - | `Raise (s, es) -> - List - ([ Atom "@raise"; Atom s ] - @ List.map (at.at (nestable_block_element at)) es) - | `Return es -> - List (Atom "@return" :: List.map (at.at (nestable_block_element at)) es) - | `See (kind, s, es) -> - let kind = - match kind with - | `Url -> "url" - | `File -> "file" - | `Document -> "document" - in - List - ([ Atom "@see"; Atom kind; Atom s ] - @ List.map (at.at (nestable_block_element at)) es) - | `Since s -> List [ Atom "@since"; Atom s ] - | `Before (s, es) -> - List - ([ Atom "@before"; Atom s ] - @ List.map (at.at (nestable_block_element at)) es) - | `Version s -> List [ Atom "@version"; Atom s ] - | `Canonical p -> List [ Atom "@canonical"; at.at str p ] - | `Inline -> Atom "@inline" - | `Open -> Atom "@open" - | `Closed -> Atom "@closed" - | `Hidden -> Atom "@hidden" - - let block_element at : Ast.block_element -> sexp = function - | #Ast.nestable_block_element as e -> nestable_block_element at e - | `Heading (level, label, es) -> - let label = List [ Atom "label"; opt str label ] in - let level = string_of_int level in - List - [ Atom level; label; List (List.map (at.at (inline_element at)) es) ] - | `Tag t -> tag at t - - let docs at : Ast.t -> sexp = - fun f -> List (List.map (at.at (block_element at)) f) -end +open Serialize let error err = Atom (Odoc_parser.Warning.to_string err) diff --git a/src/parser/token.ml b/src/parser/token.ml index 4db3b14eb5..c2f0a53230 100644 --- a/src/parser/token.ml +++ b/src/parser/token.ml @@ -1,3 +1,5 @@ +(* NOTE : (@faycarsons) keep as reference for the moment *) + (* This module contains the token type, emitted by the lexer, and consumed by the comment syntax parser. It also contains two functions that format tokens for error messages. *) @@ -6,6 +8,7 @@ type section_heading = [ `Begin_section_heading of int * string option ] type style = [ `Bold | `Italic | `Emphasis | `Superscript | `Subscript ] type paragraph_style = [ `Left | `Center | `Right ] + type tag = [ `Tag of [ `Author of string @@ -101,143 +104,112 @@ type t = | section_heading | tag ] -let print : [< t ] -> string = function - | `Begin_paragraph_style `Left -> "'{L'" - | `Begin_paragraph_style `Center -> "'{C'" - | `Begin_paragraph_style `Right -> "'{R'" - | `Begin_style `Bold -> "'{b'" - | `Begin_style `Italic -> "'{i'" - | `Begin_style `Emphasis -> "'{e'" - | `Begin_style `Superscript -> "'{^'" - | `Begin_style `Subscript -> "'{_'" - | `Begin_reference_with_replacement_text _ -> "'{{!'" - | `Begin_link_with_replacement_text _ -> "'{{:'" - | `Begin_list_item `Li -> "'{li ...}'" - | `Begin_list_item `Dash -> "'{- ...}'" - | `Begin_table_light -> "{t" - | `Begin_table_heavy -> "{table" - | `Begin_table_row -> "'{tr'" - | `Begin_table_cell `Header -> "'{th'" - | `Begin_table_cell `Data -> "'{td'" - | `Minus -> "'-'" - | `Plus -> "'+'" - | `Bar -> "'|'" - | `Begin_section_heading (level, label) -> +open Parser + +let print : Parser.token -> string = function + | Paragraph_style `Left -> "'{L'" + | Paragraph_style `Center -> "'{C'" + | Paragraph_style `Right -> "'{R'" + | Style `Bold -> "'{b'" + | Style `Italic -> "'{i'" + | Style `Emphasis -> "'{e'" + | Style `Superscript -> "'{^'" + | Style `Subscript -> "'{_'" + | Ref_with_replacement _ -> "'{{!'" + | Link_with_replacement _ -> "'{{:'" + | List_item `Li -> "'{li ...}'" + | List_item `Dash -> "'{- ...}'" + | Table_light -> "{t" + | Table_heavy -> "{table" + | Table_row -> "'{tr'" + | Table_cell `Header -> "'{th'" + | Table_cell `Data -> "'{td'" + | MINUS -> "'-'" + | PLUS -> "'+'" + | BAR -> "'|'" + | Section_heading (level, label) -> let label = match label with None -> "" | Some label -> ":" ^ label in Printf.sprintf "'{%i%s'" level label - | `Tag (`Author _) -> "'@author'" - | `Tag `Deprecated -> "'@deprecated'" - | `Tag (`Param _) -> "'@param'" - | `Tag (`Raise _) -> "'@raise'" - | `Tag `Return -> "'@return'" - | `Tag (`See _) -> "'@see'" - | `Tag (`Since _) -> "'@since'" - | `Tag (`Before _) -> "'@before'" - | `Tag (`Version _) -> "'@version'" - | `Tag (`Canonical _) -> "'@canonical'" - | `Tag `Inline -> "'@inline'" - | `Tag `Open -> "'@open'" - | `Tag `Closed -> "'@closed'" - | `Tag `Hidden -> "'@hidden" - | `Raw_markup (None, _) -> "'{%...%}'" - | `Raw_markup (Some target, _) -> "'{%" ^ target ^ ":...%}'" - | `Simple_media (`Reference _, `Image) -> "{image!...}" - | `Simple_media (`Reference _, `Audio) -> "{audio!...}" - | `Simple_media (`Reference _, `Video) -> "{video!...}" - | `Simple_media (`Link _, `Image) -> "{image:...}" - | `Simple_media (`Link _, `Audio) -> "{audio:...}" - | `Simple_media (`Link _, `Video) -> "{video:...}" - | `Media_with_replacement_text (`Reference _, `Image, _) -> - "{{image!...} ...}" - | `Media_with_replacement_text (`Reference _, `Audio, _) -> - "{{audio!...} ...}" - | `Media_with_replacement_text (`Reference _, `Video, _) -> - "{{video!...} ...}" - | `Media_with_replacement_text (`Link _, `Image, _) -> "{{image:...} ...}" - | `Media_with_replacement_text (`Link _, `Audio, _) -> "{{audio:...} ...}" - | `Media_with_replacement_text (`Link _, `Video, _) -> "{{video:...} ...}" + | Tag (Author _) -> "'@author'" + | Tag Deprecated -> "'@deprecated'" + | Tag (Param _) -> "'@param'" + | Tag (Raise _) -> "'@raise'" + | Tag Return -> "'@return'" + | Tag (See _) -> "'@see'" + | Tag (Since _) -> "'@since'" + | Tag (Before _) -> "'@before'" + | Tag (Version _) -> "'@version'" + | Tag (Canonical _) -> "'@canonical'" + | Tag Inline -> "'@inline'" + | Tag Open -> "'@open'" + | Tag Closed -> "'@closed'" + | Tag Hidden -> "'@hidden" + | Raw_markup (None, _) -> "'{%...%}'" + | Raw_markup (Some target, _) -> "'{%" ^ target ^ ":...%}'" (* [`Minus] and [`Plus] are interpreted as if they start list items. Therefore, for error messages based on [Token.describe] to be accurate, formatted [`Minus] and [`Plus] should always be plausibly list item bullets. *) -let describe : [< t | `Comment ] -> string = function - | `Word w -> Printf.sprintf "'%s'" w - | `Code_span _ -> "'[...]' (code)" - | `Raw_markup _ -> "'{%...%}' (raw markup)" - | `Begin_paragraph_style `Left -> "'{L ...}' (left alignment)" - | `Begin_paragraph_style `Center -> "'{C ...}' (center alignment)" - | `Begin_paragraph_style `Right -> "'{R ...}' (right alignment)" - | `Begin_style `Bold -> "'{b ...}' (boldface text)" - | `Begin_style `Italic -> "'{i ...}' (italic text)" - | `Begin_style `Emphasis -> "'{e ...}' (emphasized text)" - | `Begin_style `Superscript -> "'{^...}' (superscript)" - | `Begin_style `Subscript -> "'{_...}' (subscript)" - | `Math_span _ -> "'{m ...}' (math span)" - | `Math_block _ -> "'{math ...}' (math block)" - | `Simple_reference _ -> "'{!...}' (cross-reference)" - | `Begin_reference_with_replacement_text _ -> +let describe : Parser.token -> string = function + | Word w -> Printf.sprintf "'%s'" w + | Code_span _ -> "'[...]' (code)" + | Raw_markup _ -> "'{%...%}' (raw markup)" + | Paragraph_style `Left -> "'{L ...}' (left alignment)" + | Paragraph_style `Center -> "'{C ...}' (center alignment)" + | Paragraph_style `Right -> "'{R ...}' (right alignment)" + | Style `Bold -> "'{b ...}' (boldface text)" + | Style `Italic -> "'{i ...}' (italic text)" + | Style `Emphasis -> "'{e ...}' (emphasized text)" + | Style `Superscript -> "'{^...}' (superscript)" + | Style `Subscript -> "'{_...}' (subscript)" + | Math_span _ -> "'{m ...}' (math span)" + | Math_block _ -> "'{math ...}' (math block)" + | Simple_ref _ -> "'{!...}' (cross-reference)" + | Ref_with_replacement _ -> "'{{!...} ...}' (cross-reference)" - | `Simple_media (`Reference _, `Image) -> "'{image!...}' (image-reference)" - | `Simple_media (`Reference _, `Audio) -> "'{audio!...}' (audio-reference)" - | `Simple_media (`Reference _, `Video) -> "'{video!...}' (video-reference)" - | `Simple_media (`Link _, `Image) -> "'{image:...}' (image-link)" - | `Simple_media (`Link _, `Audio) -> "'{audio:...}' (audio-link)" - | `Simple_media (`Link _, `Video) -> "'{video:...}' (video-link)" - | `Media_with_replacement_text (`Reference _, `Image, _) -> - "'{{image!...} ...}' (image-reference)" - | `Media_with_replacement_text (`Reference _, `Audio, _) -> - "'{{audio!...} ...}' (audio-reference)" - | `Media_with_replacement_text (`Reference _, `Video, _) -> - "'{{video!...} ...}' (video-reference)" - | `Media_with_replacement_text (`Link _, `Image, _) -> - "'{{image:...} ...}' (image-link)" - | `Media_with_replacement_text (`Link _, `Audio, _) -> - "'{{audio:...} ...}' (audio-link)" - | `Media_with_replacement_text (`Link _, `Video, _) -> - "'{{video:...} ...}' (video-link)" - | `Simple_link _ -> "'{:...} (external link)'" - | `Begin_link_with_replacement_text _ -> "'{{:...} ...}' (external link)" - | `End -> "end of text" - | `Space _ -> "whitespace" - | `Single_newline _ -> "line break" - | `Blank_line _ -> "blank line" - | `Right_brace -> "'}'" - | `Right_code_delimiter -> "']}'" - | `Code_block _ -> "'{[...]}' (code block)" - | `Verbatim _ -> "'{v ... v}' (verbatim text)" - | `Modules _ -> "'{!modules ...}'" - | `Begin_list `Unordered -> "'{ul ...}' (bulleted list)" - | `Begin_list `Ordered -> "'{ol ...}' (numbered list)" - | `Begin_list_item `Li -> "'{li ...}' (list item)" - | `Begin_list_item `Dash -> "'{- ...}' (list item)" - | `Begin_table_light -> "'{t ...}' (table)" - | `Begin_table_heavy -> "'{table ...}' (table)" - | `Begin_table_row -> "'{tr ...}' (table row)" - | `Begin_table_cell `Header -> "'{th ... }' (table header cell)" - | `Begin_table_cell `Data -> "'{td ... }' (table data cell)" - | `Minus -> "'-' (bulleted list item)" - | `Plus -> "'+' (numbered list item)" - | `Bar -> "'|'" - | `Begin_section_heading (level, _) -> + | Simple_link _ -> "'{:...} (external link)'" + | Link_with_replacement _ -> "'{{:...} ...}' (external link)" + | END -> "end of text" + | SPACE _ -> "whitespace" + | Single_newline _ -> "line break" + | Blank_line _ -> "blank line" + | RIGHT_BRACE -> "'}'" + | RIGHT_CODE_DELIMITER -> "']}'" + | Code_block _ -> "'{[...]}' (code block)" + | Verbatim _ -> "'{v ... v}' (verbatim text)" + | Modules _ -> "'{!modules ...}'" + | List `Unordered -> "'{ul ...}' (bulleted list)" + | List `Ordered -> "'{ol ...}' (numbered list)" + | List_item `Li -> "'{li ...}' (list item)" + | List_item `Dash -> "'{- ...}' (list item)" + | Table_light -> "'{t ...}' (table)" + | Table_heavy -> "'{table ...}' (table)" + | Table_row -> "'{tr ...}' (table row)" + | Table_cell `Header -> "'{th ... }' (table header cell)" + | Table_cell `Data -> "'{td ... }' (table data cell)" + | MINUS -> "'-' (bulleted list item)" + | PLUS -> "'+' (numbered list item)" + | BAR -> "'|'" + | Section_heading (level, _) -> Printf.sprintf "'{%i ...}' (section heading)" level - | `Tag (`Author _) -> "'@author'" - | `Tag `Deprecated -> "'@deprecated'" - | `Tag (`Param _) -> "'@param'" - | `Tag (`Raise _) -> "'@raise'" - | `Tag `Return -> "'@return'" - | `Tag (`See _) -> "'@see'" - | `Tag (`Since _) -> "'@since'" - | `Tag (`Before _) -> "'@before'" - | `Tag (`Version _) -> "'@version'" - | `Tag (`Canonical _) -> "'@canonical'" - | `Tag `Inline -> "'@inline'" - | `Tag `Open -> "'@open'" - | `Tag `Closed -> "'@closed'" - | `Tag `Hidden -> "'@hidden" - | `Comment -> "top-level text" + | Tag (Author _) -> "'@author'" + | Tag Deprecated -> "'@deprecated'" + | Tag (Param _) -> "'@param'" + | Tag (Raise _) -> "'@raise'" + | Tag Return -> "'@return'" + | Tag (See _) -> "'@see'" + | Tag (Since _) -> "'@since'" + | Tag (Before _) -> "'@before'" + | Tag (Version _) -> "'@version'" + | Tag (Canonical _) -> "'@canonical'" + | Tag Inline -> "'@inline'" + | Tag Open -> "'@open'" + | Tag Closed -> "'@closed'" + | Tag Hidden -> "'@hidden" + | COMMENT -> "top-level text" let describe_element = function - | `Reference (`Simple, _, _) -> describe (`Simple_reference "") + | `Reference (`Simple, _, _) -> describe (Simple_ref "") | `Reference (`With_text, _, _) -> describe (`Begin_reference_with_replacement_text "") | `Link _ -> describe (`Begin_link_with_replacement_text "") From 6654f63788eca20f26e308559066705a20f82e54 Mon Sep 17 00:00:00 2001 From: faycarsons Date: Mon, 26 Aug 2024 14:17:49 -0400 Subject: [PATCH 002/150] lexer uses new tokens --- src/parser/ast.ml | 4 +- src/parser/lexer.mll | 164 ++++++++++++++++++++++-------------------- src/parser/syntax.ml | 5 ++ src/parser/syntax.mli | 6 +- src/parser/token.ml | 19 ++--- 5 files changed, 109 insertions(+), 89 deletions(-) diff --git a/src/parser/ast.ml b/src/parser/ast.ml index 7bff58a6d7..4b98235d57 100644 --- a/src/parser/ast.ml +++ b/src/parser/ast.ml @@ -1,11 +1,11 @@ (** Abstract syntax tree representing ocamldoc comments *) type parser_tag = - Author + Author of string | Deprecated | Param of string | Raise of string - | Return + | Return | See of [ `Url | `File | `Document ] * string | Since of string | Before of string diff --git a/src/parser/lexer.mll b/src/parser/lexer.mll index 1b6895df73..6407ec0670 100644 --- a/src/parser/lexer.mll +++ b/src/parser/lexer.mll @@ -212,10 +212,10 @@ let trim_trailing_space_or_accept_whitespace text = let emit_verbatim lexbuf input start_offset buffer = let t = Buffer.contents buffer - |> trim_trailing_space_or_accept_whitespace - |> trim_leading_space_or_accept_whitespace lexbuf input start_offset - |> trim_leading_blank_lines - |> trim_trailing_blank_lines in + |> trim_trailing_space_or_accept_whitespace + |> trim_leading_space_or_accept_whitespace lexbuf input start_offset + |> trim_leading_blank_lines + |> trim_trailing_blank_lines in emit lexbuf input (Verbatim t) ~start_offset (* The locations have to be treated carefully in this function. We need to ensure that @@ -376,28 +376,28 @@ and token input = parse { emit lexbuf input PLUS } | "{b" - { emit lexbuf input (Begin_style `Bold) } + { emit lexbuf input (Style `Bold) } | "{i" - { emit lexbuf input (Begin_style `Italic) } + { emit lexbuf input (Style `Italic) } | "{e" - { emit lexbuf input (Begin_style `Emphasis) } + { emit lexbuf input (Style `Emphasis) } | "{L" - { emit lexbuf input (Begin_paragraph_style `Left) } + { emit lexbuf input (Paragraph_style `Left) } | "{C" - { emit lexbuf input (Begin_paragraph_style `Center) } + { emit lexbuf input (Paragraph_style `Center) } | "{R" - { emit lexbuf input (Begin_paragraph_style `Right) } + { emit lexbuf input (Paragraph_style `Right) } | "{^" - { emit lexbuf input (Begin_style `Superscript) } + { emit lexbuf input (Style `Superscript) } | "{_" - { emit lexbuf input (Begin_style `Subscript) } + { emit lexbuf input (Style `Subscript) } | "{math" space_char { math Block (Buffer.create 1024) 0 (Lexing.lexeme_start lexbuf) input lexbuf } @@ -432,14 +432,16 @@ and token input = parse emit ~start_offset lexbuf input (Code_block { meta = Some { language = lang_tag; tags = None }; delimiter = Some delimiter; content = empty_content; output = None}) in match code_block_metadata_tail input lexbuf with - | Ok metadata -> code_block start_offset (Lexing.lexeme_end lexbuf) (Some (lang_tag, metadata)) (Buffer.create 256) delimiter input lexbuf + | Ok metadata -> code_block start_offset (Lexing.lexeme_end lexbuf) (Some metadata) (Buffer.create 256) delimiter input lexbuf | Error `Eof -> warning lexbuf input ~start_offset Parse_error.truncated_code_block_meta; emit_truncated_code_block () | Error (`Invalid_char c) -> warning lexbuf input ~start_offset (Parse_error.language_tag_invalid_char lang_tag_ c); - code_block start_offset (Lexing.lexeme_end lexbuf) (Some (lang_tag, None)) (Buffer.create 256) delimiter input lexbuf + + (* NOTE : Metadata should not be `None`, fix this!! *) + code_block start_offset (Lexing.lexeme_end lexbuf) None (Buffer.create 256) delimiter input lexbuf } | "{@" horizontal_space* '[' @@ -466,144 +468,145 @@ and token input = parse emit lexbuf input token } | "{ul" - { emit input (List `Unordered) } + { emit lexbuf input (List `Unordered) } | "{ol" - { emit input (List `Ordered) } + { emit lexbuf input (List `Ordered) } | "{li" - { emit input (List_item `Li) } + { emit lexbuf input (List_item `Li) } | "{-" - { emit input (List_item `Dash) } + { emit lexbuf input (List_item `Dash) } | "{table" - { emit input Table_heavy } + { emit lexbuf input Table_heavy } | "{t" - { emit input Table_light } + { emit lexbuf input Table_light } | "{tr" - { emit input Table_row } + { emit lexbuf input Table_row } | "{th" - { emit input (Table_cell `Header) } + { emit lexbuf input (Table_cell `Header) } | "{td" - { emit input (Table_cell `Data) } + { emit lexbuf input (Table_cell `Data) } | '{' (['0'-'9']+ as level) ':' (([^ '}'] # space_char)* as label) - { emit - input (Section_heading (heading_level input level, Some label)) } + { emit lexbuf + input (Section_heading (heading_level lexbuf input level, Some label)) } | '{' (['0'-'9']+ as level) - { emit input (Section_heading (heading_level input level, None)) } + { emit lexbuf input (Section_heading (heading_level lexbuf input level, None)) } | "@author" ((horizontal_space+ [^ '\r' '\n']*)? as author) - { emit input (Tag (`Author author)) } + { emit lexbuf input (Tag (Author author)) } | "@deprecated" - { emit input (Tag `Deprecated) } + { emit lexbuf input (Tag Deprecated) } | "@param" horizontal_space+ ((_ # space_char)+ as name) - { emit input (Tag (`Param name)) } + { emit lexbuf input (Tag (Param name)) } | ("@raise" | "@raises") horizontal_space+ ((_ # space_char)+ as name) - { emit input (Tag (`Raise name)) } + { emit lexbuf input (Tag (Raise name)) } | ("@return" | "@returns") - { emit input (Tag `Return) } + { emit lexbuf input (Tag Return) } | "@see" horizontal_space* '<' ([^ '>']* as url) '>' - { emit input (Tag (`See (`Url, url))) } + { emit lexbuf input (Tag (See (`Url, url))) } | "@see" horizontal_space* '\'' ([^ '\'']* as filename) '\'' - { emit input (Tag (`See (`File, filename))) } + { emit lexbuf input (Tag (See (`File, filename))) } | "@see" horizontal_space* '"' ([^ '"']* as name) '"' - { emit input (Tag (`See (`Document, name))) } + { emit lexbuf input (Tag (See (`Document, name))) } | "@since" ((horizontal_space+ [^ '\r' '\n']*)? as version) - { emit input (Tag (`Since version)) } + { emit lexbuf input (Tag (Since version)) } | "@before" horizontal_space+ ((_ # space_char)+ as version) - { emit input (Tag (`Before version)) } + { emit lexbuf input (Tag (Before version)) } | "@version" ((horizontal_space+ [^ '\r' '\n']*)? as version) - { emit input (Tag (`Version version)) } + { emit lexbuf input (Tag (Version version)) } | "@canonical" ((horizontal_space+ [^ '\r' '\n']*)? as identifier) - { emit input (Tag (`Canonical identifier)) } + { emit lexbuf input (Tag (Canonical identifier)) } | "@inline" - { emit input (Tag `Inline) } + { emit lexbuf input (Tag Inline) } | "@open" - { emit input (Tag `Open) } + { emit lexbuf input (Tag Open) } | "@closed" - { emit input (Tag `Closed) } + { emit lexbuf input (Tag Closed) } | "@hidden" - { emit input (Tag `Hidden) } + { emit lexbuf input (Tag Hidden) } | "]}" - { emit input `Right_code_delimiter} + { emit lexbuf input RIGHT_CODE_DELIMITER} | '{' { try bad_markup_recovery (Lexing.lexeme_start lexbuf) input lexbuf with Failure _ -> - warning + warning lexbuf input (Parse_error.bad_markup "{" ~suggestion:"escape the brace with '\\{'."); - emit input (`Word "{") } + emit lexbuf input (Word "{") } | ']' - { warning input Parse_error.unpaired_right_bracket; - emit input (`Word "]") } + { warning lexbuf input Parse_error.unpaired_right_bracket; + emit lexbuf input (Word "]") } | "@param" - { warning input Parse_error.truncated_param; - emit input (Tag (Param "")) } + { warning lexbuf input Parse_error.truncated_param; + emit lexbuf input (Tag (Param "")) } | ("@raise" | "@raises") as tag - { warning input (Parse_error.truncated_raise tag); - emit input (Tag (Raise "")) } + { warning lexbuf input (Parse_error.truncated_raise tag); + emit lexbuf input (Tag (Raise "")) } | "@before" - { warning input Parse_error.truncated_before; - emit input (Tag (Before "")) } + { warning lexbuf input Parse_error.truncated_before; + emit lexbuf input (Tag (Before "")) } | "@see" - { warning input Parse_error.truncated_see; - emit input (Word "@see") } + { warning lexbuf input Parse_error.truncated_see; + emit lexbuf input (Word "@see") } | '@' ['a'-'z' 'A'-'Z']+ as tag - { warning input (Parse_error.unknown_tag tag); - emit input (Word tag) } + { warning lexbuf input (Parse_error.unknown_tag tag); + emit lexbuf input (Word tag) } | '@' - { warning input Parse_error.stray_at; - emit input (Word "@") } + { warning lexbuf input Parse_error.stray_at; + emit lexbuf input (Word "@") } | '\r' - { warning input Parse_error.stray_cr; + { warning lexbuf input Parse_error.stray_cr; token input lexbuf } | "{!modules:" ([^ '}']* as modules) eof - { warning + { warning + lexbuf input ~start_offset:(Lexing.lexeme_end lexbuf) (Parse_error.not_allowed - ~what:(Token.describe `End) - ~in_what:(Token.describe (`Modules ""))); - emit input (`Modules modules) } + ~what:(Token.describe END) + ~in_what:(Token.describe (Modules ""))); + emit lexbuf input (Modules modules) } and code_span buffer nesting_level start_offset input = parse | ']' { if nesting_level = 0 then - emit input (Code_span (Buffer.contents buffer)) ~start_offset + emit lexbuf input (Code_span (Buffer.contents buffer)) ~start_offset else begin Buffer.add_char buffer ']'; code_span buffer (nesting_level - 1) start_offset input lexbuf @@ -619,6 +622,7 @@ and code_span buffer nesting_level start_offset input = parse | newline horizontal_space* (newline horizontal_space*)+ { warning + lexbuf input (Parse_error.not_allowed ~what:(Token.describe (Blank_line "\n\n")) @@ -631,11 +635,12 @@ and code_span buffer nesting_level start_offset input = parse | eof { warning + lexbuf input (Parse_error.not_allowed ~what:(Token.describe END) ~in_what:(Token.describe (Code_span ""))); - emit input (Code_span (Buffer.contents buffer)) ~start_offset } + emit lexbuf input (Code_span (Buffer.contents buffer)) ~start_offset } | _ as c { Buffer.add_char buffer c; @@ -644,7 +649,7 @@ and code_span buffer nesting_level start_offset input = parse and math kind buffer nesting_level start_offset input = parse | '}' { if nesting_level == 0 then - emit input (math_constr kind (Buffer.contents buffer)) ~start_offset + emit lexbuf input (math_constr kind (Buffer.contents buffer)) ~start_offset else begin Buffer.add_char buffer '}'; math kind buffer (nesting_level - 1) start_offset input lexbuf @@ -660,7 +665,8 @@ and math kind buffer nesting_level start_offset input = parse { match kind with | Inline -> - warning + warning + lexbuf input (Parse_error.not_allowed ~what:(Token.describe (Blank_line "\n")) @@ -673,11 +679,12 @@ and math kind buffer nesting_level start_offset input = parse } | eof { warning + lexbuf input (Parse_error.not_allowed ~what:(Token.describe END) ~in_what:(Token.describe (math_constr kind ""))); - emit input (math_constr kind (Buffer.contents buffer)) ~start_offset } + emit lexbuf input (math_constr kind (Buffer.contents buffer)) ~start_offset } | _ as c { Buffer.add_char buffer c; math kind buffer nesting_level start_offset input lexbuf } @@ -714,7 +721,7 @@ and media tok_descr buffer nesting_level start_offset input = parse and verbatim buffer last_false_terminator start_offset input = parse | (space_char as c) "v}" { Buffer.add_char buffer c; - emit_verbatim input start_offset buffer } + emit_verbatim lexbuf input start_offset buffer } | "v}" { Buffer.add_string buffer "v}"; @@ -725,18 +732,20 @@ and verbatim buffer last_false_terminator start_offset input = parse { begin match last_false_terminator with | None -> warning + lexbuf input (Parse_error.not_allowed ~what:(Token.describe END) ~in_what:(Token.describe (Verbatim ""))) | Some location -> warning + lexbuf input ~start_offset:location ~end_offset:(location + 2) Parse_error.no_trailing_whitespace_in_verbatim end; - emit_verbatim input start_offset buffer } + emit_verbatim lexbuf input start_offset buffer } | _ as c { Buffer.add_char buffer c; @@ -749,10 +758,11 @@ and bad_markup_recovery start_offset input = parse { let suggestion = Printf.sprintf "did you mean '{!%s}' or '[%s]'?" text text in warning + lexbuf input ~start_offset (Parse_error.bad_markup ("{" ^ rest) ~suggestion); - emit input (Code_span text) ~start_offset} + emit lexbuf input (Code_span text) ~start_offset} (* The second field of the metadata. This rule keeps whitespaces and newlines in the 'metadata' field except the @@ -763,7 +773,7 @@ and code_block_metadata_tail input = parse ((space_char* '[') as suffix) { let meta = - with_location_adjustments ~adjust_start_by:prefix ~adjust_end_by:suffix (fun _ -> Loc.at) input meta + with_location_adjustments ~adjust_start_by:prefix ~adjust_end_by:suffix (fun _ -> Loc.at) lexbuf input meta in Ok (Some meta) } @@ -777,14 +787,14 @@ and code_block_metadata_tail input = parse and code_block start_offset content_offset metadata prefix delim input = parse | ("]" (delim_char* as delim') "[") as terminator { if delim = delim' - then emit_code_block ~start_offset content_offset input metadata delim terminator prefix true + then emit_code_block ~start_offset ~content_offset ~lexbuf input None (Some delim) terminator prefix None else (Buffer.add_string prefix terminator; code_block start_offset content_offset metadata prefix delim input lexbuf) } | ("]" (delim_char* as delim') "}") as terminator { if delim = delim' - then emit_code_block ~start_offset content_offset input metadata delim terminator prefix false + then emit_code_block ~start_offset ~content_offset ~lexbuf input None (Some delim ) terminator prefix None else ( Buffer.add_string prefix terminator; code_block start_offset content_offset metadata prefix delim input lexbuf @@ -792,8 +802,8 @@ and code_block start_offset content_offset metadata prefix delim input = parse } | eof { - warning input ~start_offset Parse_error.truncated_code_block; - emit_code_block ~start_offset content_offset input metadata delim "" prefix false + warning lexbuf input ~start_offset Parse_error.truncated_code_block; + emit_code_block ~start_offset ~content_offset ~lexbuf input None (Some delim ) "" prefix None } | (_ as c) { diff --git a/src/parser/syntax.ml b/src/parser/syntax.ml index 8d8c490400..a49feddb5e 100644 --- a/src/parser/syntax.ml +++ b/src/parser/syntax.ml @@ -1,3 +1,6 @@ +(* + + (* This module is a recursive descent parser for the ocamldoc syntax. The parser consumes a token stream of type [Token.t Stream.t], provided by the lexer, and produces a comment AST of the type defined in [Parser_.Ast]. @@ -1525,3 +1528,5 @@ let parse warnings tokens = in let ast = parse_block_elements () in (ast, List.rev !(input.warnings)) + + *) diff --git a/src/parser/syntax.mli b/src/parser/syntax.mli index a40b698410..e89945abf4 100644 --- a/src/parser/syntax.mli +++ b/src/parser/syntax.mli @@ -1,6 +1,8 @@ -(* Internal module, not exposed *) +(* Internal module, not exposed val parse : Warning.t list ref -> Token.t Loc.with_location Stream.t -> - Ast.t * Warning.t list + Ast.t * Warning.t list + +*) diff --git a/src/parser/token.ml b/src/parser/token.ml index c2f0a53230..9a05b26ee6 100644 --- a/src/parser/token.ml +++ b/src/parser/token.ml @@ -1,4 +1,5 @@ -(* NOTE : (@faycarsons) keep as reference for the moment *) +(* NOTE : (@faycarsons) keep as reference for the moment. + Should probably become utilities for Menhir-defined tokens *) (* This module contains the token type, emitted by the lexer, and consumed by the comment syntax parser. It also contains two functions that format tokens @@ -106,7 +107,9 @@ type t = open Parser -let print : Parser.token -> string = function + + +let[@warning "-8"] print : Parser.token -> string = function | Paragraph_style `Left -> "'{L'" | Paragraph_style `Center -> "'{C'" | Paragraph_style `Right -> "'{R'" @@ -150,7 +153,7 @@ let print : Parser.token -> string = function (* [`Minus] and [`Plus] are interpreted as if they start list items. Therefore, for error messages based on [Token.describe] to be accurate, formatted [`Minus] and [`Plus] should always be plausibly list item bullets. *) -let describe : Parser.token -> string = function +let[@warning "-8"] describe : Parser.token -> string = function | Word w -> Printf.sprintf "'%s'" w | Code_span _ -> "'[...]' (code)" | Raw_markup _ -> "'{%...%}' (raw markup)" @@ -170,7 +173,7 @@ let describe : Parser.token -> string = function | Simple_link _ -> "'{:...} (external link)'" | Link_with_replacement _ -> "'{{:...} ...}' (external link)" | END -> "end of text" - | SPACE _ -> "whitespace" + | SPACE -> "whitespace" | Single_newline _ -> "line break" | Blank_line _ -> "blank line" | RIGHT_BRACE -> "'}'" @@ -196,7 +199,7 @@ let describe : Parser.token -> string = function | Tag Deprecated -> "'@deprecated'" | Tag (Param _) -> "'@param'" | Tag (Raise _) -> "'@raise'" - | Tag Return -> "'@return'" + | Tag ( Return ) -> "'@return'" | Tag (See _) -> "'@see'" | Tag (Since _) -> "'@since'" | Tag (Before _) -> "'@before'" @@ -211,6 +214,6 @@ let describe : Parser.token -> string = function let describe_element = function | `Reference (`Simple, _, _) -> describe (Simple_ref "") | `Reference (`With_text, _, _) -> - describe (`Begin_reference_with_replacement_text "") - | `Link _ -> describe (`Begin_link_with_replacement_text "") - | `Heading (level, _, _) -> describe (`Begin_section_heading (level, None)) + describe (Ref_with_replacement "") + | `Link _ -> describe (Link_with_replacement "") + | `Heading (level, _, _) -> describe (Section_heading (level, None)) From 333e446a6fd8a0c94fef5dc0cea2b7bf6db4bdf3 Mon Sep 17 00:00:00 2001 From: faycarsons Date: Mon, 26 Aug 2024 15:29:40 -0400 Subject: [PATCH 003/150] unwrap tokens received from lexer, notes --- src/parser/dune | 2 +- src/parser/lexer.mli | 2 +- src/parser/lexer.mll | 1 + src/parser/odoc_parser.ml | 6 +++++- 4 files changed, 8 insertions(+), 3 deletions(-) diff --git a/src/parser/dune b/src/parser/dune index 9b9e8e1a7b..d1130fc9d7 100644 --- a/src/parser/dune +++ b/src/parser/dune @@ -12,4 +12,4 @@ (backend bisect_ppx)) (flags (:standard -w -50)) - (libraries astring result camlp-streams)) + (libraries astring result camlp-streams menhirLib)) diff --git a/src/parser/lexer.mli b/src/parser/lexer.mli index ff39d6c13b..a6f3b69b7f 100644 --- a/src/parser/lexer.mli +++ b/src/parser/lexer.mli @@ -6,4 +6,4 @@ type input = { mutable warnings : Warning.t list; } -val token : input -> Lexing.lexbuf -> Parser.token +val token : input -> Lexing.lexbuf -> Parser.token Loc.with_location diff --git a/src/parser/lexer.mll b/src/parser/lexer.mll index 6407ec0670..b174ed4129 100644 --- a/src/parser/lexer.mll +++ b/src/parser/lexer.mll @@ -784,6 +784,7 @@ and code_block_metadata_tail input = parse | eof { Error `Eof } +(* NOTE : (@faycarsons) This is currently broken!! *) and code_block start_offset content_offset metadata prefix delim input = parse | ("]" (delim_char* as delim') "[") as terminator { if delim = delim' diff --git a/src/parser/odoc_parser.ml b/src/parser/odoc_parser.ml index 316dc577cc..0e8e879649 100644 --- a/src/parser/odoc_parser.ml +++ b/src/parser/odoc_parser.ml @@ -108,7 +108,11 @@ let parse_comment ~location ~text = ; offset_to_location = offset_to_location ~reversed_newlines ~comment_location:location ; file = "foo.ml" } in - let ast = Parser.main (Lexer.token lexer_state) lexbuf in + let unwrapped_token : Lexing.lexbuf -> Parser.token = fun lexbuf -> + let with_location = Lexer.token lexer_state lexbuf in + Loc.value with_location + in + let ast = Parser.main unwrapped_token lexbuf in { ast; warnings = lexer_state.warnings; reversed_newlines; original_pos = location } (* Accessor functions, as [t] is opaque *) From 1ba3a6353ec6ee5d89cb21ee380f416175277481 Mon Sep 17 00:00:00 2001 From: faycarsons Date: Wed, 28 Aug 2024 11:33:21 -0400 Subject: [PATCH 004/150] remove modules directive from parser/test/dune --- src/parser/test/dune | 1 - 1 file changed, 1 deletion(-) diff --git a/src/parser/test/dune b/src/parser/test/dune index 6f2295b846..c6769550dd 100644 --- a/src/parser/test/dune +++ b/src/parser/test/dune @@ -7,5 +7,4 @@ (backend landmarks --auto)) (preprocess (pps ppx_expect)) - (modules serialize) (libraries sexplib0 odoc-parser)) From 21135d87907882a40d7db5efc0565b84740b612b Mon Sep 17 00:00:00 2001 From: faycarsons Date: Wed, 28 Aug 2024 18:52:38 -0400 Subject: [PATCH 005/150] Add media tokens, null parser will compile, separate types and utils --- src/parser/ast.ml | 20 +------ src/parser/error.ml | 9 +++ src/parser/lexer.mll | 65 +++++++++++++------- src/parser/parser.mly | 39 +++++++++--- src/parser/parser_types.ml | 19 ++++++ src/parser/parser_utils.ml | 111 ++++++++++++++++++++++++++++++++++ src/parser/test/serialize.ml | 11 ++++ src/parser/token.ml | 113 ----------------------------------- 8 files changed, 227 insertions(+), 160 deletions(-) create mode 100644 src/parser/error.ml create mode 100644 src/parser/parser_types.ml create mode 100644 src/parser/parser_utils.ml diff --git a/src/parser/ast.ml b/src/parser/ast.ml index 4b98235d57..e4c9dd54c2 100644 --- a/src/parser/ast.ml +++ b/src/parser/ast.ml @@ -1,21 +1,5 @@ (** Abstract syntax tree representing ocamldoc comments *) -type parser_tag = - Author of string - | Deprecated - | Param of string - | Raise of string - | Return - | See of [ `Url | `File | `Document ] * string - | Since of string - | Before of string - | Version of string - | Canonical of string - | Inline - | Open - | Closed - | Hidden - type list_kind = [ `Ordered | `Unordered ] type list_syntax = [ `Light | `Heavy ] type list_item = [ `Li | `Dash ] @@ -63,8 +47,8 @@ type code_block_meta = { tags : string with_location option; } -type media = Token.media -type media_href = Token.media_href +type media = [ `Audio | `Video | `Image ] +type media_href = [ `Reference of string | `Link of string ] type code_block = { meta : code_block_meta option; diff --git a/src/parser/error.ml b/src/parser/error.ml new file mode 100644 index 0000000000..3200573d39 --- /dev/null +++ b/src/parser/error.ml @@ -0,0 +1,9 @@ +type parser_error = + | Unclosed of + { opening : string + ; items : string + ; closing : string + } + | Expecting of string + +exception Parser_error of parser_error Loc.with_location diff --git a/src/parser/lexer.mll b/src/parser/lexer.mll index b174ed4129..7a039b0b0f 100644 --- a/src/parser/lexer.mll +++ b/src/parser/lexer.mll @@ -1,6 +1,6 @@ { - open Parser +open Parser let unescape_word : string -> string = fun s -> (* The common case is that there are no escape sequences. *) @@ -186,9 +186,31 @@ let reference_token media start target input lexbuf = match start with | "{!" -> Simple_ref target | "{{!" -> Ref_with_replacement target - | "{:" -> Simple_link target - | "{{:" -> Link_with_replacement target - | _ -> assert false + | "{:" -> Simple_link (target) + | "{{:" -> Link_with_replacement (target) + + | "{image!" -> Media (Reference target, Image) + | "{image:" -> Media (Link target, Image) + | "{audio!" -> Media (Reference target, Audio) + | "{audio:" -> Media (Link target, Audio) + | "{video!" -> Media (Reference target, Video) + | "{video:" -> Media (Link target, Video) + + | _ -> + let target, kind = + let open Parser_types in + match start with + | "{{image!" -> Reference target, Image + | "{{image:" -> Link target, Image + | "{{audio!" -> Reference target, Audio + | "{{audio:" -> Link target, Audio + | "{{video!" -> Reference target, Video + | "{{video:" -> Link target, Video + | _ -> assert false + in + let token_descr = "DUMMY_MEDIA_FOR_DEBUGGING" (* Parser_utils.describe (`Media_with_replacement_text (target, kind, "")) *) in + let content = media token_descr (Buffer.create 1024) 0 (Lexing.lexeme_start lexbuf) input lexbuf in + Media_with_replacement (target, kind, content) let trim_leading_space_or_accept_whitespace lexbuf input start_offset text = match text.[0] with @@ -409,13 +431,13 @@ and token input = parse | "{!modules:" ([^ '}']* as modules) '}' { emit lexbuf input (Modules modules) } - | (media_start as start) +| (media_start as start) { let start_offset = Lexing.lexeme_start lexbuf in let target = reference_content input start start_offset (Buffer.create 16) lexbuf in - let token = (reference_token start target) in + let token = reference_token media start target input lexbuf in emit ~start_offset lexbuf input token } | "{[" @@ -463,8 +485,8 @@ and token input = parse input ~start_offset:(Lexing.lexeme_end lexbuf) (Parse_error.not_allowed - ~what:(Token.describe END) - ~in_what:(Token.describe token)); + ~what:(Parser_utils.describe END) + ~in_what:(Parser_utils.describe token)); emit lexbuf input token } | "{ul" @@ -599,8 +621,8 @@ and token input = parse input ~start_offset:(Lexing.lexeme_end lexbuf) (Parse_error.not_allowed - ~what:(Token.describe END) - ~in_what:(Token.describe (Modules ""))); + ~what:(Parser_utils.describe END) + ~in_what:(Parser_utils.describe (Modules ""))); emit lexbuf input (Modules modules) } and code_span buffer nesting_level start_offset input = parse @@ -625,8 +647,8 @@ and code_span buffer nesting_level start_offset input = parse lexbuf input (Parse_error.not_allowed - ~what:(Token.describe (Blank_line "\n\n")) - ~in_what:(Token.describe (Code_span ""))); + ~what:(Parser_utils.describe (Blank_line "\n\n")) + ~in_what:(Parser_utils.describe (Code_span ""))); Buffer.add_char buffer ' '; code_span buffer nesting_level start_offset input lexbuf } | newline horizontal_space* @@ -638,8 +660,8 @@ and code_span buffer nesting_level start_offset input = parse lexbuf input (Parse_error.not_allowed - ~what:(Token.describe END) - ~in_what:(Token.describe (Code_span ""))); + ~what:(Parser_utils.describe END) + ~in_what:(Parser_utils.describe (Code_span ""))); emit lexbuf input (Code_span (Buffer.contents buffer)) ~start_offset } | _ as c @@ -669,8 +691,8 @@ and math kind buffer nesting_level start_offset input = parse lexbuf input (Parse_error.not_allowed - ~what:(Token.describe (Blank_line "\n")) - ~in_what:(Token.describe (math_constr kind ""))); + ~what:(Parser_utils.describe (Blank_line "\n")) + ~in_what:(Parser_utils.describe (math_constr kind ""))); Buffer.add_char buffer '\n'; math kind buffer nesting_level start_offset input lexbuf | Block -> @@ -682,8 +704,8 @@ and math kind buffer nesting_level start_offset input = parse lexbuf input (Parse_error.not_allowed - ~what:(Token.describe END) - ~in_what:(Token.describe (math_constr kind ""))); + ~what:(Parser_utils.describe END) + ~in_what:(Parser_utils.describe (math_constr kind ""))); emit lexbuf input (math_constr kind (Buffer.contents buffer)) ~start_offset } | _ as c { Buffer.add_char buffer c; @@ -706,9 +728,10 @@ and media tok_descr buffer nesting_level start_offset input = parse media tok_descr buffer nesting_level start_offset input lexbuf } | eof { warning + lexbuf input (Parse_error.not_allowed - ~what:(Token.describe `End) + ~what:(Parser_utils.describe END) ~in_what:tok_descr); Buffer.contents buffer} | (newline) @@ -735,8 +758,8 @@ and verbatim buffer last_false_terminator start_offset input = parse lexbuf input (Parse_error.not_allowed - ~what:(Token.describe END) - ~in_what:(Token.describe (Verbatim ""))) + ~what:(Parser_utils.describe END) + ~in_what:(Parser_utils.describe (Verbatim ""))) | Some location -> warning lexbuf diff --git a/src/parser/parser.mly b/src/parser/parser.mly index 642a3dd777..b3cd5f9083 100644 --- a/src/parser/parser.mly +++ b/src/parser/parser.mly @@ -1,4 +1,17 @@ %{ + [@@@warning "-32"] + + let point_of_position Lexing.{ pos_lnum; pos_cnum; _ } = + Loc.{ line = pos_lnum; column = pos_cnum } + + let to_location : (Lexing.position * Lexing.position) -> 'a -> 'a Loc.with_location = + fun (start, end_) inner -> + let open Loc in + let start_point = point_of_position start + and end_point = point_of_position end_ in + let location = { file = start.pos_fname; start = start_point; end_ = end_point } + in + { location; value = inner } %} %token SPACE NEWLINE @@ -10,8 +23,7 @@ %token Single_newline %token Space - -%token Word +%token Word %token MINUS PLUS BAR @@ -25,8 +37,6 @@ %token Raw_markup -%token Tag - %token Code_block %token Code_span @@ -40,11 +50,14 @@ %token Section_heading +%token Tag + %token Simple_ref %token Ref_with_replacement %token Simple_link %token Link_with_replacement - +%token Media +%token Media_with_replacement %token Verbatim %token END @@ -52,6 +65,16 @@ %start main %% -let main := - | SPACE; { [] } - | NEWLINE; { [] } +let located(x) == + matched = x; { location_of_position $loc matched } + +let main := + | _ws = whitespace; { [] } + | _ = error; { print_endline "Error"; [] } + +let whitespace := + | SPACE; { [] } | NEWLINE; { [] } + | _ = Space; { [] } + | _ = Blank_line; { [ [] ] } + | _ = Single_newline; { [] } + diff --git a/src/parser/parser_types.ml b/src/parser/parser_types.ml new file mode 100644 index 0000000000..6859b3dbbc --- /dev/null +++ b/src/parser/parser_types.ml @@ -0,0 +1,19 @@ +type media = Reference of string | Link of string +type media_target = Audio | Video | Image + + +type tag = + Author of string + | Deprecated + | Param of string + | Raise of string + | Return + | See of [ `Url | `File | `Document ] * string + | Since of string + | Before of string + | Version of string + | Canonical of string + | Inline + | Open + | Closed + | Hidden diff --git a/src/parser/parser_utils.ml b/src/parser/parser_utils.ml new file mode 100644 index 0000000000..146930606d --- /dev/null +++ b/src/parser/parser_utils.ml @@ -0,0 +1,111 @@ +open Parser + +let[@warning "-8"] print : Parser.token -> string = function + | Paragraph_style `Left -> "'{L'" + | Paragraph_style `Center -> "'{C'" + | Paragraph_style `Right -> "'{R'" + | Style `Bold -> "'{b'" + | Style `Italic -> "'{i'" + | Style `Emphasis -> "'{e'" + | Style `Superscript -> "'{^'" + | Style `Subscript -> "'{_'" + | Ref_with_replacement _ -> "'{{!'" + | Link_with_replacement _ -> "'{{:'" + | List_item `Li -> "'{li ...}'" + | List_item `Dash -> "'{- ...}'" + | Table_light -> "{t" + | Table_heavy -> "{table" + | Table_row -> "'{tr'" + | Table_cell `Header -> "'{th'" + | Table_cell `Data -> "'{td'" + | MINUS -> "'-'" + | PLUS -> "'+'" + | BAR -> "'|'" + | Section_heading (level, label) -> + let label = match label with None -> "" | Some label -> ":" ^ label in + Printf.sprintf "'{%i%s'" level label + | Tag (Author _) -> "'@author'" + | Tag Deprecated -> "'@deprecated'" + | Tag (Param _) -> "'@param'" + | Tag (Raise _) -> "'@raise'" + | Tag Return -> "'@return'" + | Tag (See _) -> "'@see'" + | Tag (Since _) -> "'@since'" + | Tag (Before _) -> "'@before'" + | Tag (Version _) -> "'@version'" + | Tag (Canonical _) -> "'@canonical'" + | Tag Inline -> "'@inline'" + | Tag Open -> "'@open'" + | Tag Closed -> "'@closed'" + | Tag Hidden -> "'@hidden" + | Raw_markup (None, _) -> "'{%...%}'" + | Raw_markup (Some target, _) -> "'{%" ^ target ^ ":...%}'" + +(* [`Minus] and [`Plus] are interpreted as if they start list items. Therefore, + for error messages based on [Token.describe] to be accurate, formatted + [`Minus] and [`Plus] should always be plausibly list item bullets. *) +let[@warning "-8"] describe : Parser.token -> string = function + | Word w -> Printf.sprintf "'%s'" w + | Code_span _ -> "'[...]' (code)" + | Raw_markup _ -> "'{%...%}' (raw markup)" + | Paragraph_style `Left -> "'{L ...}' (left alignment)" + | Paragraph_style `Center -> "'{C ...}' (center alignment)" + | Paragraph_style `Right -> "'{R ...}' (right alignment)" + | Style `Bold -> "'{b ...}' (boldface text)" + | Style `Italic -> "'{i ...}' (italic text)" + | Style `Emphasis -> "'{e ...}' (emphasized text)" + | Style `Superscript -> "'{^...}' (superscript)" + | Style `Subscript -> "'{_...}' (subscript)" + | Math_span _ -> "'{m ...}' (math span)" + | Math_block _ -> "'{math ...}' (math block)" + | Simple_ref _ -> "'{!...}' (cross-reference)" + | Ref_with_replacement _ -> + "'{{!...} ...}' (cross-reference)" + | Simple_link _ -> "'{:...} (external link)'" + | Link_with_replacement _ -> "'{{:...} ...}' (external link)" + | END -> "end of text" + | SPACE -> "whitespace" + | Single_newline _ -> "line break" + | Blank_line _ -> "blank line" + | RIGHT_BRACE -> "'}'" + | RIGHT_CODE_DELIMITER -> "']}'" + | Code_block _ -> "'{[...]}' (code block)" + | Verbatim _ -> "'{v ... v}' (verbatim text)" + | Modules _ -> "'{!modules ...}'" + | List `Unordered -> "'{ul ...}' (bulleted list)" + | List `Ordered -> "'{ol ...}' (numbered list)" + | List_item `Li -> "'{li ...}' (list item)" + | List_item `Dash -> "'{- ...}' (list item)" + | Table_light -> "'{t ...}' (table)" + | Table_heavy -> "'{table ...}' (table)" + | Table_row -> "'{tr ...}' (table row)" + | Table_cell `Header -> "'{th ... }' (table header cell)" + | Table_cell `Data -> "'{td ... }' (table data cell)" + | MINUS -> "'-' (bulleted list item)" + | PLUS -> "'+' (numbered list item)" + | BAR -> "'|'" + | Section_heading (level, _) -> + Printf.sprintf "'{%i ...}' (section heading)" level + | Tag (Author _) -> "'@author'" + | Tag Deprecated -> "'@deprecated'" + | Tag (Param _) -> "'@param'" + | Tag (Raise _) -> "'@raise'" + | Tag ( Return ) -> "'@return'" + | Tag (See _) -> "'@see'" + | Tag (Since _) -> "'@since'" + | Tag (Before _) -> "'@before'" + | Tag (Version _) -> "'@version'" + | Tag (Canonical _) -> "'@canonical'" + | Tag Inline -> "'@inline'" + | Tag Open -> "'@open'" + | Tag Closed -> "'@closed'" + | Tag Hidden -> "'@hidden" + | COMMENT -> "top-level text" + +(* NOTE : (@faycarsons) Should this be in Ast.ml? This takes an Ast.t no? *) +let describe_element = function + | `Reference (`Simple, _, _) -> describe (Simple_ref "") + | `Reference (`With_text, _, _) -> + describe (Ref_with_replacement "") + | `Link _ -> describe (Link_with_replacement "") + | `Heading (level, _, _) -> describe (Section_heading (level, None)) diff --git a/src/parser/test/serialize.ml b/src/parser/test/serialize.ml index 4d7e0ebf7d..8b8da753f4 100644 --- a/src/parser/test/serialize.ml +++ b/src/parser/test/serialize.ml @@ -61,6 +61,15 @@ module Ast_to_sexp = struct let code_block_lang at { Ast.language; tags } = List [ at.at str language; opt (at.at str) tags ] + let media : Ast.media -> sexp = function + | `Image -> Atom "image" + | `Video -> Atom "video" + | `Audio -> Atom "audio" + + let media_href = function + | `Reference href -> List [ Atom "Reference"; Atom href ] + | `Link href -> List [ Atom "Link"; Atom href ] + let rec nestable_block_element at : Ast.nestable_block_element -> sexp = function | `Paragraph es -> @@ -116,6 +125,8 @@ module Ast_to_sexp = struct map (kind k) cell @@ at.at (nestable_block_element at) ); alignment; ] + | `Media (kind, href, c, m) -> + List [ reference_kind kind; at.at media_href href; Atom c; media m ] let tag at : Ast.tag -> sexp = function | `Author s -> List [ Atom "@author"; Atom s ] diff --git a/src/parser/token.ml b/src/parser/token.ml index 9a05b26ee6..ebc2846c99 100644 --- a/src/parser/token.ml +++ b/src/parser/token.ml @@ -104,116 +104,3 @@ type t = | `Bar | section_heading | tag ] - -open Parser - - - -let[@warning "-8"] print : Parser.token -> string = function - | Paragraph_style `Left -> "'{L'" - | Paragraph_style `Center -> "'{C'" - | Paragraph_style `Right -> "'{R'" - | Style `Bold -> "'{b'" - | Style `Italic -> "'{i'" - | Style `Emphasis -> "'{e'" - | Style `Superscript -> "'{^'" - | Style `Subscript -> "'{_'" - | Ref_with_replacement _ -> "'{{!'" - | Link_with_replacement _ -> "'{{:'" - | List_item `Li -> "'{li ...}'" - | List_item `Dash -> "'{- ...}'" - | Table_light -> "{t" - | Table_heavy -> "{table" - | Table_row -> "'{tr'" - | Table_cell `Header -> "'{th'" - | Table_cell `Data -> "'{td'" - | MINUS -> "'-'" - | PLUS -> "'+'" - | BAR -> "'|'" - | Section_heading (level, label) -> - let label = match label with None -> "" | Some label -> ":" ^ label in - Printf.sprintf "'{%i%s'" level label - | Tag (Author _) -> "'@author'" - | Tag Deprecated -> "'@deprecated'" - | Tag (Param _) -> "'@param'" - | Tag (Raise _) -> "'@raise'" - | Tag Return -> "'@return'" - | Tag (See _) -> "'@see'" - | Tag (Since _) -> "'@since'" - | Tag (Before _) -> "'@before'" - | Tag (Version _) -> "'@version'" - | Tag (Canonical _) -> "'@canonical'" - | Tag Inline -> "'@inline'" - | Tag Open -> "'@open'" - | Tag Closed -> "'@closed'" - | Tag Hidden -> "'@hidden" - | Raw_markup (None, _) -> "'{%...%}'" - | Raw_markup (Some target, _) -> "'{%" ^ target ^ ":...%}'" - -(* [`Minus] and [`Plus] are interpreted as if they start list items. Therefore, - for error messages based on [Token.describe] to be accurate, formatted - [`Minus] and [`Plus] should always be plausibly list item bullets. *) -let[@warning "-8"] describe : Parser.token -> string = function - | Word w -> Printf.sprintf "'%s'" w - | Code_span _ -> "'[...]' (code)" - | Raw_markup _ -> "'{%...%}' (raw markup)" - | Paragraph_style `Left -> "'{L ...}' (left alignment)" - | Paragraph_style `Center -> "'{C ...}' (center alignment)" - | Paragraph_style `Right -> "'{R ...}' (right alignment)" - | Style `Bold -> "'{b ...}' (boldface text)" - | Style `Italic -> "'{i ...}' (italic text)" - | Style `Emphasis -> "'{e ...}' (emphasized text)" - | Style `Superscript -> "'{^...}' (superscript)" - | Style `Subscript -> "'{_...}' (subscript)" - | Math_span _ -> "'{m ...}' (math span)" - | Math_block _ -> "'{math ...}' (math block)" - | Simple_ref _ -> "'{!...}' (cross-reference)" - | Ref_with_replacement _ -> - "'{{!...} ...}' (cross-reference)" - | Simple_link _ -> "'{:...} (external link)'" - | Link_with_replacement _ -> "'{{:...} ...}' (external link)" - | END -> "end of text" - | SPACE -> "whitespace" - | Single_newline _ -> "line break" - | Blank_line _ -> "blank line" - | RIGHT_BRACE -> "'}'" - | RIGHT_CODE_DELIMITER -> "']}'" - | Code_block _ -> "'{[...]}' (code block)" - | Verbatim _ -> "'{v ... v}' (verbatim text)" - | Modules _ -> "'{!modules ...}'" - | List `Unordered -> "'{ul ...}' (bulleted list)" - | List `Ordered -> "'{ol ...}' (numbered list)" - | List_item `Li -> "'{li ...}' (list item)" - | List_item `Dash -> "'{- ...}' (list item)" - | Table_light -> "'{t ...}' (table)" - | Table_heavy -> "'{table ...}' (table)" - | Table_row -> "'{tr ...}' (table row)" - | Table_cell `Header -> "'{th ... }' (table header cell)" - | Table_cell `Data -> "'{td ... }' (table data cell)" - | MINUS -> "'-' (bulleted list item)" - | PLUS -> "'+' (numbered list item)" - | BAR -> "'|'" - | Section_heading (level, _) -> - Printf.sprintf "'{%i ...}' (section heading)" level - | Tag (Author _) -> "'@author'" - | Tag Deprecated -> "'@deprecated'" - | Tag (Param _) -> "'@param'" - | Tag (Raise _) -> "'@raise'" - | Tag ( Return ) -> "'@return'" - | Tag (See _) -> "'@see'" - | Tag (Since _) -> "'@since'" - | Tag (Before _) -> "'@before'" - | Tag (Version _) -> "'@version'" - | Tag (Canonical _) -> "'@canonical'" - | Tag Inline -> "'@inline'" - | Tag Open -> "'@open'" - | Tag Closed -> "'@closed'" - | Tag Hidden -> "'@hidden" - | COMMENT -> "top-level text" - -let describe_element = function - | `Reference (`Simple, _, _) -> describe (Simple_ref "") - | `Reference (`With_text, _, _) -> - describe (Ref_with_replacement "") - | `Link _ -> describe (Link_with_replacement "") - | `Heading (level, _, _) -> describe (Section_heading (level, None)) From a42097cba0fe734b433b179a4be36302d822a985 Mon Sep 17 00:00:00 2001 From: faycarsons Date: Thu, 29 Aug 2024 14:56:11 -0400 Subject: [PATCH 006/150] trivial/whitespace tests passing --- src/parser/parser.mly | 31 +++++++++++++++++-------------- 1 file changed, 17 insertions(+), 14 deletions(-) diff --git a/src/parser/parser.mly b/src/parser/parser.mly index b3cd5f9083..ed20e2e95b 100644 --- a/src/parser/parser.mly +++ b/src/parser/parser.mly @@ -1,17 +1,21 @@ %{ [@@@warning "-32"] + open Error let point_of_position Lexing.{ pos_lnum; pos_cnum; _ } = Loc.{ line = pos_lnum; column = pos_cnum } - let to_location : (Lexing.position * Lexing.position) -> 'a -> 'a Loc.with_location = - fun (start, end_) inner -> + type lexspan = (Lexing.position * Lexing.position) + let to_location : lexspan -> Loc.span = + fun (start, end_) -> let open Loc in let start_point = point_of_position start and end_point = point_of_position end_ in - let location = { file = start.pos_fname; start = start_point; end_ = end_point } - in - { location; value = inner } + { file = start.pos_fname; start = start_point; end_ = end_point } + + let wrap_location : type a. lexspan -> a -> a Loc.with_location = fun loc value -> + let location = to_location loc in + { location; value } %} %token SPACE NEWLINE @@ -65,16 +69,15 @@ %start main %% -let located(x) == - matched = x; { location_of_position $loc matched } - let main := - | _ws = whitespace; { [] } - | _ = error; { print_endline "Error"; [] } + | _ = whitespace; { [] } + | END; { [] } + | _ = error; { raise @@ Parser_error (wrap_location $sloc (Expecting "UNREACHABLE")) } let whitespace := - | SPACE; { [] } | NEWLINE; { [] } - | _ = Space; { [] } - | _ = Blank_line; { [ [] ] } - | _ = Single_newline; { [] } + | SPACE; { `Space " " } + | NEWLINE; { `Space "\n" } + | ~ = Space; <`Space> + | ~ = Blank_line; <`Space> + | ~ = Single_newline; <`Space> From a2a908325e5ee182092c14bd033f3998d2b0a403 Mon Sep 17 00:00:00 2001 From: faycarsons Date: Mon, 2 Sep 2024 17:05:54 -0400 Subject: [PATCH 007/150] Add inline element and list rules --- src/parser/parser.mly | 70 +++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 67 insertions(+), 3 deletions(-) diff --git a/src/parser/parser.mly b/src/parser/parser.mly index ed20e2e95b..c0a093a1e6 100644 --- a/src/parser/parser.mly +++ b/src/parser/parser.mly @@ -1,6 +1,7 @@ %{ [@@@warning "-32"] + open Error let point_of_position Lexing.{ pos_lnum; pos_cnum; _ } = Loc.{ line = pos_lnum; column = pos_cnum } @@ -13,9 +14,23 @@ and end_point = point_of_position end_ in { file = start.pos_fname; start = start_point; end_ = end_point } - let wrap_location : type a. lexspan -> a -> a Loc.with_location = fun loc value -> + let wrap_location : lexspan -> 'a -> 'a Loc.with_location = fun loc value -> let location = to_location loc in { location; value } + + let throw : lexspan -> Error.parser_error -> unit = fun loc error -> + raise @@ Parser_error (wrap_location loc error) + + + exception Debug of [ `DEBUG ] Loc.with_location + let raise_unimplemented : only_for_debugging:lexspan -> 'a = + fun ~only_for_debugging:loc -> + raise @@ Debug (wrap_location loc `DEBUG) + + let exn_location : only_for_debugging:lexspan -> exn = + fun ~only_for_debugging:loc -> Debug (wrap_location loc `DEBUG) + + let tag : Ast.tag -> Ast.block_element = fun tag -> `Tag tag %} %token SPACE NEWLINE @@ -61,18 +76,24 @@ %token Simple_link %token Link_with_replacement %token Media -%token Media_with_replacement +%token Media_with_replacement %token Verbatim %token END %start main + + %% +%public %inline located(token): + value = token; { wrap_location $sloc value } + let main := | _ = whitespace; { [] } + | tag_ = tag; { [ wrap_location $sloc tag_ ]} | END; { [] } - | _ = error; { raise @@ Parser_error (wrap_location $sloc (Expecting "UNREACHABLE")) } + | _ = error; { raise @@ exn_location ~only_for_debugging:( $loc ) } let whitespace := | SPACE; { `Space " " } @@ -81,3 +102,46 @@ let whitespace := | ~ = Blank_line; <`Space> | ~ = Single_newline; <`Space> +let inline_element := + | ~ = Word; <`Word> + | ~ = Code_span; <`Code_span> + | ~ = Raw_markup; <`Raw_markup> + | style = Style; inner = inline_element; { `Style (style, wrap_location $loc inner) } + | ~ = Math_span; <`Math_span> + | _ = ref; <> + +(* TODO: Determine how we want to handle recursive elements like refs and some of the tags that have nestable_block inners + Currently, this is broken *) +let ref := + | ref = Simple_ref; children = inline_element; { `Reference (`Simple, ref, wrap_location $loc children) } + | _ref_w_text = Ref_with_replacement; children = inline_element; { `Reference (`Replacement, ref, wrap_location $loc children) } + +let list_light := + | MINUS; unordered_items = separated_list(NEWLINE; MINUS, text); { `List (`Unordered, `Light, unordered_items) } + | PLUS; unordered_items = separated_list(NEWLINE; PLUS, text); { `List (`Ordered, `Light, unordered_items) } + +let text := + | ~ = Word; whitespace; <`Word> + +let tag := + | inner_tag = Tag; { + let open Parser_types in + match inner_tag with + | Version version -> tag @@ `Version version + | Since version -> tag @@ `Since version + | Before _version -> raise_unimplemented ~only_for_debugging:( $loc ) + | Canonical implementation -> tag @@ `Canonical (wrap_location $loc implementation) + | Inline -> tag `Inline + | Open -> tag `Open + | Closed -> tag `Closed + | Hidden -> tag `Hidden + | Deprecated -> raise_unimplemented ~only_for_debugging:( $loc ) + | Return -> raise_unimplemented ~only_for_debugging:( $loc ) + | Author _author -> raise_unimplemented ~only_for_debugging:( $loc ) + | Param _param -> raise_unimplemented ~only_for_debugging:( $loc ) + | Raise _exn -> raise_unimplemented ~only_for_debugging:( $loc ) + | See (_kind, _href) -> raise_unimplemented ~only_for_debugging:( $loc ) +} + +let style := ~ = Style; <> +let paragraph_style := ~ = Paragraph_style; <> From 36befac10f7937287e7d5bd57068a250e27c3baf Mon Sep 17 00:00:00 2001 From: faycarsons Date: Mon, 2 Sep 2024 17:15:53 -0400 Subject: [PATCH 008/150] Fix naming in list rule, add dummy 'nestable block element' rule --- src/parser/parser.mly | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/src/parser/parser.mly b/src/parser/parser.mly index c0a093a1e6..1d0a8a406f 100644 --- a/src/parser/parser.mly +++ b/src/parser/parser.mly @@ -113,16 +113,16 @@ let inline_element := (* TODO: Determine how we want to handle recursive elements like refs and some of the tags that have nestable_block inners Currently, this is broken *) let ref := - | ref = Simple_ref; children = inline_element; { `Reference (`Simple, ref, wrap_location $loc children) } - | _ref_w_text = Ref_with_replacement; children = inline_element; { `Reference (`Replacement, ref, wrap_location $loc children) } + | ref_body = Simple_ref; children = inline_element; { `Reference (`Simple, ref_body, wrap_location $loc children) } + | ref_body = Ref_with_replacement; children = inline_element; { `Reference (`Replacement, ref_body, wrap_location $loc children) } let list_light := - | MINUS; unordered_items = separated_list(NEWLINE; MINUS, text); { `List (`Unordered, `Light, unordered_items) } - | PLUS; unordered_items = separated_list(NEWLINE; PLUS, text); { `List (`Ordered, `Light, unordered_items) } - -let text := - | ~ = Word; whitespace; <`Word> - + | MINUS; unordered_items = separated_list(NEWLINE; MINUS, nestable_block_element); { `List (`Unordered, `Light, unordered_items) } + | PLUS; ordered_items = separated_list(NEWLINE; PLUS, nestable_block_element); { `List (`Ordered, `Light, unordered_items) } + +let nestable_block_element := + | error; { raise_unimplemented ~only_for_debugging:($loc) } + let tag := | inner_tag = Tag; { let open Parser_types in From 54fef41e4a3c625391e1461fd1ab76fffc078bb2 Mon Sep 17 00:00:00 2001 From: faycarsons Date: Mon, 2 Sep 2024 17:55:21 -0400 Subject: [PATCH 009/150] Refactor 'tag' rule to support variants w/ children --- src/parser/parser.mly | 32 ++++++++++++++++++-------------- 1 file changed, 18 insertions(+), 14 deletions(-) diff --git a/src/parser/parser.mly b/src/parser/parser.mly index 1d0a8a406f..5405fad596 100644 --- a/src/parser/parser.mly +++ b/src/parser/parser.mly @@ -1,8 +1,8 @@ %{ [@@@warning "-32"] - - open Error + open Parser_types + let point_of_position Lexing.{ pos_lnum; pos_cnum; _ } = Loc.{ line = pos_lnum; column = pos_cnum } @@ -124,23 +124,27 @@ let nestable_block_element := | error; { raise_unimplemented ~only_for_debugging:($loc) } let tag := + | inner_tag = Tag; children = nestable_block_element; { + match inner_tag with + | Before version -> tag @@ `Before (version, [ wrap_location $loc children ]) + | Deprecated -> tag @@ `Deprecated [ wrap_location $loc children ] + | Return -> tag @@ `Return [ wrap_location $loc children ] + | Param param_name -> tag @@ `Param (param_name, [ wrap_location $loc children ]) + | Raise exn -> tag @@ `Raise (exn, [ wrap_location $loc children ]) + | See (kind, href) -> tag @@ `See (kind, href, [ wrap_location $loc children ]) + | _ -> raise @@ exn_location ~only_for_debugging:( $loc ) + } | inner_tag = Tag; { - let open Parser_types in match inner_tag with | Version version -> tag @@ `Version version | Since version -> tag @@ `Since version - | Before _version -> raise_unimplemented ~only_for_debugging:( $loc ) | Canonical implementation -> tag @@ `Canonical (wrap_location $loc implementation) - | Inline -> tag `Inline - | Open -> tag `Open - | Closed -> tag `Closed - | Hidden -> tag `Hidden - | Deprecated -> raise_unimplemented ~only_for_debugging:( $loc ) - | Return -> raise_unimplemented ~only_for_debugging:( $loc ) - | Author _author -> raise_unimplemented ~only_for_debugging:( $loc ) - | Param _param -> raise_unimplemented ~only_for_debugging:( $loc ) - | Raise _exn -> raise_unimplemented ~only_for_debugging:( $loc ) - | See (_kind, _href) -> raise_unimplemented ~only_for_debugging:( $loc ) + | Author author -> tag @@ `Author author + | Inline -> `Tag `Inline + | Open -> `Tag `Open + | Closed -> `Tag `Closed + | Hidden -> `Tag `Hidden + | _ -> raise @@ exn_location ~only_for_debugging:( $loc ) } let style := ~ = Style; <> From 68cb3b089781958b996a64fee0a84465198393ad Mon Sep 17 00:00:00 2001 From: faycarsons Date: Mon, 2 Sep 2024 18:06:09 -0400 Subject: [PATCH 010/150] Rewrite 'located' parser util to use new menhir syntax; formatting --- src/parser/parser.mly | 25 ++++++++++++------------- 1 file changed, 12 insertions(+), 13 deletions(-) diff --git a/src/parser/parser.mly b/src/parser/parser.mly index 5405fad596..859b1b121e 100644 --- a/src/parser/parser.mly +++ b/src/parser/parser.mly @@ -86,8 +86,7 @@ %% -%public %inline located(token): - value = token; { wrap_location $sloc value } +let located(rule) == value = rule; { wrap_location $loc value } let main := | _ = whitespace; { [] } @@ -135,17 +134,17 @@ let tag := | _ -> raise @@ exn_location ~only_for_debugging:( $loc ) } | inner_tag = Tag; { - match inner_tag with - | Version version -> tag @@ `Version version - | Since version -> tag @@ `Since version - | Canonical implementation -> tag @@ `Canonical (wrap_location $loc implementation) - | Author author -> tag @@ `Author author - | Inline -> `Tag `Inline - | Open -> `Tag `Open - | Closed -> `Tag `Closed - | Hidden -> `Tag `Hidden - | _ -> raise @@ exn_location ~only_for_debugging:( $loc ) -} + match inner_tag with + | Version version -> tag @@ `Version version + | Since version -> tag @@ `Since version + | Canonical implementation -> tag @@ `Canonical (wrap_location $loc implementation) + | Author author -> tag @@ `Author author + | Inline -> `Tag `Inline + | Open -> `Tag `Open + | Closed -> `Tag `Closed + | Hidden -> `Tag `Hidden + | _ -> raise @@ exn_location ~only_for_debugging:( $loc ) + } let style := ~ = Style; <> let paragraph_style := ~ = Paragraph_style; <> From 0c4f9c1f0c48ca82282c22bd14dce216cf9b36a0 Mon Sep 17 00:00:00 2001 From: faycarsons Date: Mon, 2 Sep 2024 18:16:09 -0400 Subject: [PATCH 011/150] move tag parser helpers to Menhir header --- src/parser/parser.mly | 47 ++++++++++++++++++++++--------------------- 1 file changed, 24 insertions(+), 23 deletions(-) diff --git a/src/parser/parser.mly b/src/parser/parser.mly index 859b1b121e..4932d690d5 100644 --- a/src/parser/parser.mly +++ b/src/parser/parser.mly @@ -31,6 +31,27 @@ fun ~only_for_debugging:loc -> Debug (wrap_location loc `DEBUG) let tag : Ast.tag -> Ast.block_element = fun tag -> `Tag tag + + let tag_with_element loc children = function + | Before version -> tag @@ `Before (version, [ wrap_location loc children ]) + | Deprecated -> tag @@ `Deprecated [ wrap_location loc children ] + | Return -> tag @@ `Return [ wrap_location loc children ] + | Param param_name -> tag @@ `Param (param_name, [ wrap_location loc children ]) + | Raise exn -> tag @@ `Raise (exn, [ wrap_location loc children ]) + | See (kind, href) -> tag @@ `See (kind, href, [ wrap_location loc children ]) + | _ -> raise @@ exn_location ~only_for_debugging:( loc ) + + let tag_bare loc = function + | Version version -> tag @@ `Version version + | Since version -> tag @@ `Since version + | Canonical implementation -> tag @@ `Canonical (wrap_location loc implementation) + | Author author -> tag @@ `Author author + | Inline -> `Tag `Inline + | Open -> `Tag `Open + | Closed -> `Tag `Closed + | Hidden -> `Tag `Hidden + | _ -> raise @@ exn_location ~only_for_debugging:( loc ) + %} %token SPACE NEWLINE @@ -83,7 +104,6 @@ %start main - %% let located(rule) == value = rule; { wrap_location $loc value } @@ -120,31 +140,12 @@ let list_light := | PLUS; ordered_items = separated_list(NEWLINE; PLUS, nestable_block_element); { `List (`Ordered, `Light, unordered_items) } let nestable_block_element := + | code = Verbatim; { `Verbatim code } | error; { raise_unimplemented ~only_for_debugging:($loc) } let tag := - | inner_tag = Tag; children = nestable_block_element; { - match inner_tag with - | Before version -> tag @@ `Before (version, [ wrap_location $loc children ]) - | Deprecated -> tag @@ `Deprecated [ wrap_location $loc children ] - | Return -> tag @@ `Return [ wrap_location $loc children ] - | Param param_name -> tag @@ `Param (param_name, [ wrap_location $loc children ]) - | Raise exn -> tag @@ `Raise (exn, [ wrap_location $loc children ]) - | See (kind, href) -> tag @@ `See (kind, href, [ wrap_location $loc children ]) - | _ -> raise @@ exn_location ~only_for_debugging:( $loc ) - } - | inner_tag = Tag; { - match inner_tag with - | Version version -> tag @@ `Version version - | Since version -> tag @@ `Since version - | Canonical implementation -> tag @@ `Canonical (wrap_location $loc implementation) - | Author author -> tag @@ `Author author - | Inline -> `Tag `Inline - | Open -> `Tag `Open - | Closed -> `Tag `Closed - | Hidden -> `Tag `Hidden - | _ -> raise @@ exn_location ~only_for_debugging:( $loc ) - } + | inner_tag = Tag; children = nestable_block_element; { tag_with_element $loc children inner_tag } + | inner_tag = Tag; { tag_bare $loc inner_tag } let style := ~ = Style; <> let paragraph_style := ~ = Paragraph_style; <> From 1408d3f48b8cb50835e8096ddd3b0f4f0a587507 Mon Sep 17 00:00:00 2001 From: faycarsons Date: Tue, 3 Sep 2024 15:01:01 -0400 Subject: [PATCH 012/150] Add heavy list and nestable element rules --- src/parser/parser.mly | 33 ++++++++++++++++++++++++++------- 1 file changed, 26 insertions(+), 7 deletions(-) diff --git a/src/parser/parser.mly b/src/parser/parser.mly index 4932d690d5..3a47e3925d 100644 --- a/src/parser/parser.mly +++ b/src/parser/parser.mly @@ -110,7 +110,7 @@ let located(rule) == value = rule; { wrap_location $loc value } let main := | _ = whitespace; { [] } - | tag_ = tag; { [ wrap_location $sloc tag_ ]} + | t = tag; { [ wrap_location $sloc t ]} | END; { [] } | _ = error; { raise @@ exn_location ~only_for_debugging:( $loc ) } @@ -122,26 +122,45 @@ let whitespace := | ~ = Single_newline; <`Space> let inline_element := + | ~ = Space; <`Space> | ~ = Word; <`Word> | ~ = Code_span; <`Code_span> - | ~ = Raw_markup; <`Raw_markup> - | style = Style; inner = inline_element; { `Style (style, wrap_location $loc inner) } + | s = Raw_markup; { `Raw_markup ( None, s ) } + | style = Style; inner = inline_element; { `Styled (style, wrap_location $loc inner) } | ~ = Math_span; <`Math_span> - | _ = ref; <> + | ~ = ref; <> + | ~ = link; <> (* TODO: Determine how we want to handle recursive elements like refs and some of the tags that have nestable_block inners Currently, this is broken *) let ref := - | ref_body = Simple_ref; children = inline_element; { `Reference (`Simple, ref_body, wrap_location $loc children) } - | ref_body = Ref_with_replacement; children = inline_element; { `Reference (`Replacement, ref_body, wrap_location $loc children) } + | ref_body = Simple_ref; children = inline_element; { `Reference (`Simple, ref_body, [ wrap_location $loc children ]) } + | ref_body = Ref_with_replacement; children = inline_element; { `Reference (`With_text, ref_body, [ wrap_location $loc children ]) } + +let link := + | link_body = Simple_link; children = inline_element; { `Link ( link_body, [ wrap_location $loc children ] ) } + | link_body = Link_with_replacement; children = inline_element; { `Link ( link_body, [ wrap_location $loc children ] + )} let list_light := | MINUS; unordered_items = separated_list(NEWLINE; MINUS, nestable_block_element); { `List (`Unordered, `Light, unordered_items) } | PLUS; ordered_items = separated_list(NEWLINE; PLUS, nestable_block_element); { `List (`Ordered, `Light, unordered_items) } + +let list_heavy := + | list_type = List; + items = separated_list( + NEWLINE; _ = List_item; SPACE?; RIGHT_BRACE, + located(nestable_block_element) + ); { `List (list_kind, `Heavy, items) } +(* NOTE: (@faycarsons) For some reason the inline_element rule isn't type-checking despite having(??) all of the variants in Ast.inline_element *) let nestable_block_element := | code = Verbatim; { `Verbatim code } - | error; { raise_unimplemented ~only_for_debugging:($loc) } + | element = inline_element; { `Paragraph [ ( wrap_location $loc element : Ast.inline_element Loc.with_location ) ] } + | code_block = Code_block; <`Code_block> + | modules = Modules; { `Modules [ wrap_location $loc modules ]} + | ~ = list_light; <> + | ~ = list_heavy; <> let tag := | inner_tag = Tag; children = nestable_block_element; { tag_with_element $loc children inner_tag } From d255bdeac46950493ce34a732c73ac09ec17ff9f Mon Sep 17 00:00:00 2001 From: faycarsons Date: Tue, 3 Sep 2024 15:38:22 -0400 Subject: [PATCH 013/150] Use `located` modifier, formatting --- src/parser/parser.mly | 40 +++++++++++++++++++++------------------- 1 file changed, 21 insertions(+), 19 deletions(-) diff --git a/src/parser/parser.mly b/src/parser/parser.mly index 3a47e3925d..22f4c0a53f 100644 --- a/src/parser/parser.mly +++ b/src/parser/parser.mly @@ -110,9 +110,9 @@ let located(rule) == value = rule; { wrap_location $loc value } let main := | _ = whitespace; { [] } - | t = tag; { [ wrap_location $sloc t ]} + | t = located(tag); { [ t ]} | END; { [] } - | _ = error; { raise @@ exn_location ~only_for_debugging:( $loc ) } + | _ = error; { raise @@ exn_location ~only_for_debugging:$loc } let whitespace := | SPACE; { `Space " " } @@ -125,40 +125,42 @@ let inline_element := | ~ = Space; <`Space> | ~ = Word; <`Word> | ~ = Code_span; <`Code_span> - | s = Raw_markup; { `Raw_markup ( None, s ) } - | style = Style; inner = inline_element; { `Styled (style, wrap_location $loc inner) } + | ~ = Raw_markup; <`Raw_markup> + | style = Style; inner = located( inline_element ); { `Styled (style, [ inner ]) } | ~ = Math_span; <`Math_span> | ~ = ref; <> | ~ = link; <> -(* TODO: Determine how we want to handle recursive elements like refs and some of the tags that have nestable_block inners - Currently, this is broken *) let ref := - | ref_body = Simple_ref; children = inline_element; { `Reference (`Simple, ref_body, [ wrap_location $loc children ]) } - | ref_body = Ref_with_replacement; children = inline_element; { `Reference (`With_text, ref_body, [ wrap_location $loc children ]) } + | ref_body = located(Simple_ref ); children = located( inline_element ); { `Reference (`Simple, ref_body, [ children ]) } + | ref_body = located(Ref_with_replacement); children = located( inline_element ); { `Reference (`With_text, ref_body, [ children ]) } +(* TODO : Fix the `with_replacement` producers in the following two rules, if they're broken. Ask what `with_replacement` refers to *) let link := - | link_body = Simple_link; children = inline_element; { `Link ( link_body, [ wrap_location $loc children ] ) } - | link_body = Link_with_replacement; children = inline_element; { `Link ( link_body, [ wrap_location $loc children ] - )} + | link_body = Simple_link; children = located(inline_element); { `Link (link_body, [ children ]) } + | link_body = Link_with_replacement; children = located(inline_element); { `Link (link_body, [ children ]) } let list_light := - | MINUS; unordered_items = separated_list(NEWLINE; MINUS, nestable_block_element); { `List (`Unordered, `Light, unordered_items) } - | PLUS; ordered_items = separated_list(NEWLINE; PLUS, nestable_block_element); { `List (`Ordered, `Light, unordered_items) } + | MINUS; unordered_items = separated_list(NEWLINE; MINUS, located(nestable_block_element)); { `List (`Unordered, `Light, [ unordered_items ]) } + | PLUS; ordered_items = separated_list(NEWLINE; PLUS, located(nestable_block_element)); { `List (`Ordered, `Light, [ ordered_items ]) } let list_heavy := - | list_type = List; + | list_kind = List; items = separated_list( NEWLINE; _ = List_item; SPACE?; RIGHT_BRACE, located(nestable_block_element) - ); { `List (list_kind, `Heavy, items) } - -(* NOTE: (@faycarsons) For some reason the inline_element rule isn't type-checking despite having(??) all of the variants in Ast.inline_element *) + ); { `List (list_kind, `Heavy, [ items ]) } + +let table := error; { raise @@ exn_location ~only_for_debugging:$loc } + let nestable_block_element := | code = Verbatim; { `Verbatim code } - | element = inline_element; { `Paragraph [ ( wrap_location $loc element : Ast.inline_element Loc.with_location ) ] } + | element = located( inline_element ); { `Paragraph [ element ] } | code_block = Code_block; <`Code_block> - | modules = Modules; { `Modules [ wrap_location $loc modules ]} + | modules = located(Modules); { `Modules [ modules ] } + | _ = table; { raise @@ exn_location ~only_for_debugging:$loc } + | _ = Media; { raise @@ exn_location ~only_for_debugging:$loc } + | ~ = Math_block; <`Math_block> | ~ = list_light; <> | ~ = list_heavy; <> From 6fa50f2834bff8b00a04c37fc5659e31ecabca00 Mon Sep 17 00:00:00 2001 From: faycarsons Date: Tue, 3 Sep 2024 16:43:20 -0400 Subject: [PATCH 014/150] Add rules to `main` Add `nestable_block_element` to main, making for 2/3 top level rules done --- src/parser/parser.mly | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/src/parser/parser.mly b/src/parser/parser.mly index 22f4c0a53f..de4277af05 100644 --- a/src/parser/parser.mly +++ b/src/parser/parser.mly @@ -110,7 +110,16 @@ let located(rule) == value = rule; { wrap_location $loc value } let main := | _ = whitespace; { [] } - | t = located(tag); { [ t ]} + | t = located(tag); { [ t ] } + + (* NOTE : (@faycarsons) Is this type coercion really necessary?? I couldn't + figure out another way to get this producer to typecheck but this feels + hacky *) + | block = located(nestable_block_element); { + let block = (block :> Ast.block_element Loc.with_location) in + [ block ] + } + | END; { [] } | _ = error; { raise @@ exn_location ~only_for_debugging:$loc } @@ -149,13 +158,14 @@ let list_heavy := items = separated_list( NEWLINE; _ = List_item; SPACE?; RIGHT_BRACE, located(nestable_block_element) - ); { `List (list_kind, `Heavy, [ items ]) } + ); + { `List (list_kind, `Heavy, [ items ]) } let table := error; { raise @@ exn_location ~only_for_debugging:$loc } let nestable_block_element := | code = Verbatim; { `Verbatim code } - | element = located( inline_element ); { `Paragraph [ element ] } + | element = located(inline_element); { `Paragraph [ element ] } | code_block = Code_block; <`Code_block> | modules = located(Modules); { `Modules [ modules ] } | _ = table; { raise @@ exn_location ~only_for_debugging:$loc } From 146e4cc10b6612c3814fcc83c8df6ede0ec7b269 Mon Sep 17 00:00:00 2001 From: faycarsons Date: Wed, 4 Sep 2024 12:52:11 -0400 Subject: [PATCH 015/150] Pass current filename to Menhir parser so that it is used in AST locations --- src/parser/odoc_parser.ml | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/src/parser/odoc_parser.ml b/src/parser/odoc_parser.ml index 0e8e879649..1f1af21ab8 100644 --- a/src/parser/odoc_parser.ml +++ b/src/parser/odoc_parser.ml @@ -102,11 +102,14 @@ let position_of_point : t -> Loc.point -> Lexing.position = (* The main entry point for this module *) let parse_comment ~location ~text = let reversed_newlines = reversed_newlines ~input:text in - let lexbuf = Lexing.from_string text in + let lexbuf = Lexing.from_string text in + (* We cannot directly pass parameters to Menhir without converting our parser + to a module functor. So we pass our current filename to the lexbuf here *) + lexbuf.lex_curr_p <- { lexbuf.lex_curr_p with pos_fname = Lexing.(location.pos_fname) }; let lexer_state = Lexer.{ warnings = [] ; offset_to_location = offset_to_location ~reversed_newlines ~comment_location:location - ; file = "foo.ml" } + ; file = Lexing.(location.pos_fname) } in let unwrapped_token : Lexing.lexbuf -> Parser.token = fun lexbuf -> let with_location = Lexer.token lexer_state lexbuf in From 7f32a0033ecf2f09370080aea32317fcc1736fec Mon Sep 17 00:00:00 2001 From: faycarsons Date: Wed, 4 Sep 2024 14:00:07 -0400 Subject: [PATCH 016/150] refactor exception handling + debug exns --- src/parser/parser.mly | 45 +++++++++++++++++++++++++------------------ 1 file changed, 26 insertions(+), 19 deletions(-) diff --git a/src/parser/parser.mly b/src/parser/parser.mly index de4277af05..c7095e0bd1 100644 --- a/src/parser/parser.mly +++ b/src/parser/parser.mly @@ -1,6 +1,5 @@ %{ [@@@warning "-32"] - open Error open Parser_types let point_of_position Lexing.{ pos_lnum; pos_cnum; _ } = @@ -18,28 +17,36 @@ let location = to_location loc in { location; value } - let throw : lexspan -> Error.parser_error -> unit = fun loc error -> - raise @@ Parser_error (wrap_location loc error) - - - exception Debug of [ `DEBUG ] Loc.with_location - let raise_unimplemented : only_for_debugging:lexspan -> 'a = - fun ~only_for_debugging:loc -> - raise @@ Debug (wrap_location loc `DEBUG) - - let exn_location : only_for_debugging:lexspan -> exn = - fun ~only_for_debugging:loc -> Debug (wrap_location loc `DEBUG) + let pp_tag : Parser_types.tag -> string = function + | Author _ -> "@author" + | Deprecated -> "@deprecated" + | Param _ -> "@param" + | Raise _ -> "@raise/@raises" + | Return -> "@return" + | See _ -> "@see" + | Since _ -> "@since" + | Before _ -> "@before" + | Version _ -> "@version" + | Canonical _ -> "@canonical" + | Inline | Open | Closed | Hidden -> "" + +type unimplemented = Top_level | Table | Media +exception Debug of unimplemented Loc.with_location +exception No_children of string Loc.with_location + + let exn_location : only_for_debugging:lexspan -> failed_on:unimplemented -> exn = + fun ~only_for_debugging:loc ~failed_on -> Debug (wrap_location loc failed_on) let tag : Ast.tag -> Ast.block_element = fun tag -> `Tag tag - let tag_with_element loc children = function + let tag_with_element (loc : lexspan) (children : Ast.nestable_block_element) : Parser_types.tag -> Ast.block_element = function | Before version -> tag @@ `Before (version, [ wrap_location loc children ]) | Deprecated -> tag @@ `Deprecated [ wrap_location loc children ] | Return -> tag @@ `Return [ wrap_location loc children ] | Param param_name -> tag @@ `Param (param_name, [ wrap_location loc children ]) | Raise exn -> tag @@ `Raise (exn, [ wrap_location loc children ]) | See (kind, href) -> tag @@ `See (kind, href, [ wrap_location loc children ]) - | _ -> raise @@ exn_location ~only_for_debugging:( loc ) + | _ -> assert false (* Unreachable *) let tag_bare loc = function | Version version -> tag @@ `Version version @@ -50,7 +57,7 @@ | Open -> `Tag `Open | Closed -> `Tag `Closed | Hidden -> `Tag `Hidden - | _ -> raise @@ exn_location ~only_for_debugging:( loc ) + | tag -> raise @@ No_children (wrap_location loc @@ Printf.sprintf "Tag %s expects children" (pp_tag tag)) %} @@ -121,7 +128,7 @@ let main := } | END; { [] } - | _ = error; { raise @@ exn_location ~only_for_debugging:$loc } + | _ = error; { raise @@ exn_location ~only_for_debugging:$loc ~failed_on:Top_level } let whitespace := | SPACE; { `Space " " } @@ -161,15 +168,15 @@ let list_heavy := ); { `List (list_kind, `Heavy, [ items ]) } -let table := error; { raise @@ exn_location ~only_for_debugging:$loc } +let table := error; { raise @@ exn_location ~only_for_debugging:$loc ~failed_on:Table } let nestable_block_element := | code = Verbatim; { `Verbatim code } | element = located(inline_element); { `Paragraph [ element ] } | code_block = Code_block; <`Code_block> | modules = located(Modules); { `Modules [ modules ] } - | _ = table; { raise @@ exn_location ~only_for_debugging:$loc } - | _ = Media; { raise @@ exn_location ~only_for_debugging:$loc } + | _ = table; { raise @@ exn_location ~only_for_debugging:$loc ~failed_on:Table } + | _ = Media; { raise @@ exn_location ~only_for_debugging:$loc ~failed_on:Media } | ~ = Math_block; <`Math_block> | ~ = list_light; <> | ~ = list_heavy; <> From bcb99df3cd2083d3053efcd6b2f4a630e7817092 Mon Sep 17 00:00:00 2001 From: faycarsons Date: Wed, 4 Sep 2024 14:15:10 -0400 Subject: [PATCH 017/150] Add printer for Debug exceptions --- src/parser/parser.mly | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/src/parser/parser.mly b/src/parser/parser.mly index c7095e0bd1..603692c2d4 100644 --- a/src/parser/parser.mly +++ b/src/parser/parser.mly @@ -30,8 +30,21 @@ | Canonical _ -> "@canonical" | Inline | Open | Closed | Hidden -> "" -type unimplemented = Top_level | Table | Media +type unimplemented = Top_level_error | Table | Media exception Debug of unimplemented Loc.with_location +let _ = Printexc.register_printer (function + | Debug unimplemented_token_with_location -> + begin + let Loc.{ location = _location; value = token } = unimplemented_token_with_location in + let error_message = match token with + | Top_level_error -> "Error in Parser.main rule" + | Table -> "table" + | Media -> "media" + in + Option.some @@ Printf.sprintf "Parser failed on: %s" error_message + end + | _ -> None +) exception No_children of string Loc.with_location let exn_location : only_for_debugging:lexspan -> failed_on:unimplemented -> exn = @@ -128,7 +141,7 @@ let main := } | END; { [] } - | _ = error; { raise @@ exn_location ~only_for_debugging:$loc ~failed_on:Top_level } + | _ = error; { raise @@ exn_location ~only_for_debugging:$loc ~failed_on:Top_level_error } let whitespace := | SPACE; { `Space " " } From b5287d545efe373792ca781af0af19639dfa3206 Mon Sep 17 00:00:00 2001 From: faycarsons Date: Thu, 5 Sep 2024 14:10:21 -0400 Subject: [PATCH 018/150] Add rule for headers --- src/parser/parser.mly | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/src/parser/parser.mly b/src/parser/parser.mly index 603692c2d4..96d0623a8d 100644 --- a/src/parser/parser.mly +++ b/src/parser/parser.mly @@ -139,7 +139,7 @@ let main := let block = (block :> Ast.block_element Loc.with_location) in [ block ] } - + | header = located( heading ); { [ header ]} | END; { [] } | _ = error; { raise @@ exn_location ~only_for_debugging:$loc ~failed_on:Top_level_error } @@ -194,6 +194,11 @@ let nestable_block_element := | ~ = list_light; <> | ~ = list_heavy; <> +let heading := + | (num, title) = Section_heading; children = list(located(inline_element)); { + `Heading (num, title, children) :> Ast.block_element + } + let tag := | inner_tag = Tag; children = nestable_block_element; { tag_with_element $loc children inner_tag } | inner_tag = Tag; { tag_bare $loc inner_tag } From ee2bc7014c31e4c820cff1fe3262b930fb23c705 Mon Sep 17 00:00:00 2001 From: faycarsons Date: Thu, 5 Sep 2024 15:15:15 -0400 Subject: [PATCH 019/150] Refactor table tokens to follow Menhir all-caps convention --- src/parser/lexer.mll | 6 +++--- src/parser/parser.mly | 6 +++--- src/parser/parser_utils.ml | 12 ++++++------ 3 files changed, 12 insertions(+), 12 deletions(-) diff --git a/src/parser/lexer.mll b/src/parser/lexer.mll index 7a039b0b0f..6b5f7f0a41 100644 --- a/src/parser/lexer.mll +++ b/src/parser/lexer.mll @@ -502,13 +502,13 @@ and token input = parse { emit lexbuf input (List_item `Dash) } | "{table" - { emit lexbuf input Table_heavy } + { emit lexbuf input TABLE_HEAVY } | "{t" - { emit lexbuf input Table_light } + { emit lexbuf input TABLE_LIGHT } | "{tr" - { emit lexbuf input Table_row } + { emit lexbuf input TABLE_ROW } | "{th" { emit lexbuf input (Table_cell `Header) } diff --git a/src/parser/parser.mly b/src/parser/parser.mly index 96d0623a8d..f5da21592f 100644 --- a/src/parser/parser.mly +++ b/src/parser/parser.mly @@ -103,9 +103,9 @@ exception No_children of string Loc.with_location %token List %token List_item -%token Table_light -%token Table_heavy -%token Table_row +%token TABLE_LIGHT +%token TABLE_HEAVY +%token TABLE_ROW %token Table_cell %token Section_heading diff --git a/src/parser/parser_utils.ml b/src/parser/parser_utils.ml index 146930606d..5df2704f8f 100644 --- a/src/parser/parser_utils.ml +++ b/src/parser/parser_utils.ml @@ -13,9 +13,9 @@ let[@warning "-8"] print : Parser.token -> string = function | Link_with_replacement _ -> "'{{:'" | List_item `Li -> "'{li ...}'" | List_item `Dash -> "'{- ...}'" - | Table_light -> "{t" - | Table_heavy -> "{table" - | Table_row -> "'{tr'" + | TABLE_LIGHT -> "{t" + | TABLE_HEAVY -> "{table" + | TABLE_ROW -> "'{tr'" | Table_cell `Header -> "'{th'" | Table_cell `Data -> "'{td'" | MINUS -> "'-'" @@ -76,9 +76,9 @@ let[@warning "-8"] describe : Parser.token -> string = function | List `Ordered -> "'{ol ...}' (numbered list)" | List_item `Li -> "'{li ...}' (list item)" | List_item `Dash -> "'{- ...}' (list item)" - | Table_light -> "'{t ...}' (table)" - | Table_heavy -> "'{table ...}' (table)" - | Table_row -> "'{tr ...}' (table row)" + | TABLE_LIGHT -> "'{t ...}' (table)" + | TABLE_HEAVY -> "'{table ...}' (table)" + | TABLE_ROW -> "'{tr ...}' (table row)" | Table_cell `Header -> "'{th ... }' (table header cell)" | Table_cell `Data -> "'{td ... }' (table data cell)" | MINUS -> "'-' (bulleted list item)" From aa5d1f413734f1ed2a82da0585582f2bad2585e0 Mon Sep 17 00:00:00 2001 From: faycarsons Date: Thu, 5 Sep 2024 16:10:49 -0400 Subject: [PATCH 020/150] Add table rules --- src/parser/lexer.mll | 2 +- src/parser/parser.mly | 17 +++++++++++++++-- 2 files changed, 16 insertions(+), 3 deletions(-) diff --git a/src/parser/lexer.mll b/src/parser/lexer.mll index 6b5f7f0a41..2dad5ae778 100644 --- a/src/parser/lexer.mll +++ b/src/parser/lexer.mll @@ -462,7 +462,7 @@ and token input = parse warning lexbuf input ~start_offset (Parse_error.language_tag_invalid_char lang_tag_ c); - (* NOTE : Metadata should not be `None`, fix this!! *) + (* NOTE : (@faycarsons) Metadata should not be `None` *) code_block start_offset (Lexing.lexeme_end lexbuf) None (Buffer.create 256) delimiter input lexbuf } diff --git a/src/parser/parser.mly b/src/parser/parser.mly index f5da21592f..5d677ad2dc 100644 --- a/src/parser/parser.mly +++ b/src/parser/parser.mly @@ -181,14 +181,27 @@ let list_heavy := ); { `List (list_kind, `Heavy, [ items ]) } -let table := error; { raise @@ exn_location ~only_for_debugging:$loc ~failed_on:Table } +let heavy_cell := cell_kind = Table_cell; children = list(located(nestable_block_element)); { (children, cell_kind) } +let heavy_row == TABLE_ROW; cells = list(heavy_cell); { cells } +let table_heavy == TABLE_HEAVY; grid = list(heavy_row); { + (* Convert into an 'abstract table' which can be either a light or heavy syntax table. + We know this is a heavy table, which cannot have alignment, however, so the alignment field is `None` *) + let abstract : Ast.nestable_block_element Ast.abstract_table = (grid, None) in + (abstract, `Heavy) + } + +let table_light := TABLE_LIGHT; { raise @@ exn_location ~only_for_debugging:$loc ~failed_on:Table } + +let table := + | ~ = table_heavy; <`Table> + | ~ = table_light; <`Table> let nestable_block_element := | code = Verbatim; { `Verbatim code } | element = located(inline_element); { `Paragraph [ element ] } | code_block = Code_block; <`Code_block> | modules = located(Modules); { `Modules [ modules ] } - | _ = table; { raise @@ exn_location ~only_for_debugging:$loc ~failed_on:Table } + | ~ = table; <> | _ = Media; { raise @@ exn_location ~only_for_debugging:$loc ~failed_on:Media } | ~ = Math_block; <`Math_block> | ~ = list_light; <> From 44e5cb6cf28a4c649de736a65804be4605f3bae0 Mon Sep 17 00:00:00 2001 From: faycarsons Date: Fri, 6 Sep 2024 10:41:37 -0400 Subject: [PATCH 021/150] Simplify and add comment for `unwrapped_token` in `Odoc_parser.parse_comment` --- src/parser/odoc_parser.ml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/parser/odoc_parser.ml b/src/parser/odoc_parser.ml index 1f1af21ab8..6e11781082 100644 --- a/src/parser/odoc_parser.ml +++ b/src/parser/odoc_parser.ml @@ -111,9 +111,9 @@ let parse_comment ~location ~text = ; offset_to_location = offset_to_location ~reversed_newlines ~comment_location:location ; file = Lexing.(location.pos_fname) } in - let unwrapped_token : Lexing.lexbuf -> Parser.token = fun lexbuf -> - let with_location = Lexer.token lexer_state lexbuf in - Loc.value with_location + (* Remove the `Loc.with_location` wrapping our token because Menhir cannot handle that *) + let unwrapped_token lexbuf = + Lexer.token lexer_state lexbuf |> Loc.value in let ast = Parser.main unwrapped_token lexbuf in { ast; warnings = lexer_state.warnings; reversed_newlines; original_pos = location } From 6934b99b62a2b5d4f7573fd7a2cc3899ebe2542a Mon Sep 17 00:00:00 2001 From: faycarsons Date: Fri, 6 Sep 2024 11:58:09 -0400 Subject: [PATCH 022/150] Clean up `nestable_block_element` rule producers, list rule --- src/parser/parser.mly | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/src/parser/parser.mly b/src/parser/parser.mly index 5d677ad2dc..0e30a57761 100644 --- a/src/parser/parser.mly +++ b/src/parser/parser.mly @@ -181,9 +181,13 @@ let list_heavy := ); { `List (list_kind, `Heavy, [ items ]) } -let heavy_cell := cell_kind = Table_cell; children = list(located(nestable_block_element)); { (children, cell_kind) } -let heavy_row == TABLE_ROW; cells = list(heavy_cell); { cells } -let table_heavy == TABLE_HEAVY; grid = list(heavy_row); { +let odoc_list := + | ~ = list_light; <> + | ~ = list_heavy; <> + +let cell_heavy := cell_kind = Table_cell; children = list(located(nestable_block_element)); { (children, cell_kind) } +let row_heavy == TABLE_ROW; cells = list(cell_heavy); { cells } +let table_heavy == TABLE_HEAVY; grid = row_heavy+; RIGHT_BRACE; { (* Convert into an 'abstract table' which can be either a light or heavy syntax table. We know this is a heavy table, which cannot have alignment, however, so the alignment field is `None` *) let abstract : Ast.nestable_block_element Ast.abstract_table = (grid, None) in @@ -197,10 +201,10 @@ let table := | ~ = table_light; <`Table> let nestable_block_element := - | code = Verbatim; { `Verbatim code } - | element = located(inline_element); { `Paragraph [ element ] } - | code_block = Code_block; <`Code_block> - | modules = located(Modules); { `Modules [ modules ] } + | ~ = Verbatim; <`Verbatim> + | ~ = located(inline_element) +; <`Paragraph> + | ~ = Code_block; <`Code_block> + | ~ = located(Modules) +; <`Modules> | ~ = table; <> | _ = Media; { raise @@ exn_location ~only_for_debugging:$loc ~failed_on:Media } | ~ = Math_block; <`Math_block> From a95314074283585a188fd833ce83500aec777339 Mon Sep 17 00:00:00 2001 From: faycarsons Date: Fri, 6 Sep 2024 13:24:58 -0400 Subject: [PATCH 023/150] Add section delimiting comments, clean up `ref` producers --- src/parser/parser.mly | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) diff --git a/src/parser/parser.mly b/src/parser/parser.mly index 0e30a57761..f17543b759 100644 --- a/src/parser/parser.mly +++ b/src/parser/parser.mly @@ -126,8 +126,11 @@ exception No_children of string Loc.with_location %% +(* Utility which wraps the return value of a producer in `Loc.with_location` *) let located(rule) == value = rule; { wrap_location $loc value } +(* ENTRY-POINT *) + let main := | _ = whitespace; { [] } | t = located(tag); { [ t ] } @@ -150,6 +153,8 @@ let whitespace := | ~ = Blank_line; <`Space> | ~ = Single_newline; <`Space> +(* INLINE ELEMENTS *) + let inline_element := | ~ = Space; <`Space> | ~ = Word; <`Word> @@ -161,14 +166,16 @@ let inline_element := | ~ = link; <> let ref := - | ref_body = located(Simple_ref ); children = located( inline_element ); { `Reference (`Simple, ref_body, [ children ]) } - | ref_body = located(Ref_with_replacement); children = located( inline_element ); { `Reference (`With_text, ref_body, [ children ]) } + | ref_body = located(Simple_ref ); children = located(inline_element)*; { `Reference (`Simple, ref_body, children) } + | ref_body = located(Ref_with_replacement); children = located(inline_element)*; { `Reference (`With_text, ref_body, children) } (* TODO : Fix the `with_replacement` producers in the following two rules, if they're broken. Ask what `with_replacement` refers to *) let link := | link_body = Simple_link; children = located(inline_element); { `Link (link_body, [ children ]) } | link_body = Link_with_replacement; children = located(inline_element); { `Link (link_body, [ children ]) } +(* LIST *) + let list_light := | MINUS; unordered_items = separated_list(NEWLINE; MINUS, located(nestable_block_element)); { `List (`Unordered, `Light, [ unordered_items ]) } | PLUS; ordered_items = separated_list(NEWLINE; PLUS, located(nestable_block_element)); { `List (`Ordered, `Light, [ ordered_items ]) } @@ -181,10 +188,12 @@ let list_heavy := ); { `List (list_kind, `Heavy, [ items ]) } -let odoc_list := +let list_element := | ~ = list_light; <> | ~ = list_heavy; <> +(* TABLES *) + let cell_heavy := cell_kind = Table_cell; children = list(located(nestable_block_element)); { (children, cell_kind) } let row_heavy == TABLE_ROW; cells = list(cell_heavy); { cells } let table_heavy == TABLE_HEAVY; grid = row_heavy+; RIGHT_BRACE; { @@ -200,16 +209,17 @@ let table := | ~ = table_heavy; <`Table> | ~ = table_light; <`Table> +(* TOP-LEVEL ELEMENTS *) + let nestable_block_element := | ~ = Verbatim; <`Verbatim> | ~ = located(inline_element) +; <`Paragraph> | ~ = Code_block; <`Code_block> | ~ = located(Modules) +; <`Modules> + | ~ = list_element; <> | ~ = table; <> | _ = Media; { raise @@ exn_location ~only_for_debugging:$loc ~failed_on:Media } | ~ = Math_block; <`Math_block> - | ~ = list_light; <> - | ~ = list_heavy; <> let heading := | (num, title) = Section_heading; children = list(located(inline_element)); { From 4c5fefd6728e7c527f1db60332f1cf6ef8865e8b Mon Sep 17 00:00:00 2001 From: faycarsons Date: Fri, 6 Sep 2024 13:49:48 -0400 Subject: [PATCH 024/150] Add braces and whitespace to producers --- src/parser/parser.mly | 23 +++++++++++------------ 1 file changed, 11 insertions(+), 12 deletions(-) diff --git a/src/parser/parser.mly b/src/parser/parser.mly index f17543b759..9df03bc451 100644 --- a/src/parser/parser.mly +++ b/src/parser/parser.mly @@ -132,7 +132,6 @@ let located(rule) == value = rule; { wrap_location $loc value } (* ENTRY-POINT *) let main := - | _ = whitespace; { [] } | t = located(tag); { [ t ] } (* NOTE : (@faycarsons) Is this type coercion really necessary?? I couldn't @@ -160,25 +159,25 @@ let inline_element := | ~ = Word; <`Word> | ~ = Code_span; <`Code_span> | ~ = Raw_markup; <`Raw_markup> - | style = Style; inner = located( inline_element ); { `Styled (style, [ inner ]) } + | style = Style; inner = located(inline_element)+; { `Styled (style, inner) } | ~ = Math_span; <`Math_span> | ~ = ref; <> | ~ = link; <> let ref := - | ref_body = located(Simple_ref ); children = located(inline_element)*; { `Reference (`Simple, ref_body, children) } - | ref_body = located(Ref_with_replacement); children = located(inline_element)*; { `Reference (`With_text, ref_body, children) } + | ref_body = located(Simple_ref ); children = located(inline_element)*; RIGHT_BRACE; { `Reference (`Simple, ref_body, children) } + | ref_body = located(Ref_with_replacement); children = located(inline_element)*; RIGHT_BRACE; { `Reference (`With_text, ref_body, children) } (* TODO : Fix the `with_replacement` producers in the following two rules, if they're broken. Ask what `with_replacement` refers to *) let link := - | link_body = Simple_link; children = located(inline_element); { `Link (link_body, [ children ]) } - | link_body = Link_with_replacement; children = located(inline_element); { `Link (link_body, [ children ]) } + | link_body = Simple_link; children = located(inline_element); RIGHT_BRACE; { `Link (link_body, [ children ]) } + | link_body = Link_with_replacement; children = located(inline_element); RIGHT_BRACE; { `Link (link_body, [ children ]) } (* LIST *) let list_light := - | MINUS; unordered_items = separated_list(NEWLINE; MINUS, located(nestable_block_element)); { `List (`Unordered, `Light, [ unordered_items ]) } - | PLUS; ordered_items = separated_list(NEWLINE; PLUS, located(nestable_block_element)); { `List (`Ordered, `Light, [ ordered_items ]) } + | MINUS; unordered_items = separated_list(NEWLINE; SPACE?; MINUS, located(nestable_block_element)); { `List (`Unordered, `Light, [ unordered_items ]) } + | PLUS; ordered_items = separated_list(NEWLINE; SPACE?; PLUS, located(nestable_block_element)); { `List (`Ordered, `Light, [ ordered_items ]) } let list_heavy := | list_kind = List; @@ -194,9 +193,9 @@ let list_element := (* TABLES *) -let cell_heavy := cell_kind = Table_cell; children = list(located(nestable_block_element)); { (children, cell_kind) } -let row_heavy == TABLE_ROW; cells = list(cell_heavy); { cells } -let table_heavy == TABLE_HEAVY; grid = row_heavy+; RIGHT_BRACE; { +let cell_heavy := cell_kind = Table_cell; SPACE?; children = list(located(nestable_block_element)); SPACE?; NEWLINE?; RIGHT_BRACE; { (children, cell_kind) } +let row_heavy == TABLE_ROW; cells = list(cell_heavy); SPACE*; NEWLINE?; SPACE*; RIGHT_BRACE; { cells } +let table_heavy == TABLE_HEAVY; grid = row_heavy+; SPACE*; NEWLINE?; SPACE*; RIGHT_BRACE; { (* Convert into an 'abstract table' which can be either a light or heavy syntax table. We know this is a heavy table, which cannot have alignment, however, so the alignment field is `None` *) let abstract : Ast.nestable_block_element Ast.abstract_table = (grid, None) in @@ -222,7 +221,7 @@ let nestable_block_element := | ~ = Math_block; <`Math_block> let heading := - | (num, title) = Section_heading; children = list(located(inline_element)); { + | (num, title) = Section_heading; children = list(located(inline_element)); RIGHT_BRACE; { `Heading (num, title, children) :> Ast.block_element } From 76a6c641cf057a28dc4ef380eb0609f0e14a6cca Mon Sep 17 00:00:00 2001 From: faycarsons Date: Fri, 6 Sep 2024 15:05:53 -0400 Subject: [PATCH 025/150] Refactor toplevels, add delimiters where missing, format --- src/parser/parser.mly | 53 +++++++++++++++++++++---------------------- 1 file changed, 26 insertions(+), 27 deletions(-) diff --git a/src/parser/parser.mly b/src/parser/parser.mly index 9df03bc451..e5e67da70f 100644 --- a/src/parser/parser.mly +++ b/src/parser/parser.mly @@ -52,13 +52,13 @@ exception No_children of string Loc.with_location let tag : Ast.tag -> Ast.block_element = fun tag -> `Tag tag - let tag_with_element (loc : lexspan) (children : Ast.nestable_block_element) : Parser_types.tag -> Ast.block_element = function - | Before version -> tag @@ `Before (version, [ wrap_location loc children ]) - | Deprecated -> tag @@ `Deprecated [ wrap_location loc children ] - | Return -> tag @@ `Return [ wrap_location loc children ] - | Param param_name -> tag @@ `Param (param_name, [ wrap_location loc children ]) - | Raise exn -> tag @@ `Raise (exn, [ wrap_location loc children ]) - | See (kind, href) -> tag @@ `See (kind, href, [ wrap_location loc children ]) + let tag_with_element (children : Ast.nestable_block_element Loc.with_location list) : Parser_types.tag -> Ast.block_element = function + | Before version -> tag @@ `Before (version, children) + | Deprecated -> tag @@ `Deprecated children + | Return -> tag @@ `Return children + | Param param_name -> tag @@ `Param (param_name, children) + | Raise exn -> tag @@ `Raise (exn, children) + | See (kind, href) -> tag @@ `See (kind, href, children) | _ -> assert false (* Unreachable *) let tag_bare loc = function @@ -132,18 +132,14 @@ let located(rule) == value = rule; { wrap_location $loc value } (* ENTRY-POINT *) let main := - | t = located(tag); { [ t ] } - - (* NOTE : (@faycarsons) Is this type coercion really necessary?? I couldn't - figure out another way to get this producer to typecheck but this feels - hacky *) - | block = located(nestable_block_element); { - let block = (block :> Ast.block_element Loc.with_location) in - [ block ] - } - | header = located( heading ); { [ header ]} + | ~ = located(toplevel)+; <> | END; { [] } - | _ = error; { raise @@ exn_location ~only_for_debugging:$loc ~failed_on:Top_level_error } + | error; { raise @@ exn_location ~only_for_debugging:$loc ~failed_on:Top_level_error } + +let toplevel := + | ~ = tag; <> + | block = nestable_block_element; { (block :> Ast.block_element) } + | ~ = heading; <> let whitespace := | SPACE; { `Space " " } @@ -176,15 +172,18 @@ let link := (* LIST *) let list_light := - | MINUS; unordered_items = separated_list(NEWLINE; SPACE?; MINUS, located(nestable_block_element)); { `List (`Unordered, `Light, [ unordered_items ]) } - | PLUS; ordered_items = separated_list(NEWLINE; SPACE?; PLUS, located(nestable_block_element)); { `List (`Ordered, `Light, [ ordered_items ]) } + | MINUS; unordered_items = separated_list(NEWLINE; SPACE?; MINUS, located(nestable_block_element)); + { `List (`Unordered, `Light, [ unordered_items ]) } + + | PLUS; ordered_items = separated_list(NEWLINE; SPACE?; PLUS, located(nestable_block_element)); + { `List (`Ordered, `Light, [ ordered_items ]) } let list_heavy := | list_kind = List; items = separated_list( NEWLINE; _ = List_item; SPACE?; RIGHT_BRACE, located(nestable_block_element) - ); + ); RIGHT_BRACE; { `List (list_kind, `Heavy, [ items ]) } let list_element := @@ -194,8 +193,8 @@ let list_element := (* TABLES *) let cell_heavy := cell_kind = Table_cell; SPACE?; children = list(located(nestable_block_element)); SPACE?; NEWLINE?; RIGHT_BRACE; { (children, cell_kind) } -let row_heavy == TABLE_ROW; cells = list(cell_heavy); SPACE*; NEWLINE?; SPACE*; RIGHT_BRACE; { cells } -let table_heavy == TABLE_HEAVY; grid = row_heavy+; SPACE*; NEWLINE?; SPACE*; RIGHT_BRACE; { +let row_heavy == TABLE_ROW; cells = list(cell_heavy); RIGHT_BRACE; { cells } +let table_heavy == TABLE_HEAVY; grid = row_heavy*; RIGHT_BRACE; { (* Convert into an 'abstract table' which can be either a light or heavy syntax table. We know this is a heavy table, which cannot have alignment, however, so the alignment field is `None` *) let abstract : Ast.nestable_block_element Ast.abstract_table = (grid, None) in @@ -212,9 +211,9 @@ let table := let nestable_block_element := | ~ = Verbatim; <`Verbatim> - | ~ = located(inline_element) +; <`Paragraph> - | ~ = Code_block; <`Code_block> - | ~ = located(Modules) +; <`Modules> + | ~ = located(inline_element)+; <`Paragraph> + | ~ = Code_block; RIGHT_CODE_DELIMITER; <`Code_block> + | ~ = located(Modules)+; RIGHT_BRACE; <`Modules> | ~ = list_element; <> | ~ = table; <> | _ = Media; { raise @@ exn_location ~only_for_debugging:$loc ~failed_on:Media } @@ -226,7 +225,7 @@ let heading := } let tag := - | inner_tag = Tag; children = nestable_block_element; { tag_with_element $loc children inner_tag } + | inner_tag = Tag; children = located(nestable_block_element)+; { tag_with_element children inner_tag } | inner_tag = Tag; { tag_bare $loc inner_tag } let style := ~ = Style; <> From 13773280a43ffb51e918a1cab0408e84665fbb73 Mon Sep 17 00:00:00 2001 From: faycarsons Date: Fri, 6 Sep 2024 15:23:13 -0400 Subject: [PATCH 026/150] Formatting, fix `list_heavy` rule, swap empty + nonempty lists where needed --- src/parser/parser.mly | 41 +++++++++++++++++++++++------------------ 1 file changed, 23 insertions(+), 18 deletions(-) diff --git a/src/parser/parser.mly b/src/parser/parser.mly index e5e67da70f..042763d699 100644 --- a/src/parser/parser.mly +++ b/src/parser/parser.mly @@ -141,12 +141,16 @@ let toplevel := | block = nestable_block_element; { (block :> Ast.block_element) } | ~ = heading; <> -let whitespace := - | SPACE; { `Space " " } - | NEWLINE; { `Space "\n" } - | ~ = Space; <`Space> - | ~ = Blank_line; <`Space> - | ~ = Single_newline; <`Space> +(* We'll need some of these later I'd imagine, not sure about the whole rule. + Which elements are strict about spaces or newlines? + + let whitespace := + | SPACE; { `Space " " } + | NEWLINE; { `Space "\n" } + | ~ = Space; <`Space> + | ~ = Blank_line; <`Space> + | ~ = Single_newline; <`Space> +*) (* INLINE ELEMENTS *) @@ -155,19 +159,22 @@ let inline_element := | ~ = Word; <`Word> | ~ = Code_span; <`Code_span> | ~ = Raw_markup; <`Raw_markup> - | style = Style; inner = located(inline_element)+; { `Styled (style, inner) } + | style = Style; inner = located(inline_element)*; { `Styled (style, inner) } | ~ = Math_span; <`Math_span> | ~ = ref; <> | ~ = link; <> let ref := - | ref_body = located(Simple_ref ); children = located(inline_element)*; RIGHT_BRACE; { `Reference (`Simple, ref_body, children) } - | ref_body = located(Ref_with_replacement); children = located(inline_element)*; RIGHT_BRACE; { `Reference (`With_text, ref_body, children) } + | ref_body = located(Simple_ref); children = located(inline_element)+; RIGHT_BRACE; + { `Reference (`Simple, ref_body, children) } + + | ref_body = located(Ref_with_replacement); children = located(inline_element)*; RIGHT_BRACE; + { `Reference (`With_text, ref_body, children) } (* TODO : Fix the `with_replacement` producers in the following two rules, if they're broken. Ask what `with_replacement` refers to *) let link := - | link_body = Simple_link; children = located(inline_element); RIGHT_BRACE; { `Link (link_body, [ children ]) } - | link_body = Link_with_replacement; children = located(inline_element); RIGHT_BRACE; { `Link (link_body, [ children ]) } + | link_body = Simple_link; children = located(inline_element)+; RIGHT_BRACE; { `Link (link_body, children) } + | link_body = Link_with_replacement; children = located(inline_element)+; RIGHT_BRACE; { `Link (link_body, children) } (* LIST *) @@ -178,13 +185,11 @@ let list_light := | PLUS; ordered_items = separated_list(NEWLINE; SPACE?; PLUS, located(nestable_block_element)); { `List (`Ordered, `Light, [ ordered_items ]) } +(* `List_item` is [ `Li | `Dash ], not sure how that's useful though. Can't find '{li' syntax in Odoc docs *) +let item_heavy == _ = List_item; ~ = located(nestable_block_element)*; RIGHT_BRACE; <> let list_heavy := - | list_kind = List; - items = separated_list( - NEWLINE; _ = List_item; SPACE?; RIGHT_BRACE, - located(nestable_block_element) - ); RIGHT_BRACE; - { `List (list_kind, `Heavy, [ items ]) } + | list_kind = List; items = item_heavy*; RIGHT_BRACE; + { `List (list_kind, `Heavy, items) } let list_element := | ~ = list_light; <> @@ -192,7 +197,7 @@ let list_element := (* TABLES *) -let cell_heavy := cell_kind = Table_cell; SPACE?; children = list(located(nestable_block_element)); SPACE?; NEWLINE?; RIGHT_BRACE; { (children, cell_kind) } +let cell_heavy := cell_kind = Table_cell; children = located(nestable_block_element)*; RIGHT_BRACE; { (children, cell_kind) } let row_heavy == TABLE_ROW; cells = list(cell_heavy); RIGHT_BRACE; { cells } let table_heavy == TABLE_HEAVY; grid = row_heavy*; RIGHT_BRACE; { (* Convert into an 'abstract table' which can be either a light or heavy syntax table. From bf52c6c7d2ad28d6ccd9a10aace79dc6694761c9 Mon Sep 17 00:00:00 2001 From: faycarsons Date: Fri, 6 Sep 2024 16:47:38 -0400 Subject: [PATCH 027/150] remove unused `COMMENT` token --- src/parser/parser.mly | 1 - 1 file changed, 1 deletion(-) diff --git a/src/parser/parser.mly b/src/parser/parser.mly index 042763d699..0a1d637785 100644 --- a/src/parser/parser.mly +++ b/src/parser/parser.mly @@ -77,7 +77,6 @@ exception No_children of string Loc.with_location %token SPACE NEWLINE %token RIGHT_BRACE %token RIGHT_CODE_DELIMITER -%token COMMENT %token Blank_line %token Single_newline From 31e7b96c7dd88fe1c5e62fc7c8768f3b1f972abf Mon Sep 17 00:00:00 2001 From: faycarsons Date: Fri, 6 Sep 2024 16:54:48 -0400 Subject: [PATCH 028/150] Remove reference to `COMMENT` token in `parser_utils.ml` --- src/parser/parser_utils.ml | 1 - 1 file changed, 1 deletion(-) diff --git a/src/parser/parser_utils.ml b/src/parser/parser_utils.ml index 5df2704f8f..589ae7aac0 100644 --- a/src/parser/parser_utils.ml +++ b/src/parser/parser_utils.ml @@ -100,7 +100,6 @@ let[@warning "-8"] describe : Parser.token -> string = function | Tag Open -> "'@open'" | Tag Closed -> "'@closed'" | Tag Hidden -> "'@hidden" - | COMMENT -> "top-level text" (* NOTE : (@faycarsons) Should this be in Ast.ml? This takes an Ast.t no? *) let describe_element = function From bf33d8ee11c04b47ff7753bf430dd6cad488402c Mon Sep 17 00:00:00 2001 From: faycarsons Date: Tue, 10 Sep 2024 18:40:30 -0400 Subject: [PATCH 029/150] Add (previously forgotten) end-of-input handling :3 --- src/parser/parser.mly | 20 ++++++++------------ 1 file changed, 8 insertions(+), 12 deletions(-) diff --git a/src/parser/parser.mly b/src/parser/parser.mly index 0a1d637785..7efecf878b 100644 --- a/src/parser/parser.mly +++ b/src/parser/parser.mly @@ -131,7 +131,7 @@ let located(rule) == value = rule; { wrap_location $loc value } (* ENTRY-POINT *) let main := - | ~ = located(toplevel)+; <> + | ~ = located(toplevel)+; END; <> | END; { [] } | error; { raise @@ exn_location ~only_for_debugging:$loc ~failed_on:Top_level_error } @@ -140,21 +140,17 @@ let toplevel := | block = nestable_block_element; { (block :> Ast.block_element) } | ~ = heading; <> -(* We'll need some of these later I'd imagine, not sure about the whole rule. - Which elements are strict about spaces or newlines? - - let whitespace := - | SPACE; { `Space " " } - | NEWLINE; { `Space "\n" } - | ~ = Space; <`Space> - | ~ = Blank_line; <`Space> - | ~ = Single_newline; <`Space> -*) +let whitespace := + | SPACE; { `Space " " } + | NEWLINE; { `Space "\n" } + | ~ = Space; <`Space> + | ~ = Blank_line; <`Space> + | ~ = Single_newline; <`Space> (* INLINE ELEMENTS *) let inline_element := - | ~ = Space; <`Space> + | ~ = whitespace; <> | ~ = Word; <`Word> | ~ = Code_span; <`Code_span> | ~ = Raw_markup; <`Raw_markup> From 5ab9a6898b1b53f4af42fb6eac344eea1dfc3a26 Mon Sep 17 00:00:00 2001 From: faycarsons Date: Wed, 11 Sep 2024 10:02:16 -0400 Subject: [PATCH 030/150] Add toplevel whitespace handling --- src/parser/parser.mly | 1 + 1 file changed, 1 insertion(+) diff --git a/src/parser/parser.mly b/src/parser/parser.mly index 7efecf878b..f8760afc59 100644 --- a/src/parser/parser.mly +++ b/src/parser/parser.mly @@ -132,6 +132,7 @@ let located(rule) == value = rule; { wrap_location $loc value } let main := | ~ = located(toplevel)+; END; <> + | _ = whitespace; { [] } | END; { [] } | error; { raise @@ exn_location ~only_for_debugging:$loc ~failed_on:Top_level_error } From 9d672be75dfbee61dd8252cf0ed0281322cbd71c Mon Sep 17 00:00:00 2001 From: faycarsons Date: Thu, 12 Sep 2024 17:46:05 -0400 Subject: [PATCH 031/150] Incomplete rules for light table syntax --- src/parser/dune | 3 +- src/parser/parser.mly | 108 +++++++++++++++++++++++++++++++++++------- 2 files changed, 93 insertions(+), 18 deletions(-) diff --git a/src/parser/dune b/src/parser/dune index d1130fc9d7..95dcaf6c6e 100644 --- a/src/parser/dune +++ b/src/parser/dune @@ -1,7 +1,8 @@ (ocamllex lexer) (menhir - (modules parser)) + (modules parser) + (flags --table --inspection --dump)) (library (name odoc_parser) diff --git a/src/parser/parser.mly b/src/parser/parser.mly index f8760afc59..db13316c5a 100644 --- a/src/parser/parser.mly +++ b/src/parser/parser.mly @@ -30,22 +30,22 @@ | Canonical _ -> "@canonical" | Inline | Open | Closed | Hidden -> "" -type unimplemented = Top_level_error | Table | Media -exception Debug of unimplemented Loc.with_location -let _ = Printexc.register_printer (function - | Debug unimplemented_token_with_location -> - begin - let Loc.{ location = _location; value = token } = unimplemented_token_with_location in - let error_message = match token with - | Top_level_error -> "Error in Parser.main rule" - | Table -> "table" - | Media -> "media" - in - Option.some @@ Printf.sprintf "Parser failed on: %s" error_message - end - | _ -> None -) -exception No_children of string Loc.with_location + type unimplemented = Top_level_error | Table | Media + exception Debug of unimplemented Loc.with_location + let _ = Printexc.register_printer (function + | Debug unimplemented_token_with_location -> + begin + let Loc.{ location = _location; value = token } = unimplemented_token_with_location in + let error_message = match token with + | Top_level_error -> "Error in Parser.main rule" + | Table -> "table" + | Media -> "media" + in + Option.some @@ Printf.sprintf "Parser failed on: %s" error_message + end + | _ -> None + ) + exception No_children of string Loc.with_location let exn_location : only_for_debugging:lexspan -> failed_on:unimplemented -> exn = fun ~only_for_debugging:loc ~failed_on -> Debug (wrap_location loc failed_on) @@ -72,6 +72,53 @@ exception No_children of string Loc.with_location | Hidden -> `Tag `Hidden | tag -> raise @@ No_children (wrap_location loc @@ Printf.sprintf "Tag %s expects children" (pp_tag tag)) + + + type align_cell = + | Valid of Ast.alignment option + | Invalid + + (* This could be handled in the parser, I think *) + let valid_align_cell text = + begin + match String.length text with + | 0 -> Valid None + | 1 -> + begin + match text.[0] with + | ':' -> Valid (Some `Center) + | '-' -> Valid None + | _ -> Invalid + end + | len -> + if String.for_all (Char.equal '-') (String.sub text 1 (len - 2)) then + match text.[0], text.[pred len] with + | ':', '-' -> Valid (Some `Left) + | '-', ':' -> Valid (Some `Right) + | ':', ':' -> Valid (Some `Center) + | '-', '-' -> Valid (None) + | _ -> Invalid + else Invalid + end + + let rec valid_align_row ?(acc = []) = function + | cell :: rest -> + begin + match valid_align_cell cell with + | Valid alignment -> + valid_align_row ~acc:(alignment :: acc) rest + | Invalid -> + None + end + | [] -> + Some acc + + (* Wrap a list of words in a paragraph, used for 'light' table headers *) + let paragraph_of_words words = + let words = List.map (Loc.map (fun text -> `Word text)) words + and span = Loc.span @@ List.map Loc.location words in + Loc.at span (`Paragraph words) + %} %token SPACE NEWLINE @@ -202,7 +249,34 @@ let table_heavy == TABLE_HEAVY; grid = row_heavy*; RIGHT_BRACE; { (abstract, `Heavy) } -let table_light := TABLE_LIGHT; { raise @@ exn_location ~only_for_debugging:$loc ~failed_on:Table } + +(* + cell -> nestable_block_element with_location list * Header | Data + row -> cell list + grid -> row list + abstract_table -> grid * alignment option list option + + table -> abstract_table * Light | Heavy +*) +let data_cell_light == BAR?; data = located(nestable_block_element)+; { inner, `Data } +let data_row_light := ~ = data_cell_light+; BAR?; NEWLINE; <> + +let alignment_cell_light == BAR?; inner = Word; { inner } +let header_cell_light == BAR?; inner = located(Word)+; { inner } + +let align_row_light == BAR?; ~ = alignment_cell_light+; BAR?; NEWLINE; <> +let header_row_light := ~ = header_cell_light+; BAR?; NEWLINE; <> + +let table_light := + (* If the first row is the alignment row then the rest should be data *) + | TABLE_LIGHT; align = align_row_light; data_rows = data_row_light+; RIGHT_BRACE; + {} + (* If there's only one row and it's not the align row, then it's data *) + | TABLE_LIGHT; header = header_row_light; align = align_row_light; data = data_row_light+; RIGHT_BRACE; + {} + (* Otherwise the first should be the headers, the second align, and the rest data *) + | TABLE_LIGHT; data = data_row_light; RIGHT_BRACE; + {} let table := | ~ = table_heavy; <`Table> From 39fc8d04d61e2dd999c1d182cb1c8fe01be26668 Mon Sep 17 00:00:00 2001 From: faycarsons Date: Fri, 13 Sep 2024 15:06:54 -0400 Subject: [PATCH 032/150] Light table syntax - NOT COMPILING --- src/parser/parser.mly | 34 ++++++++++++++++++++++++++-------- 1 file changed, 26 insertions(+), 8 deletions(-) diff --git a/src/parser/parser.mly b/src/parser/parser.mly index db13316c5a..eecf23e64b 100644 --- a/src/parser/parser.mly +++ b/src/parser/parser.mly @@ -113,11 +113,14 @@ | [] -> Some acc + let valid_data_grid (rows : Ast.nestable_block_element Ast.row list) = ( rows :> Ast.nestable_block_element Ast.grid ) + (* Wrap a list of words in a paragraph, used for 'light' table headers *) let paragraph_of_words words = let words = List.map (Loc.map (fun text -> `Word text)) words and span = Loc.span @@ List.map Loc.location words in Loc.at span (`Paragraph words) + %} @@ -249,7 +252,6 @@ let table_heavy == TABLE_HEAVY; grid = row_heavy*; RIGHT_BRACE; { (abstract, `Heavy) } - (* cell -> nestable_block_element with_location list * Header | Data row -> cell list @@ -258,25 +260,41 @@ let table_heavy == TABLE_HEAVY; grid = row_heavy*; RIGHT_BRACE; { table -> abstract_table * Light | Heavy *) -let data_cell_light == BAR?; data = located(nestable_block_element)+; { inner, `Data } -let data_row_light := ~ = data_cell_light+; BAR?; NEWLINE; <> -let alignment_cell_light == BAR?; inner = Word; { inner } -let header_cell_light == BAR?; inner = located(Word)+; { inner } +(* This produces a single cell *) +let data_cell_light := BAR?; data = located(nestable_block_element)+; { ((data, `Data) : Ast.nestable_block_element Ast.cell) } +(* This produces a list of cells *) +let data_row_light := row = data_cell_light+; BAR?; NEWLINE; { (row : Ast.nestable_block_element Ast.row) } + +let alignment_cell_light == BAR?; inner = Word; { inner } let align_row_light == BAR?; ~ = alignment_cell_light+; BAR?; NEWLINE; <> + +let header_cell_light == BAR?; inner = located(Word)+; { inner } let header_row_light := ~ = header_cell_light+; BAR?; NEWLINE; <> let table_light := (* If the first row is the alignment row then the rest should be data *) | TABLE_LIGHT; align = align_row_light; data_rows = data_row_light+; RIGHT_BRACE; - {} + { + match valid_align_row align with + | Some _ as alignment -> (data_rows, alignment), `Light + | None -> (align :: data, None),`Light + } + (* If there's only one row and it's not the align row, then it's data *) | TABLE_LIGHT; header = header_row_light; align = align_row_light; data = data_row_light+; RIGHT_BRACE; - {} + { let headers = ( paragraph_of_words header :> Ast.nestable_block_element Loc.with_location ), `Header in + (* Why wont this typecheck??? If I make data a single row it works but not if it's a grid, but a grid is what we want here :( *) + let grid = valid_data_grid data in + match valid_align_row align with + | Some _ as alignment -> (grid, alignment), `Light + | None -> raise @@ Failure "Invalid align row" (* Handle this later, if align is not an align row then its a row of single words which need to ne inserted into paragraphs *) + } + (* Otherwise the first should be the headers, the second align, and the rest data *) | TABLE_LIGHT; data = data_row_light; RIGHT_BRACE; - {} + { (data, None), `Light } let table := | ~ = table_heavy; <`Table> From ba2c3f30c589eb735d4483f5707370e0b37ccf99 Mon Sep 17 00:00:00 2001 From: faycarsons Date: Mon, 16 Sep 2024 13:23:46 -0400 Subject: [PATCH 033/150] Fix light table type conflict, reduce error remains --- src/parser/parser.mly | 30 ++++++++++++++---------------- 1 file changed, 14 insertions(+), 16 deletions(-) diff --git a/src/parser/parser.mly b/src/parser/parser.mly index eecf23e64b..bf83df85b9 100644 --- a/src/parser/parser.mly +++ b/src/parser/parser.mly @@ -72,8 +72,6 @@ | Hidden -> `Tag `Hidden | tag -> raise @@ No_children (wrap_location loc @@ Printf.sprintf "Tag %s expects children" (pp_tag tag)) - - type align_cell = | Valid of Ast.alignment option | Invalid @@ -116,12 +114,12 @@ let valid_data_grid (rows : Ast.nestable_block_element Ast.row list) = ( rows :> Ast.nestable_block_element Ast.grid ) (* Wrap a list of words in a paragraph, used for 'light' table headers *) - let paragraph_of_words words = + let paragraph_of_words : string Loc.with_location list -> Ast.nestable_block_element Ast.cell = fun words -> let words = List.map (Loc.map (fun text -> `Word text)) words and span = Loc.span @@ List.map Loc.location words in - Loc.at span (`Paragraph words) + [Loc.at span ( `Paragraph words )], `Header - + let assert_row : ( Ast.nestable_block_element Ast.with_location list * [ `Header | `Data ] ) list -> ( Ast.nestable_block_element Ast.with_location list * [ `Header | `Data ] ) list = Fun.id %} %token SPACE NEWLINE @@ -262,16 +260,16 @@ let table_heavy == TABLE_HEAVY; grid = row_heavy*; RIGHT_BRACE; { *) (* This produces a single cell *) -let data_cell_light := BAR?; data = located(nestable_block_element)+; { ((data, `Data) : Ast.nestable_block_element Ast.cell) } +let data_cell_light == BAR?; data = located(nestable_block_element)+; { (data, `Data) } (* This produces a list of cells *) -let data_row_light := row = data_cell_light+; BAR?; NEWLINE; { (row : Ast.nestable_block_element Ast.row) } +let data_row_light := row = data_cell_light+; BAR?; NEWLINE; { (assert_row row) } let alignment_cell_light == BAR?; inner = Word; { inner } let align_row_light == BAR?; ~ = alignment_cell_light+; BAR?; NEWLINE; <> -let header_cell_light == BAR?; inner = located(Word)+; { inner } -let header_row_light := ~ = header_cell_light+; BAR?; NEWLINE; <> +let header_cell_light == BAR?; inner = located(Word)+; { paragraph_of_words inner } +let header_row_light := header_row = header_cell_light+; BAR?; NEWLINE; { ( header_row : Ast.nestable_block_element Ast.row ) } let table_light := (* If the first row is the alignment row then the rest should be data *) @@ -279,21 +277,21 @@ let table_light := { match valid_align_row align with | Some _ as alignment -> (data_rows, alignment), `Light - | None -> (align :: data, None),`Light + | None -> raise @@ Failure "Invalid align row" (* Handle this later, if align is not an align row then its a row of single words which need to ne inserted into paragraphs *) + } - (* If there's only one row and it's not the align row, then it's data *) + (* Otherwise the first should be the headers, the second align, and the rest data *) | TABLE_LIGHT; header = header_row_light; align = align_row_light; data = data_row_light+; RIGHT_BRACE; - { let headers = ( paragraph_of_words header :> Ast.nestable_block_element Loc.with_location ), `Header in - (* Why wont this typecheck??? If I make data a single row it works but not if it's a grid, but a grid is what we want here :( *) + { let grid = valid_data_grid data in match valid_align_row align with - | Some _ as alignment -> (grid, alignment), `Light + | Some _ as alignment -> (header :: grid, alignment), `Light | None -> raise @@ Failure "Invalid align row" (* Handle this later, if align is not an align row then its a row of single words which need to ne inserted into paragraphs *) } - (* Otherwise the first should be the headers, the second align, and the rest data *) - | TABLE_LIGHT; data = data_row_light; RIGHT_BRACE; + (* If there's only one row and it's not the align row, then it's data *) + | TABLE_LIGHT; data = data_row_light+; RIGHT_BRACE; { (data, None), `Light } let table := From e59d416ddb68c3286f7d1cfaa474013b163ed83d Mon Sep 17 00:00:00 2001 From: faycarsons Date: Mon, 16 Sep 2024 14:28:26 -0400 Subject: [PATCH 034/150] Add recovery from invalid align rows, clean up, better naming --- src/parser/parser.mly | 35 ++++++++++++++++++----------------- 1 file changed, 18 insertions(+), 17 deletions(-) diff --git a/src/parser/parser.mly b/src/parser/parser.mly index bf83df85b9..70739b9bf0 100644 --- a/src/parser/parser.mly +++ b/src/parser/parser.mly @@ -100,7 +100,7 @@ end let rec valid_align_row ?(acc = []) = function - | cell :: rest -> + | Loc.{ value = cell; _ } :: rest -> begin match valid_align_cell cell with | Valid alignment -> @@ -114,11 +114,11 @@ let valid_data_grid (rows : Ast.nestable_block_element Ast.row list) = ( rows :> Ast.nestable_block_element Ast.grid ) (* Wrap a list of words in a paragraph, used for 'light' table headers *) - let paragraph_of_words : string Loc.with_location list -> Ast.nestable_block_element Ast.cell = fun words -> + let paragraph_of_words : string Loc.with_location list -> Ast.nestable_block_element Loc.with_location list = fun words -> let words = List.map (Loc.map (fun text -> `Word text)) words and span = Loc.span @@ List.map Loc.location words in - [Loc.at span ( `Paragraph words )], `Header - + [ Loc.at span ( `Paragraph words) ] + let recover_data = List.map (fun word -> paragraph_of_words [ word ], `Data) let assert_row : ( Ast.nestable_block_element Ast.with_location list * [ `Header | `Data ] ) list -> ( Ast.nestable_block_element Ast.with_location list * [ `Header | `Data ] ) list = Fun.id %} @@ -259,35 +259,36 @@ let table_heavy == TABLE_HEAVY; grid = row_heavy*; RIGHT_BRACE; { table -> abstract_table * Light | Heavy *) -(* This produces a single cell *) let data_cell_light == BAR?; data = located(nestable_block_element)+; { (data, `Data) } +let data_row_light := row = data_cell_light+; BAR?; NEWLINE; { row } -(* This produces a list of cells *) -let data_row_light := row = data_cell_light+; BAR?; NEWLINE; { (assert_row row) } +let align_cell == BAR?; inner = located(Word); { inner } +let align_row := ~ = align_cell+; BAR?; NEWLINE; <> -let alignment_cell_light == BAR?; inner = Word; { inner } -let align_row_light == BAR?; ~ = alignment_cell_light+; BAR?; NEWLINE; <> - -let header_cell_light == BAR?; inner = located(Word)+; { paragraph_of_words inner } -let header_row_light := header_row = header_cell_light+; BAR?; NEWLINE; { ( header_row : Ast.nestable_block_element Ast.row ) } +let header_cell_light == BAR?; inner = located(Word)+; { paragraph_of_words inner, `Header } +let header_row_light := header_row = header_cell_light+; BAR?; NEWLINE; { header_row } +(* NOTE: (@FayCarsons) Presently, behavior is to 'recover' when we have an invalid align row. Is this what we want? *) let table_light := (* If the first row is the alignment row then the rest should be data *) - | TABLE_LIGHT; align = align_row_light; data_rows = data_row_light+; RIGHT_BRACE; + | TABLE_LIGHT; align = align_row; data_rows = data_row_light+; RIGHT_BRACE; { match valid_align_row align with | Some _ as alignment -> (data_rows, alignment), `Light - | None -> raise @@ Failure "Invalid align row" (* Handle this later, if align is not an align row then its a row of single words which need to ne inserted into paragraphs *) - + | None -> + let align_as_data = recover_data align in + (align_as_data :: data_rows, None), `Light } (* Otherwise the first should be the headers, the second align, and the rest data *) - | TABLE_LIGHT; header = header_row_light; align = align_row_light; data = data_row_light+; RIGHT_BRACE; + | TABLE_LIGHT; header = header_row_light; align = align_row; data = data_row_light+; RIGHT_BRACE; { let grid = valid_data_grid data in match valid_align_row align with | Some _ as alignment -> (header :: grid, alignment), `Light - | None -> raise @@ Failure "Invalid align row" (* Handle this later, if align is not an align row then its a row of single words which need to ne inserted into paragraphs *) + | None -> + let align_as_data = recover_data align in + (header :: align_as_data :: data, None), `Light } (* If there's only one row and it's not the align row, then it's data *) From 1bc0add9894061c154fac101bbd15fb2569cc753 Mon Sep 17 00:00:00 2001 From: faycarsons Date: Tue, 17 Sep 2024 15:43:57 -0400 Subject: [PATCH 035/150] Refactor light table syntax --- src/parser/parser.mly | 90 ++++++++++++++++++++++--------------------- 1 file changed, 47 insertions(+), 43 deletions(-) diff --git a/src/parser/parser.mly b/src/parser/parser.mly index 70739b9bf0..475ed5a745 100644 --- a/src/parser/parser.mly +++ b/src/parser/parser.mly @@ -30,7 +30,7 @@ | Canonical _ -> "@canonical" | Inline | Open | Closed | Hidden -> "" - type unimplemented = Top_level_error | Table | Media + type unimplemented = Top_level_error | Media exception Debug of unimplemented Loc.with_location let _ = Printexc.register_printer (function | Debug unimplemented_token_with_location -> @@ -38,7 +38,6 @@ let Loc.{ location = _location; value = token } = unimplemented_token_with_location in let error_message = match token with | Top_level_error -> "Error in Parser.main rule" - | Table -> "table" | Media -> "media" in Option.some @@ Printf.sprintf "Parser failed on: %s" error_message @@ -72,53 +71,59 @@ | Hidden -> `Tag `Hidden | tag -> raise @@ No_children (wrap_location loc @@ Printf.sprintf "Tag %s expects children" (pp_tag tag)) - type align_cell = - | Valid of Ast.alignment option - | Invalid + type align_error = + | Invalid_align (* An invalid align cell *) + | Not_align (* Not an align cell *) (* This could be handled in the parser, I think *) let valid_align_cell text = begin match String.length text with - | 0 -> Valid None + | 0 -> Ok None | 1 -> begin match text.[0] with - | ':' -> Valid (Some `Center) - | '-' -> Valid None - | _ -> Invalid + | ':' -> Ok (Some `Center) + | '-' -> Ok None + | _ -> Error Not_align end | len -> if String.for_all (Char.equal '-') (String.sub text 1 (len - 2)) then match text.[0], text.[pred len] with - | ':', '-' -> Valid (Some `Left) - | '-', ':' -> Valid (Some `Right) - | ':', ':' -> Valid (Some `Center) - | '-', '-' -> Valid (None) - | _ -> Invalid - else Invalid + | ':', '-' -> Ok (Some `Left) + | '-', ':' -> Ok (Some `Right) + | ':', ':' -> Ok (Some `Center) + | '-', '-' -> Ok None + | _ -> Error Invalid_align + else Error Not_align end - let rec valid_align_row ?(acc = []) = function - | Loc.{ value = cell; _ } :: rest -> + (* Short-circuiting fold, if we get something that isn't a valid alignment then bail and return it *) + let rec valid_align_row ?(acc = []) : Ast.inline_element Loc.with_location list -> (Ast.alignment option list, align_error) result = function + | Loc.{ value = `Word cell; _ } :: rest -> begin match valid_align_cell cell with - | Valid alignment -> + | Ok alignment -> valid_align_row ~acc:(alignment :: acc) rest - | Invalid -> - None + | Error Not_align when List.length acc > 0 -> Error Invalid_align + | Error err -> Error err end + | Loc.{ value = `Space "\n"; _ } :: _ when List.length acc > 0 -> + Error Invalid_align + | Loc.{ value = `Space _; _ } :: rest -> + valid_align_row ~acc rest | [] -> - Some acc + Ok acc + | _ -> Error Not_align - let valid_data_grid (rows : Ast.nestable_block_element Ast.row list) = ( rows :> Ast.nestable_block_element Ast.grid ) - (* Wrap a list of words in a paragraph, used for 'light' table headers *) - let paragraph_of_words : string Loc.with_location list -> Ast.nestable_block_element Loc.with_location list = fun words -> - let words = List.map (Loc.map (fun text -> `Word text)) words - and span = Loc.span @@ List.map Loc.location words in - [ Loc.at span ( `Paragraph words) ] - let recover_data = List.map (fun word -> paragraph_of_words [ word ], `Data) + let to_paragraph : Ast.inline_element Loc.with_location list -> Ast.nestable_block_element Loc.with_location list = fun words -> + let span = Loc.span @@ List.map Loc.location words in + [ Loc.at span (`Paragraph words) ] + + let recover_data : Ast.inline_element Loc.with_location list -> Ast.nestable_block_element Ast.row + = List.map (fun word -> let loc = Loc.location word in [ Loc.at loc @@ `Paragraph [ word ] ], `Data) + let assert_row : ( Ast.nestable_block_element Ast.with_location list * [ `Header | `Data ] ) list -> ( Ast.nestable_block_element Ast.with_location list * [ `Header | `Data ] ) list = Fun.id %} @@ -259,40 +264,39 @@ let table_heavy == TABLE_HEAVY; grid = row_heavy*; RIGHT_BRACE; { table -> abstract_table * Light | Heavy *) -let data_cell_light == BAR?; data = located(nestable_block_element)+; { (data, `Data) } -let data_row_light := row = data_cell_light+; BAR?; NEWLINE; { row } +let cell_light == BAR?; data = located(inline_element)+; { (to_paragraph data, `Data) } +let row_light := ~ = cell_light+; BAR?; NEWLINE; <> -let align_cell == BAR?; inner = located(Word); { inner } +let align_cell == BAR?; inner = located(inline_element); { inner } let align_row := ~ = align_cell+; BAR?; NEWLINE; <> -let header_cell_light == BAR?; inner = located(Word)+; { paragraph_of_words inner, `Header } -let header_row_light := header_row = header_cell_light+; BAR?; NEWLINE; { header_row } - (* NOTE: (@FayCarsons) Presently, behavior is to 'recover' when we have an invalid align row. Is this what we want? *) let table_light := (* If the first row is the alignment row then the rest should be data *) - | TABLE_LIGHT; align = align_row; data_rows = data_row_light+; RIGHT_BRACE; + | TABLE_LIGHT; align = align_row; data = row_light+; RIGHT_BRACE; { match valid_align_row align with - | Some _ as alignment -> (data_rows, alignment), `Light - | None -> + | Ok alignment -> (data, Some alignment), `Light + | Error Invalid_align -> (data, None), `Light + | Error Not_align -> let align_as_data = recover_data align in - (align_as_data :: data_rows, None), `Light + (align_as_data :: data, None), `Light } (* Otherwise the first should be the headers, the second align, and the rest data *) - | TABLE_LIGHT; header = header_row_light; align = align_row; data = data_row_light+; RIGHT_BRACE; + | TABLE_LIGHT; header = row_light; align = align_row; data = row_light+; RIGHT_BRACE; { - let grid = valid_data_grid data in match valid_align_row align with - | Some _ as alignment -> (header :: grid, alignment), `Light - | None -> + | Ok alignment -> (header :: data, Some alignment), `Light + | Error Invalid_align -> + (header :: data, None), `Light + | Error Not_align -> let align_as_data = recover_data align in (header :: align_as_data :: data, None), `Light } (* If there's only one row and it's not the align row, then it's data *) - | TABLE_LIGHT; data = data_row_light+; RIGHT_BRACE; + | TABLE_LIGHT; data = row_light+; RIGHT_BRACE; { (data, None), `Light } let table := From 1718cca6d102e8ff859f00bdbd028bf05e8229f3 Mon Sep 17 00:00:00 2001 From: faycarsons Date: Wed, 18 Sep 2024 11:39:30 -0400 Subject: [PATCH 036/150] Remove inline opens in token declaration, add comment explaining alignment validation --- src/parser/parser.mly | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/src/parser/parser.mly b/src/parser/parser.mly index 475ed5a745..a1651871b5 100644 --- a/src/parser/parser.mly +++ b/src/parser/parser.mly @@ -1,5 +1,4 @@ %{ - [@@@warning "-32"] open Parser_types let point_of_position Lexing.{ pos_lnum; pos_cnum; _ } = @@ -37,8 +36,8 @@ begin let Loc.{ location = _location; value = token } = unimplemented_token_with_location in let error_message = match token with - | Top_level_error -> "Error in Parser.main rule" - | Media -> "media" + | Top_level_error -> "Error in Parser.main rule" + | Media -> "media" in Option.some @@ Printf.sprintf "Parser failed on: %s" error_message end @@ -98,7 +97,12 @@ else Error Not_align end - (* Short-circuiting fold, if we get something that isn't a valid alignment then bail and return it *) + (* NOTE: (@FayCarsons) + + This is a short-circuiting fold, if we get something that isn't a valid alignment then bail and return it. + When we get something that doesn't look like an align at all, we check to see if we've gotten + any valid aligns, if so we assume that the cell being considered is supposed to be an align and treat it as an error, + otherwise we assume the row is not supposed to be an align row *) let rec valid_align_row ?(acc = []) : Ast.inline_element Loc.with_location list -> (Ast.alignment option list, align_error) result = function | Loc.{ value = `Word cell; _ } :: rest -> begin @@ -168,8 +172,8 @@ %token Ref_with_replacement %token Simple_link %token Link_with_replacement -%token Media -%token Media_with_replacement +%token <(Parser_types.media * Parser_types.media_target)> Media +%token <(Parser_types.media * Parser_types.media_target * string)> Media_with_replacement %token Verbatim %token END From b07a20a9b18def5369e2e79f14bd464848158db8 Mon Sep 17 00:00:00 2001 From: faycarsons Date: Wed, 18 Sep 2024 13:03:43 -0400 Subject: [PATCH 037/150] Cleanup, add TODO comments --- src/parser/parser.mly | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/src/parser/parser.mly b/src/parser/parser.mly index a1651871b5..3f4e60b968 100644 --- a/src/parser/parser.mly +++ b/src/parser/parser.mly @@ -68,6 +68,7 @@ | Open -> `Tag `Open | Closed -> `Tag `Closed | Hidden -> `Tag `Hidden + (* TODO: replace this with real error handling. Don't throw exceptions *) | tag -> raise @@ No_children (wrap_location loc @@ Printf.sprintf "Tag %s expects children" (pp_tag tag)) type align_error = @@ -121,14 +122,16 @@ | _ -> Error Not_align (* Wrap a list of words in a paragraph, used for 'light' table headers *) - let to_paragraph : Ast.inline_element Loc.with_location list -> Ast.nestable_block_element Loc.with_location list = fun words -> - let span = Loc.span @@ List.map Loc.location words in - [ Loc.at span (`Paragraph words) ] + let to_paragraph : Ast.inline_element Loc.with_location list -> Ast.nestable_block_element Loc.with_location list + = fun words -> + let span = Loc.span @@ List.map Loc.location words in + [ Loc.at span (`Paragraph words) ] let recover_data : Ast.inline_element Loc.with_location list -> Ast.nestable_block_element Ast.row - = List.map (fun word -> let loc = Loc.location word in [ Loc.at loc @@ `Paragraph [ word ] ], `Data) - - let assert_row : ( Ast.nestable_block_element Ast.with_location list * [ `Header | `Data ] ) list -> ( Ast.nestable_block_element Ast.with_location list * [ `Header | `Data ] ) list = Fun.id + = List.map (fun word -> + let loc = Loc.location word in + [ Loc.at loc @@ `Paragraph [ word ] ], `Data + ) %} %token SPACE NEWLINE @@ -191,6 +194,7 @@ let main := | ~ = located(toplevel)+; END; <> | _ = whitespace; { [] } | END; { [] } + (* TODO: replace w/ real error handling *) | error; { raise @@ exn_location ~only_for_debugging:$loc ~failed_on:Top_level_error } let toplevel := @@ -316,6 +320,7 @@ let nestable_block_element := | ~ = located(Modules)+; RIGHT_BRACE; <`Modules> | ~ = list_element; <> | ~ = table; <> + (* TODO: replace with real error handling *) | _ = Media; { raise @@ exn_location ~only_for_debugging:$loc ~failed_on:Media } | ~ = Math_block; <`Math_block> From 981609e94af91647e608c3bae9233499bf914ca2 Mon Sep 17 00:00:00 2001 From: faycarsons Date: Wed, 18 Sep 2024 13:24:44 -0400 Subject: [PATCH 038/150] Further cleanup, add empty table producer --- src/parser/parser.mly | 45 +++++++++++++++++++------------------------ 1 file changed, 20 insertions(+), 25 deletions(-) diff --git a/src/parser/parser.mly b/src/parser/parser.mly index 3f4e60b968..690123356f 100644 --- a/src/parser/parser.mly +++ b/src/parser/parser.mly @@ -104,22 +104,23 @@ When we get something that doesn't look like an align at all, we check to see if we've gotten any valid aligns, if so we assume that the cell being considered is supposed to be an align and treat it as an error, otherwise we assume the row is not supposed to be an align row *) - let rec valid_align_row ?(acc = []) : Ast.inline_element Loc.with_location list -> (Ast.alignment option list, align_error) result = function - | Loc.{ value = `Word cell; _ } :: rest -> - begin - match valid_align_cell cell with - | Ok alignment -> - valid_align_row ~acc:(alignment :: acc) rest - | Error Not_align when List.length acc > 0 -> Error Invalid_align - | Error err -> Error err - end - | Loc.{ value = `Space "\n"; _ } :: _ when List.length acc > 0 -> - Error Invalid_align - | Loc.{ value = `Space _; _ } :: rest -> - valid_align_row ~acc rest - | [] -> - Ok acc - | _ -> Error Not_align + let rec valid_align_row ?(acc = []) : Ast.inline_element Loc.with_location list -> (Ast.alignment option list, align_error) result + = function + | Loc.{ value = `Word cell; _ } :: rest -> + begin + match valid_align_cell cell with + | Ok alignment -> + valid_align_row ~acc:(alignment :: acc) rest + | Error Not_align when List.length acc > 0 -> Error Invalid_align + | Error err -> Error err + end + | Loc.{ value = `Space "\n"; _ } :: _ when List.length acc > 0 -> + Error Invalid_align + | Loc.{ value = `Space _; _ } :: rest -> + valid_align_row ~acc rest + | [] -> + Ok acc + | _ -> Error Not_align (* Wrap a list of words in a paragraph, used for 'light' table headers *) let to_paragraph : Ast.inline_element Loc.with_location list -> Ast.nestable_block_element Loc.with_location list @@ -263,15 +264,6 @@ let table_heavy == TABLE_HEAVY; grid = row_heavy*; RIGHT_BRACE; { (abstract, `Heavy) } -(* - cell -> nestable_block_element with_location list * Header | Data - row -> cell list - grid -> row list - abstract_table -> grid * alignment option list option - - table -> abstract_table * Light | Heavy -*) - let cell_light == BAR?; data = located(inline_element)+; { (to_paragraph data, `Data) } let row_light := ~ = cell_light+; BAR?; NEWLINE; <> @@ -307,6 +299,9 @@ let table_light := | TABLE_LIGHT; data = row_light+; RIGHT_BRACE; { (data, None), `Light } + (* If there's nothing inside, return an empty table *) + | TABLE_LIGHT; SPACE*; RIGHT_BRACE; { ([[]], None), `Light } + let table := | ~ = table_heavy; <`Table> | ~ = table_light; <`Table> From d53cdf2cfeffaa234aaeda38290b9113d9c9c3fc Mon Sep 17 00:00:00 2001 From: faycarsons Date: Wed, 18 Sep 2024 14:01:44 -0400 Subject: [PATCH 039/150] Add simple media rule --- src/parser/lexer.mll | 2 +- src/parser/parser.mly | 24 ++++++++++++++++++++++++ 2 files changed, 25 insertions(+), 1 deletion(-) diff --git a/src/parser/lexer.mll b/src/parser/lexer.mll index 2dad5ae778..66f06b98df 100644 --- a/src/parser/lexer.mll +++ b/src/parser/lexer.mll @@ -208,7 +208,7 @@ let reference_token media start target input lexbuf = | "{{video:" -> Link target, Video | _ -> assert false in - let token_descr = "DUMMY_MEDIA_FOR_DEBUGGING" (* Parser_utils.describe (`Media_with_replacement_text (target, kind, "")) *) in + let token_descr = Parser_utils.describe (Media_with_replacement (target, kind, "")) in let content = media token_descr (Buffer.create 1024) 0 (Lexing.lexeme_start lexbuf) input lexbuf in Media_with_replacement (target, kind, content) diff --git a/src/parser/parser.mly b/src/parser/parser.mly index 690123356f..4c14c81088 100644 --- a/src/parser/parser.mly +++ b/src/parser/parser.mly @@ -133,6 +133,22 @@ let loc = Loc.location word in [ Loc.at loc @@ `Paragraph [ word ] ], `Data ) + + let media_kind_of_target = function + | Audio -> `Audio + | Video -> `Video + | Image -> `Image + + let href_of_media = function + | Reference name -> `Reference name + | Link uri -> `Link uri + + let split_simple_media Loc.{ location; value = (media, target) } = + (Loc.at location media, target) + + let split_replacement_media Loc.{ location; value = (media, target, content) } = + (Loc.at location media, target, content) + %} %token SPACE NEWLINE @@ -306,6 +322,14 @@ let table := | ~ = table_heavy; <`Table> | ~ = table_light; <`Table> +(* MEDIA *) + +let media := +| media = located(Media); inner = located(inline_element)*; RIGHT_BRACE; + { let (located_media_kind, media_href) = split_simple_media media in + let wrapped_located_kind = Loc.map href_of_media located_media_kind in + `Media (`Simple, located_media_kind, "", media_kind_of_target media_href) } + (* TOP-LEVEL ELEMENTS *) let nestable_block_element := From ca9ccc7ef4180311fb51e783f61da1da3833e390 Mon Sep 17 00:00:00 2001 From: faycarsons Date: Wed, 18 Sep 2024 15:38:37 -0400 Subject: [PATCH 040/150] fix debug error handling --- src/parser/parser.mly | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/src/parser/parser.mly b/src/parser/parser.mly index 4c14c81088..539668156e 100644 --- a/src/parser/parser.mly +++ b/src/parser/parser.mly @@ -29,7 +29,7 @@ | Canonical _ -> "@canonical" | Inline | Open | Closed | Hidden -> "" - type unimplemented = Top_level_error | Media + type unimplemented = Top_level_error exception Debug of unimplemented Loc.with_location let _ = Printexc.register_printer (function | Debug unimplemented_token_with_location -> @@ -37,7 +37,6 @@ let Loc.{ location = _location; value = token } = unimplemented_token_with_location in let error_message = match token with | Top_level_error -> "Error in Parser.main rule" - | Media -> "media" in Option.some @@ Printf.sprintf "Parser failed on: %s" error_message end @@ -325,10 +324,14 @@ let table := (* MEDIA *) let media := -| media = located(Media); inner = located(inline_element)*; RIGHT_BRACE; - { let (located_media_kind, media_href) = split_simple_media media in - let wrapped_located_kind = Loc.map href_of_media located_media_kind in - `Media (`Simple, located_media_kind, "", media_kind_of_target media_href) } + | media = located(Media); RIGHT_BRACE; + { let (located_media_kind, media_href) = split_simple_media media in + let wrapped_located_kind = Loc.map href_of_media located_media_kind in + `Media (`Simple, wrapped_located_kind, "", media_kind_of_target media_href) } + | media = located(Media_with_replacement); RIGHT_BRACE; + { let (located_media_kind, media_href, content) = split_replacement_media media in + let wrapped_located_kind = Loc.map href_of_media located_media_kind in + `Media (`With_text, wrapped_located_kind, content, media_kind_of_target media_href) } (* TOP-LEVEL ELEMENTS *) @@ -339,8 +342,7 @@ let nestable_block_element := | ~ = located(Modules)+; RIGHT_BRACE; <`Modules> | ~ = list_element; <> | ~ = table; <> - (* TODO: replace with real error handling *) - | _ = Media; { raise @@ exn_location ~only_for_debugging:$loc ~failed_on:Media } + | ~ = media; <> | ~ = Math_block; <`Math_block> let heading := From 4b33ea098d5577963439fb7a64e88ca7f0b22456 Mon Sep 17 00:00:00 2001 From: faycarsons Date: Thu, 19 Sep 2024 11:13:02 -0400 Subject: [PATCH 041/150] Add new token descriptions to parser utils --- src/parser/parser_utils.ml | 55 ++++++++++++++++++++++++++++++++++---- 1 file changed, 50 insertions(+), 5 deletions(-) diff --git a/src/parser/parser_utils.ml b/src/parser/parser_utils.ml index 589ae7aac0..c06f9a7fcd 100644 --- a/src/parser/parser_utils.ml +++ b/src/parser/parser_utils.ml @@ -1,6 +1,43 @@ open Parser -let[@warning "-8"] print : Parser.token -> string = function +let media_description ref_kind media_kind = + let open Parser_types in + let media_kind = match media_kind with + | Audio -> "audio" + | Video -> "video" + | Image -> "image" + in + let ref_kind = match ref_kind with + | Reference _ -> "!" + | Link _ -> ":" + in + ref_kind, media_kind + +let print : Parser.token -> string = function + | SPACE | Space _ -> "\t" + | NEWLINE | Single_newline _ -> "\n" + | Blank_line _ -> "\n\n" + | Simple_ref _ -> "{!" + | Ref_with_replacement _ -> "{{!" + | Simple_link _ -> "{:" + | Link_with_replacement _ -> "{{:" + | Modules _ -> "{!modules:" + | Media (ref_kind, media_kind) -> + let (ref_kind, media_kind) = media_description ref_kind media_kind in + Printf.sprintf "{%s%s" media_kind ref_kind + | Media_with_replacement (ref_kind, media_kind, _) -> + let (ref_kind, media_kind) = media_description ref_kind media_kind in + Printf.sprintf "{{%s%s" media_kind ref_kind + | Math_span _ -> + "{m" + | Math_block _ -> + "{math" + | Code_span _ -> "[" + | Code_block _ -> "{[" + | Word w -> w + | Verbatim _ -> "{v" + | RIGHT_CODE_DELIMITER -> "]}" + | RIGHT_BRACE -> "}" | Paragraph_style `Left -> "'{L'" | Paragraph_style `Center -> "'{C'" | Paragraph_style `Right -> "'{R'" @@ -9,8 +46,8 @@ let[@warning "-8"] print : Parser.token -> string = function | Style `Emphasis -> "'{e'" | Style `Superscript -> "'{^'" | Style `Subscript -> "'{_'" - | Ref_with_replacement _ -> "'{{!'" - | Link_with_replacement _ -> "'{{:'" + | List `Ordered -> "{ol" + | List `Unordered -> "{ul" | List_item `Li -> "'{li ...}'" | List_item `Dash -> "'{- ...}'" | TABLE_LIGHT -> "{t" @@ -40,11 +77,19 @@ let[@warning "-8"] print : Parser.token -> string = function | Tag Hidden -> "'@hidden" | Raw_markup (None, _) -> "'{%...%}'" | Raw_markup (Some target, _) -> "'{%" ^ target ^ ":...%}'" + | END -> "EOI" (* [`Minus] and [`Plus] are interpreted as if they start list items. Therefore, for error messages based on [Token.describe] to be accurate, formatted [`Minus] and [`Plus] should always be plausibly list item bullets. *) -let[@warning "-8"] describe : Parser.token -> string = function +let describe : Parser.token -> string = function + | Space _ -> "(horizontal space)" + | Media (ref_kind, media_kind) -> + let (ref_kind, media_kind) = media_description ref_kind media_kind in + Printf.sprintf "{%s%s" media_kind ref_kind + | Media_with_replacement (ref_kind, media_kind, _) -> + let (ref_kind, media_kind) = media_description ref_kind media_kind in + Printf.sprintf "{{%s%s" media_kind ref_kind | Word w -> Printf.sprintf "'%s'" w | Code_span _ -> "'[...]' (code)" | Raw_markup _ -> "'{%...%}' (raw markup)" @@ -65,7 +110,7 @@ let[@warning "-8"] describe : Parser.token -> string = function | Link_with_replacement _ -> "'{{:...} ...}' (external link)" | END -> "end of text" | SPACE -> "whitespace" - | Single_newline _ -> "line break" + | Single_newline _ | NEWLINE -> "line break" | Blank_line _ -> "blank line" | RIGHT_BRACE -> "'}'" | RIGHT_CODE_DELIMITER -> "']}'" From 7a512f269d103e03d10dc094f08f3e79a64b1439 Mon Sep 17 00:00:00 2001 From: faycarsons Date: Thu, 19 Sep 2024 11:27:47 -0400 Subject: [PATCH 042/150] fix simple media producer --- src/parser/parser.mly | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/parser/parser.mly b/src/parser/parser.mly index 539668156e..30b6506c85 100644 --- a/src/parser/parser.mly +++ b/src/parser/parser.mly @@ -324,11 +324,11 @@ let table := (* MEDIA *) let media := - | media = located(Media); RIGHT_BRACE; + | media = located(Media); whitespace*; { let (located_media_kind, media_href) = split_simple_media media in let wrapped_located_kind = Loc.map href_of_media located_media_kind in `Media (`Simple, wrapped_located_kind, "", media_kind_of_target media_href) } - | media = located(Media_with_replacement); RIGHT_BRACE; + | media = located(Media_with_replacement); whitespace*; { let (located_media_kind, media_href, content) = split_replacement_media media in let wrapped_located_kind = Loc.map href_of_media located_media_kind in `Media (`With_text, wrapped_located_kind, content, media_kind_of_target media_href) } From d242ddaf9f8bad8d8a0e1699a00630b92622cd54 Mon Sep 17 00:00:00 2001 From: faycarsons Date: Thu, 19 Sep 2024 11:37:12 -0400 Subject: [PATCH 043/150] fix heavy list rule --- src/parser/parser.mly | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/parser/parser.mly b/src/parser/parser.mly index 30b6506c85..838ce264af 100644 --- a/src/parser/parser.mly +++ b/src/parser/parser.mly @@ -261,7 +261,7 @@ let list_light := (* `List_item` is [ `Li | `Dash ], not sure how that's useful though. Can't find '{li' syntax in Odoc docs *) let item_heavy == _ = List_item; ~ = located(nestable_block_element)*; RIGHT_BRACE; <> let list_heavy := - | list_kind = List; items = item_heavy*; RIGHT_BRACE; + | list_kind = List; whitespace*; items = item_heavy*; RIGHT_BRACE; { `List (list_kind, `Heavy, items) } let list_element := From 25254165b250647a09951bf5962e181def699cd9 Mon Sep 17 00:00:00 2001 From: faycarsons Date: Thu, 19 Sep 2024 13:51:16 -0400 Subject: [PATCH 044/150] Add whitespace to toplevel, closing brace to `Styled` --- src/parser/parser.mly | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/src/parser/parser.mly b/src/parser/parser.mly index 838ce264af..14e506bf97 100644 --- a/src/parser/parser.mly +++ b/src/parser/parser.mly @@ -208,15 +208,15 @@ let located(rule) == value = rule; { wrap_location $loc value } let main := | ~ = located(toplevel)+; END; <> - | _ = whitespace; { [] } + | _ = whitespace; END; { [] } | END; { [] } (* TODO: replace w/ real error handling *) | error; { raise @@ exn_location ~only_for_debugging:$loc ~failed_on:Top_level_error } let toplevel := - | ~ = tag; <> - | block = nestable_block_element; { (block :> Ast.block_element) } - | ~ = heading; <> + | ~ = tag; whitespace*; <> + | block = nestable_block_element; whitespace*; { (block :> Ast.block_element) } + | ~ = heading; whitespace*; <> let whitespace := | SPACE; { `Space " " } @@ -232,7 +232,7 @@ let inline_element := | ~ = Word; <`Word> | ~ = Code_span; <`Code_span> | ~ = Raw_markup; <`Raw_markup> - | style = Style; inner = located(inline_element)*; { `Styled (style, inner) } + | style = Style; inner = located(inline_element)*; RIGHT_BRACE; { `Styled (style, inner) } | ~ = Math_span; <`Math_span> | ~ = ref; <> | ~ = link; <> @@ -259,9 +259,9 @@ let list_light := { `List (`Ordered, `Light, [ ordered_items ]) } (* `List_item` is [ `Li | `Dash ], not sure how that's useful though. Can't find '{li' syntax in Odoc docs *) -let item_heavy == _ = List_item; ~ = located(nestable_block_element)*; RIGHT_BRACE; <> +let item_heavy == _ = List_item; whitespace*; ~ = located(nestable_block_element)*; whitespace*; RIGHT_BRACE; <> let list_heavy := - | list_kind = List; whitespace*; items = item_heavy*; RIGHT_BRACE; + | list_kind = List; whitespace*; items = item_heavy*; whitespace*; RIGHT_BRACE; { `List (list_kind, `Heavy, items) } let list_element := From a905a03d1b1f3fef9428927ab614cda864a4a425 Mon Sep 17 00:00:00 2001 From: faycarsons Date: Thu, 19 Sep 2024 13:57:40 -0400 Subject: [PATCH 045/150] Add whitespace handling to heavy list rule --- src/parser/parser.mly | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/parser/parser.mly b/src/parser/parser.mly index 14e506bf97..78fe92521e 100644 --- a/src/parser/parser.mly +++ b/src/parser/parser.mly @@ -259,7 +259,7 @@ let list_light := { `List (`Ordered, `Light, [ ordered_items ]) } (* `List_item` is [ `Li | `Dash ], not sure how that's useful though. Can't find '{li' syntax in Odoc docs *) -let item_heavy == _ = List_item; whitespace*; ~ = located(nestable_block_element)*; whitespace*; RIGHT_BRACE; <> +let item_heavy == _ = List_item; whitespace*; ~ = located(nestable_block_element)*; whitespace*; RIGHT_BRACE; whitespace*; <> let list_heavy := | list_kind = List; whitespace*; items = item_heavy*; whitespace*; RIGHT_BRACE; { `List (list_kind, `Heavy, items) } From f122c1642c3f7ae0b9e38f657953fdc4a5ffa036 Mon Sep 17 00:00:00 2001 From: faycarsons Date: Fri, 20 Sep 2024 15:08:57 -0400 Subject: [PATCH 046/150] Cleanup, use $sloc vs $loc in `parser.mly` --- src/parser/lexer.mll | 3 ++- src/parser/parser.mly | 6 +++--- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/src/parser/lexer.mll b/src/parser/lexer.mll index 66f06b98df..0683e946b6 100644 --- a/src/parser/lexer.mll +++ b/src/parser/lexer.mll @@ -237,7 +237,8 @@ let emit_verbatim lexbuf input start_offset buffer = |> trim_trailing_space_or_accept_whitespace |> trim_leading_space_or_accept_whitespace lexbuf input start_offset |> trim_leading_blank_lines - |> trim_trailing_blank_lines in + |> trim_trailing_blank_lines + in emit lexbuf input (Verbatim t) ~start_offset (* The locations have to be treated carefully in this function. We need to ensure that diff --git a/src/parser/parser.mly b/src/parser/parser.mly index 78fe92521e..8cdad8c687 100644 --- a/src/parser/parser.mly +++ b/src/parser/parser.mly @@ -1,8 +1,8 @@ %{ open Parser_types - let point_of_position Lexing.{ pos_lnum; pos_cnum; _ } = - Loc.{ line = pos_lnum; column = pos_cnum } +let point_of_position Lexing.{ pos_lnum; pos_cnum; pos_bol; _ } = + Loc.{ line = pos_lnum; column = pos_cnum - pos_bol } type lexspan = (Lexing.position * Lexing.position) let to_location : lexspan -> Loc.span = @@ -202,7 +202,7 @@ %% (* Utility which wraps the return value of a producer in `Loc.with_location` *) -let located(rule) == value = rule; { wrap_location $loc value } +let located(rule) == inner = rule; { wrap_location $sloc inner } (* ENTRY-POINT *) From fdc2ee641da629414dbfd770bed4f575164ce583 Mon Sep 17 00:00:00 2001 From: faycarsons Date: Mon, 23 Sep 2024 12:51:07 -0400 Subject: [PATCH 047/150] Rename parser utils and token printer modules --- src/parser/{parser_utils.ml => describe.ml} | 2 +- src/parser/lexer.mll | 34 +++++++++---------- src/parser/parser.mly | 16 ++++----- src/parser/{parser_types.ml => parser_aux.ml} | 0 4 files changed, 26 insertions(+), 26 deletions(-) rename src/parser/{parser_utils.ml => describe.ml} (99%) rename src/parser/{parser_types.ml => parser_aux.ml} (100%) diff --git a/src/parser/parser_utils.ml b/src/parser/describe.ml similarity index 99% rename from src/parser/parser_utils.ml rename to src/parser/describe.ml index c06f9a7fcd..408e7c5147 100644 --- a/src/parser/parser_utils.ml +++ b/src/parser/describe.ml @@ -1,7 +1,7 @@ open Parser let media_description ref_kind media_kind = - let open Parser_types in + let open Parser_aux in let media_kind = match media_kind with | Audio -> "audio" | Video -> "video" diff --git a/src/parser/lexer.mll b/src/parser/lexer.mll index 0683e946b6..766a493cc7 100644 --- a/src/parser/lexer.mll +++ b/src/parser/lexer.mll @@ -198,7 +198,7 @@ let reference_token media start target input lexbuf = | _ -> let target, kind = - let open Parser_types in + let open Parser_aux in match start with | "{{image!" -> Reference target, Image | "{{image:" -> Link target, Image @@ -208,7 +208,7 @@ let reference_token media start target input lexbuf = | "{{video:" -> Link target, Video | _ -> assert false in - let token_descr = Parser_utils.describe (Media_with_replacement (target, kind, "")) in + let token_descr = Describe.describe (Media_with_replacement (target, kind, "")) in let content = media token_descr (Buffer.create 1024) 0 (Lexing.lexeme_start lexbuf) input lexbuf in Media_with_replacement (target, kind, content) @@ -486,8 +486,8 @@ and token input = parse input ~start_offset:(Lexing.lexeme_end lexbuf) (Parse_error.not_allowed - ~what:(Parser_utils.describe END) - ~in_what:(Parser_utils.describe token)); + ~what:(Describe.describe END) + ~in_what:(Describe.describe token)); emit lexbuf input token } | "{ul" @@ -622,8 +622,8 @@ and token input = parse input ~start_offset:(Lexing.lexeme_end lexbuf) (Parse_error.not_allowed - ~what:(Parser_utils.describe END) - ~in_what:(Parser_utils.describe (Modules ""))); + ~what:(Describe.describe END) + ~in_what:(Describe.describe (Modules ""))); emit lexbuf input (Modules modules) } and code_span buffer nesting_level start_offset input = parse @@ -648,8 +648,8 @@ and code_span buffer nesting_level start_offset input = parse lexbuf input (Parse_error.not_allowed - ~what:(Parser_utils.describe (Blank_line "\n\n")) - ~in_what:(Parser_utils.describe (Code_span ""))); + ~what:(Describe.describe (Blank_line "\n\n")) + ~in_what:(Describe.describe (Code_span ""))); Buffer.add_char buffer ' '; code_span buffer nesting_level start_offset input lexbuf } | newline horizontal_space* @@ -661,8 +661,8 @@ and code_span buffer nesting_level start_offset input = parse lexbuf input (Parse_error.not_allowed - ~what:(Parser_utils.describe END) - ~in_what:(Parser_utils.describe (Code_span ""))); + ~what:(Describe.describe END) + ~in_what:(Describe.describe (Code_span ""))); emit lexbuf input (Code_span (Buffer.contents buffer)) ~start_offset } | _ as c @@ -692,8 +692,8 @@ and math kind buffer nesting_level start_offset input = parse lexbuf input (Parse_error.not_allowed - ~what:(Parser_utils.describe (Blank_line "\n")) - ~in_what:(Parser_utils.describe (math_constr kind ""))); + ~what:(Describe.describe (Blank_line "\n")) + ~in_what:(Describe.describe (math_constr kind ""))); Buffer.add_char buffer '\n'; math kind buffer nesting_level start_offset input lexbuf | Block -> @@ -705,8 +705,8 @@ and math kind buffer nesting_level start_offset input = parse lexbuf input (Parse_error.not_allowed - ~what:(Parser_utils.describe END) - ~in_what:(Parser_utils.describe (math_constr kind ""))); + ~what:(Describe.describe END) + ~in_what:(Describe.describe (math_constr kind ""))); emit lexbuf input (math_constr kind (Buffer.contents buffer)) ~start_offset } | _ as c { Buffer.add_char buffer c; @@ -732,7 +732,7 @@ and media tok_descr buffer nesting_level start_offset input = parse lexbuf input (Parse_error.not_allowed - ~what:(Parser_utils.describe END) + ~what:(Describe.describe END) ~in_what:tok_descr); Buffer.contents buffer} | (newline) @@ -759,8 +759,8 @@ and verbatim buffer last_false_terminator start_offset input = parse lexbuf input (Parse_error.not_allowed - ~what:(Parser_utils.describe END) - ~in_what:(Parser_utils.describe (Verbatim ""))) + ~what:(Describe.describe END) + ~in_what:(Describe.describe (Verbatim ""))) | Some location -> warning lexbuf diff --git a/src/parser/parser.mly b/src/parser/parser.mly index 8cdad8c687..a2c5e6006b 100644 --- a/src/parser/parser.mly +++ b/src/parser/parser.mly @@ -1,8 +1,8 @@ %{ - open Parser_types + open Parser_aux -let point_of_position Lexing.{ pos_lnum; pos_cnum; pos_bol; _ } = - Loc.{ line = pos_lnum; column = pos_cnum - pos_bol } + let point_of_position Lexing.{ pos_lnum; pos_cnum; pos_bol; _ } = + Loc.{ line = pos_lnum; column = pos_cnum - pos_bol } type lexspan = (Lexing.position * Lexing.position) let to_location : lexspan -> Loc.span = @@ -16,7 +16,7 @@ let point_of_position Lexing.{ pos_lnum; pos_cnum; pos_bol; _ } = let location = to_location loc in { location; value } - let pp_tag : Parser_types.tag -> string = function + let pp_tag : Parser_aux.tag -> string = function | Author _ -> "@author" | Deprecated -> "@deprecated" | Param _ -> "@param" @@ -49,7 +49,7 @@ let point_of_position Lexing.{ pos_lnum; pos_cnum; pos_bol; _ } = let tag : Ast.tag -> Ast.block_element = fun tag -> `Tag tag - let tag_with_element (children : Ast.nestable_block_element Loc.with_location list) : Parser_types.tag -> Ast.block_element = function + let tag_with_element (children : Ast.nestable_block_element Loc.with_location list) : Parser_aux.tag -> Ast.block_element = function | Before version -> tag @@ `Before (version, children) | Deprecated -> tag @@ `Deprecated children | Return -> tag @@ `Return children @@ -185,14 +185,14 @@ let point_of_position Lexing.{ pos_lnum; pos_cnum; pos_bol; _ } = %token Section_heading -%token Tag +%token Tag %token Simple_ref %token Ref_with_replacement %token Simple_link %token Link_with_replacement -%token <(Parser_types.media * Parser_types.media_target)> Media -%token <(Parser_types.media * Parser_types.media_target * string)> Media_with_replacement +%token <(Parser_aux.media * Parser_aux.media_target)> Media +%token <(Parser_aux.media * Parser_aux.media_target * string)> Media_with_replacement %token Verbatim %token END diff --git a/src/parser/parser_types.ml b/src/parser/parser_aux.ml similarity index 100% rename from src/parser/parser_types.ml rename to src/parser/parser_aux.ml From 117c78b5218bf133e6468816d816333886142111 Mon Sep 17 00:00:00 2001 From: faycarsons Date: Mon, 23 Sep 2024 12:53:08 -0400 Subject: [PATCH 048/150] Remove unused `Error` module --- src/parser/error.ml | 9 --------- 1 file changed, 9 deletions(-) delete mode 100644 src/parser/error.ml diff --git a/src/parser/error.ml b/src/parser/error.ml deleted file mode 100644 index 3200573d39..0000000000 --- a/src/parser/error.ml +++ /dev/null @@ -1,9 +0,0 @@ -type parser_error = - | Unclosed of - { opening : string - ; items : string - ; closing : string - } - | Expecting of string - -exception Parser_error of parser_error Loc.with_location From 366267d66fb17070c68d1e84af144786a0fdabea Mon Sep 17 00:00:00 2001 From: faycarsons Date: Mon, 23 Sep 2024 14:24:45 -0400 Subject: [PATCH 049/150] Fix `Ref` and `Simple_ref` rules --- src/parser/parser.mly | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/parser/parser.mly b/src/parser/parser.mly index a2c5e6006b..65d9349d29 100644 --- a/src/parser/parser.mly +++ b/src/parser/parser.mly @@ -238,10 +238,10 @@ let inline_element := | ~ = link; <> let ref := - | ref_body = located(Simple_ref); children = located(inline_element)+; RIGHT_BRACE; + | ref_body = located(Simple_ref); children = located(inline_element)*; { `Reference (`Simple, ref_body, children) } - | ref_body = located(Ref_with_replacement); children = located(inline_element)*; RIGHT_BRACE; + | ref_body = located(Ref_with_replacement); children = located(inline_element)*; { `Reference (`With_text, ref_body, children) } (* TODO : Fix the `with_replacement` producers in the following two rules, if they're broken. Ask what `with_replacement` refers to *) From 1e2907e277a997af02ac82263ab2eaa5962d4bb7 Mon Sep 17 00:00:00 2001 From: faycarsons Date: Tue, 24 Sep 2024 14:26:15 -0400 Subject: [PATCH 050/150] Fix modules rule --- src/parser/parser.mly | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/parser/parser.mly b/src/parser/parser.mly index 65d9349d29..153878921e 100644 --- a/src/parser/parser.mly +++ b/src/parser/parser.mly @@ -339,7 +339,7 @@ let nestable_block_element := | ~ = Verbatim; <`Verbatim> | ~ = located(inline_element)+; <`Paragraph> | ~ = Code_block; RIGHT_CODE_DELIMITER; <`Code_block> - | ~ = located(Modules)+; RIGHT_BRACE; <`Modules> + | ~ = located(Modules)+; <`Modules> | ~ = list_element; <> | ~ = table; <> | ~ = media; <> From d4698a1810a795fbca196bba80b7f2dea2af7370 Mon Sep 17 00:00:00 2001 From: faycarsons Date: Tue, 24 Sep 2024 14:54:37 -0400 Subject: [PATCH 051/150] Fix heavy table rule --- src/parser/parser.mly | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/parser/parser.mly b/src/parser/parser.mly index 153878921e..83df05bcd2 100644 --- a/src/parser/parser.mly +++ b/src/parser/parser.mly @@ -270,9 +270,9 @@ let list_element := (* TABLES *) -let cell_heavy := cell_kind = Table_cell; children = located(nestable_block_element)*; RIGHT_BRACE; { (children, cell_kind) } -let row_heavy == TABLE_ROW; cells = list(cell_heavy); RIGHT_BRACE; { cells } -let table_heavy == TABLE_HEAVY; grid = row_heavy*; RIGHT_BRACE; { + let cell_heavy := cell_kind = Table_cell; whitespace?; children = located(nestable_block_element)*; whitespace?; RIGHT_BRACE; whitespace?; { (children, cell_kind) } +let row_heavy == TABLE_ROW; whitespace?; cells = list(cell_heavy); RIGHT_BRACE; whitespace?; { cells } +let table_heavy == TABLE_HEAVY; whitespace?; grid = row_heavy*; RIGHT_BRACE; { (* Convert into an 'abstract table' which can be either a light or heavy syntax table. We know this is a heavy table, which cannot have alignment, however, so the alignment field is `None` *) let abstract : Ast.nestable_block_element Ast.abstract_table = (grid, None) in From 85167f59955e0dc5caf273835e3f58710c819bd7 Mon Sep 17 00:00:00 2001 From: faycarsons Date: Tue, 24 Sep 2024 16:05:16 -0400 Subject: [PATCH 052/150] fix code block rule --- src/parser/parser.mly | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/src/parser/parser.mly b/src/parser/parser.mly index 83df05bcd2..762ffd2902 100644 --- a/src/parser/parser.mly +++ b/src/parser/parser.mly @@ -227,6 +227,21 @@ let whitespace := (* INLINE ELEMENTS *) +(* TODO: enforce invariant: + +(* Convenient abbreviation for use in patterns. *) +type token_that_always_begins_an_inline_element = + [ `Word of string + | `Code_span of string + | `Raw_markup of string option * string + | `Begin_style of style + | `Simple_reference of string + | `Begin_reference_with_replacement_text of string + | `Simple_link of string + | `Begin_link_with_replacement_text of string + | `Math_span of string ] + +*) let inline_element := | ~ = whitespace; <> | ~ = Word; <`Word> @@ -338,7 +353,7 @@ let media := let nestable_block_element := | ~ = Verbatim; <`Verbatim> | ~ = located(inline_element)+; <`Paragraph> - | ~ = Code_block; RIGHT_CODE_DELIMITER; <`Code_block> + | ~ = Code_block; <`Code_block> | ~ = located(Modules)+; <`Modules> | ~ = list_element; <> | ~ = table; <> From eb0b0f47497652e1cff3f910347dd58c8559b434 Mon Sep 17 00:00:00 2001 From: faycarsons Date: Thu, 26 Sep 2024 09:53:53 -0400 Subject: [PATCH 053/150] Fix light table parsing --- src/parser/lexer.mll | 2 +- src/parser/parser.mly | 114 ++++++++++++++++++++++-------------------- 2 files changed, 60 insertions(+), 56 deletions(-) diff --git a/src/parser/lexer.mll b/src/parser/lexer.mll index 766a493cc7..d4c1675710 100644 --- a/src/parser/lexer.mll +++ b/src/parser/lexer.mll @@ -452,7 +452,7 @@ and token input = parse in let emit_truncated_code_block () = let empty_content = with_location_adjustments (fun _ -> Loc.at) lexbuf input "" in - emit ~start_offset lexbuf input (Code_block { meta = Some { language = lang_tag; tags = None }; delimiter = Some delimiter; content = empty_content; output = None}) + emit ~start_offset lexbuf input (Code_block { meta = Some { language = lang_tag; tags = None }; delimiter = Some delimiter; content = empty_content; output = None}) in match code_block_metadata_tail input lexbuf with | Ok metadata -> code_block start_offset (Lexing.lexeme_end lexbuf) (Some metadata) (Buffer.create 256) delimiter input lexbuf diff --git a/src/parser/parser.mly b/src/parser/parser.mly index 762ffd2902..c7d6ebe449 100644 --- a/src/parser/parser.mly +++ b/src/parser/parser.mly @@ -74,28 +74,42 @@ | Invalid_align (* An invalid align cell *) | Not_align (* Not an align cell *) - (* This could be handled in the parser, I think *) - let valid_align_cell text = - begin - match String.length text with - | 0 -> Ok None - | 1 -> - begin - match text.[0] with - | ':' -> Ok (Some `Center) - | '-' -> Ok None - | _ -> Error Not_align - end - | len -> - if String.for_all (Char.equal '-') (String.sub text 1 (len - 2)) then - match text.[0], text.[pred len] with - | ':', '-' -> Ok (Some `Left) - | '-', ':' -> Ok (Some `Right) - | ':', ':' -> Ok (Some `Center) - | '-', '-' -> Ok None - | _ -> Error Invalid_align - else Error Not_align - end + let valid_elements (cell : Ast.inline_element list): string option = + let rec go acc = function + | `Word _ :: _ when Option.is_some acc -> None + | `Word word :: rest -> + go (Some word) rest + | `Space _ :: rest -> + go acc rest + | _ :: _ -> None + | [] -> acc + in + go None cell + + let valid_word word = + match String.length word with + | 0 -> Ok None + | 1 -> + begin + match word.[0] with + | ':' -> Ok (Some `Center) + | '-' -> Ok None + | _ -> Error Not_align + end + | len -> + if String.for_all (Char.equal '-') (String.sub word 1 (len - 2)) then + match word.[0], word.[pred len] with + | ':', '-' -> Ok (Some `Left) + | '-', ':' -> Ok (Some `Right) + | ':', ':' -> Ok (Some `Center) + | '-', '-' -> Ok None + | _ -> Error Invalid_align + else Error Not_align + + let valid_align_cell cell = + match List.map Loc.value cell |> valid_elements with + | Some word -> valid_word word + | None -> Error Not_align (* NOTE: (@FayCarsons) @@ -103,35 +117,28 @@ When we get something that doesn't look like an align at all, we check to see if we've gotten any valid aligns, if so we assume that the cell being considered is supposed to be an align and treat it as an error, otherwise we assume the row is not supposed to be an align row *) - let rec valid_align_row ?(acc = []) : Ast.inline_element Loc.with_location list -> (Ast.alignment option list, align_error) result - = function - | Loc.{ value = `Word cell; _ } :: rest -> - begin - match valid_align_cell cell with - | Ok alignment -> - valid_align_row ~acc:(alignment :: acc) rest - | Error Not_align when List.length acc > 0 -> Error Invalid_align - | Error err -> Error err - end - | Loc.{ value = `Space "\n"; _ } :: _ when List.length acc > 0 -> - Error Invalid_align - | Loc.{ value = `Space _; _ } :: rest -> - valid_align_row ~acc rest - | [] -> - Ok acc + let valid_align_row (row : Ast.inline_element Loc.with_location list list): (Ast.alignment option list, align_error) result = + let (align, not_align) = List.map valid_align_cell row |> List.partition (function Ok _ | Error Invalid_align -> true | _ -> false) in + let total_len = List.length row in + let rec invert acc : (Ast.alignment option, align_error) result list -> (Ast.alignment option list, align_error) result = function + | Ok align :: rest -> invert (align :: acc) rest + | Error err :: _ -> Error err + | [] -> Ok (List.rev acc) + in + match not_align with + | [] -> invert [] align + | _ :: _ when List.length align > (total_len / 2) -> Error Invalid_align | _ -> Error Not_align + (* Wrap a list of words in a paragraph, used for 'light' table headers *) let to_paragraph : Ast.inline_element Loc.with_location list -> Ast.nestable_block_element Loc.with_location list = fun words -> let span = Loc.span @@ List.map Loc.location words in [ Loc.at span (`Paragraph words) ] - let recover_data : Ast.inline_element Loc.with_location list -> Ast.nestable_block_element Ast.row - = List.map (fun word -> - let loc = Loc.location word in - [ Loc.at loc @@ `Paragraph [ word ] ], `Data - ) + let to_data_row : Ast.inline_element Loc.with_location list list -> Ast.nestable_block_element Ast.row + = List.map (fun elts -> to_paragraph elts, `Data) let media_kind_of_target = function | Audio -> `Audio @@ -259,7 +266,6 @@ let ref := | ref_body = located(Ref_with_replacement); children = located(inline_element)*; { `Reference (`With_text, ref_body, children) } -(* TODO : Fix the `with_replacement` producers in the following two rules, if they're broken. Ask what `with_replacement` refers to *) let link := | link_body = Simple_link; children = located(inline_element)+; RIGHT_BRACE; { `Link (link_body, children) } | link_body = Link_with_replacement; children = located(inline_element)+; RIGHT_BRACE; { `Link (link_body, children) } @@ -285,7 +291,7 @@ let list_element := (* TABLES *) - let cell_heavy := cell_kind = Table_cell; whitespace?; children = located(nestable_block_element)*; whitespace?; RIGHT_BRACE; whitespace?; { (children, cell_kind) } +let cell_heavy := cell_kind = Table_cell; whitespace?; children = located(nestable_block_element)*; whitespace?; RIGHT_BRACE; whitespace?; { (children, cell_kind) } let row_heavy == TABLE_ROW; whitespace?; cells = list(cell_heavy); RIGHT_BRACE; whitespace?; { cells } let table_heavy == TABLE_HEAVY; whitespace?; grid = row_heavy*; RIGHT_BRACE; { (* Convert into an 'abstract table' which can be either a light or heavy syntax table. @@ -294,40 +300,38 @@ let table_heavy == TABLE_HEAVY; whitespace?; grid = row_heavy*; RIGHT_BRACE; { (abstract, `Heavy) } -let cell_light == BAR?; data = located(inline_element)+; { (to_paragraph data, `Data) } -let row_light := ~ = cell_light+; BAR?; NEWLINE; <> - -let align_cell == BAR?; inner = located(inline_element); { inner } -let align_row := ~ = align_cell+; BAR?; NEWLINE; <> +let cell_light == BAR?; ~ = located(inline_element)+; <> (* Ast.cell *) +let row_light := ~ = cell_light+; BAR?; NEWLINE; <> (* Ast.row *) (* NOTE: (@FayCarsons) Presently, behavior is to 'recover' when we have an invalid align row. Is this what we want? *) let table_light := (* If the first row is the alignment row then the rest should be data *) - | TABLE_LIGHT; align = align_row; data = row_light+; RIGHT_BRACE; + | TABLE_LIGHT; align = row_light; data = row_light+; RIGHT_BRACE; { match valid_align_row align with | Ok alignment -> (data, Some alignment), `Light | Error Invalid_align -> (data, None), `Light | Error Not_align -> - let align_as_data = recover_data align in + let align_as_data = to_data_row align in (align_as_data :: data, None), `Light } (* Otherwise the first should be the headers, the second align, and the rest data *) - | TABLE_LIGHT; header = row_light; align = align_row; data = row_light+; RIGHT_BRACE; + | TABLE_LIGHT; header = row_light; align = row_light; data = row_light+; RIGHT_BRACE; { + let data = List.map to_data_row data in match valid_align_row align with | Ok alignment -> (header :: data, Some alignment), `Light | Error Invalid_align -> (header :: data, None), `Light | Error Not_align -> - let align_as_data = recover_data align in + let align_as_data = recover align in (header :: align_as_data :: data, None), `Light } (* If there's only one row and it's not the align row, then it's data *) | TABLE_LIGHT; data = row_light+; RIGHT_BRACE; - { (data, None), `Light } + { (List.map to_data_row data, None), `Light } (* If there's nothing inside, return an empty table *) | TABLE_LIGHT; SPACE*; RIGHT_BRACE; { ([[]], None), `Light } From 54ea1381759a141ed23c639379e3cc99d9b4ca15 Mon Sep 17 00:00:00 2001 From: faycarsons Date: Mon, 30 Sep 2024 13:59:14 -0400 Subject: [PATCH 054/150] Fixed tagging and consolidation of elements in row processing --- src/parser/parser.mly | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/src/parser/parser.mly b/src/parser/parser.mly index c7d6ebe449..11a2b77100 100644 --- a/src/parser/parser.mly +++ b/src/parser/parser.mly @@ -137,8 +137,8 @@ let span = Loc.span @@ List.map Loc.location words in [ Loc.at span (`Paragraph words) ] - let to_data_row : Ast.inline_element Loc.with_location list list -> Ast.nestable_block_element Ast.row - = List.map (fun elts -> to_paragraph elts, `Data) + let tagged_row tag : Ast.inline_element Loc.with_location list list -> Ast.nestable_block_element Ast.row + = List.map (fun elts -> to_paragraph elts, tag) let media_kind_of_target = function | Audio -> `Audio @@ -312,14 +312,15 @@ let table_light := | Ok alignment -> (data, Some alignment), `Light | Error Invalid_align -> (data, None), `Light | Error Not_align -> - let align_as_data = to_data_row align in + let align_as_data = tagged_row `Data align in (align_as_data :: data, None), `Light } (* Otherwise the first should be the headers, the second align, and the rest data *) | TABLE_LIGHT; header = row_light; align = row_light; data = row_light+; RIGHT_BRACE; { - let data = List.map to_data_row data in + let data = List.map (tagged_row `Data) data + and header = tagged_row `Header header in match valid_align_row align with | Ok alignment -> (header :: data, Some alignment), `Light | Error Invalid_align -> @@ -331,7 +332,7 @@ let table_light := (* If there's only one row and it's not the align row, then it's data *) | TABLE_LIGHT; data = row_light+; RIGHT_BRACE; - { (List.map to_data_row data, None), `Light } + { (List.map (tagged_row `Data) data, None), `Light } (* If there's nothing inside, return an empty table *) | TABLE_LIGHT; SPACE*; RIGHT_BRACE; { ([[]], None), `Light } From f11faef4594ad51e7ae2a44fc0a01476403cfc47 Mon Sep 17 00:00:00 2001 From: faycarsons Date: Mon, 30 Sep 2024 14:08:33 -0400 Subject: [PATCH 055/150] Fix row processing --- src/parser/parser.mly | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/src/parser/parser.mly b/src/parser/parser.mly index 11a2b77100..000aff2453 100644 --- a/src/parser/parser.mly +++ b/src/parser/parser.mly @@ -137,9 +137,13 @@ let span = Loc.span @@ List.map Loc.location words in [ Loc.at span (`Paragraph words) ] - let tagged_row tag : Ast.inline_element Loc.with_location list list -> Ast.nestable_block_element Ast.row + (* Merges inline elements within a cell into a single paragraph element, and tags cells w/ tag *) + let merged_tagged_row tag : Ast.inline_element Loc.with_location list list -> Ast.nestable_block_element Ast.row = List.map (fun elts -> to_paragraph elts, tag) + let as_data = merged_tagged_row `Data + let as_header = merged_tagged_row `Header + let media_kind_of_target = function | Audio -> `Audio | Video -> `Video @@ -308,31 +312,32 @@ let table_light := (* If the first row is the alignment row then the rest should be data *) | TABLE_LIGHT; align = row_light; data = row_light+; RIGHT_BRACE; { + let data = List.map as_data data in match valid_align_row align with | Ok alignment -> (data, Some alignment), `Light | Error Invalid_align -> (data, None), `Light | Error Not_align -> - let align_as_data = tagged_row `Data align in + let align_as_data = as_data align in (align_as_data :: data, None), `Light } (* Otherwise the first should be the headers, the second align, and the rest data *) | TABLE_LIGHT; header = row_light; align = row_light; data = row_light+; RIGHT_BRACE; { - let data = List.map (tagged_row `Data) data - and header = tagged_row `Header header in + let data = List.map as_data data + and header = as_header header in match valid_align_row align with | Ok alignment -> (header :: data, Some alignment), `Light | Error Invalid_align -> (header :: data, None), `Light | Error Not_align -> - let align_as_data = recover align in + let align_as_data = as_data align in (header :: align_as_data :: data, None), `Light } (* If there's only one row and it's not the align row, then it's data *) | TABLE_LIGHT; data = row_light+; RIGHT_BRACE; - { (List.map (tagged_row `Data) data, None), `Light } + { (List.map as_data data, None), `Light } (* If there's nothing inside, return an empty table *) | TABLE_LIGHT; SPACE*; RIGHT_BRACE; { ([[]], None), `Light } From 68eff9e90a18f17cd81d169b176ac4c6253d94ae Mon Sep 17 00:00:00 2001 From: faycarsons Date: Tue, 15 Oct 2024 14:55:09 -0400 Subject: [PATCH 056/150] Cleanup, remove unused, remove comments --- src/parser/parser.mly | 145 ++++++++++++++++-------------------------- 1 file changed, 55 insertions(+), 90 deletions(-) diff --git a/src/parser/parser.mly b/src/parser/parser.mly index 000aff2453..73a85bb900 100644 --- a/src/parser/parser.mly +++ b/src/parser/parser.mly @@ -29,24 +29,8 @@ | Canonical _ -> "@canonical" | Inline | Open | Closed | Hidden -> "" - type unimplemented = Top_level_error - exception Debug of unimplemented Loc.with_location - let _ = Printexc.register_printer (function - | Debug unimplemented_token_with_location -> - begin - let Loc.{ location = _location; value = token } = unimplemented_token_with_location in - let error_message = match token with - | Top_level_error -> "Error in Parser.main rule" - in - Option.some @@ Printf.sprintf "Parser failed on: %s" error_message - end - | _ -> None - ) exception No_children of string Loc.with_location - let exn_location : only_for_debugging:lexspan -> failed_on:unimplemented -> exn = - fun ~only_for_debugging:loc ~failed_on -> Debug (wrap_location loc failed_on) - let tag : Ast.tag -> Ast.block_element = fun tag -> `Tag tag let tag_with_element (children : Ast.nestable_block_element Loc.with_location list) : Parser_aux.tag -> Ast.block_element = function @@ -67,13 +51,15 @@ | Open -> `Tag `Open | Closed -> `Tag `Closed | Hidden -> `Tag `Hidden - (* TODO: replace this with real error handling. Don't throw exceptions *) + (* NOTE: (@FayCarsons) This should be unreachable, remove after dev *) | tag -> raise @@ No_children (wrap_location loc @@ Printf.sprintf "Tag %s expects children" (pp_tag tag)) type align_error = | Invalid_align (* An invalid align cell *) | Not_align (* Not an align cell *) + (* This could be made a bit more specific by allowing Space elements only at + the beginning and end *) let valid_elements (cell : Ast.inline_element list): string option = let rec go acc = function | `Word _ :: _ when Option.is_some acc -> None @@ -90,48 +76,48 @@ match String.length word with | 0 -> Ok None | 1 -> - begin - match word.[0] with - | ':' -> Ok (Some `Center) - | '-' -> Ok None - | _ -> Error Not_align - end + begin + match word.[0] with + | ':' -> Ok (Some `Center) + | '-' -> Ok None + | _ -> Error Not_align + end | len -> - if String.for_all (Char.equal '-') (String.sub word 1 (len - 2)) then - match word.[0], word.[pred len] with - | ':', '-' -> Ok (Some `Left) - | '-', ':' -> Ok (Some `Right) - | ':', ':' -> Ok (Some `Center) - | '-', '-' -> Ok None - | _ -> Error Invalid_align - else Error Not_align + if String.for_all (Char.equal '-') (String.sub word 1 (len - 2)) then + match word.[0], word.[pred len] with + | ':', '-' -> Ok (Some `Left) + | '-', ':' -> Ok (Some `Right) + | ':', ':' -> Ok (Some `Center) + | '-', '-' -> Ok None + | _ -> Error Invalid_align + else Error Not_align let valid_align_cell cell = match List.map Loc.value cell |> valid_elements with | Some word -> valid_word word | None -> Error Not_align - (* NOTE: (@FayCarsons) + let sequence : ('elt, 'err) result list -> ('elt list, 'err) result = + fun list -> + let rec go acc : ('elt, 'err) result list -> ('elt list, 'err) result = + function + | Ok x :: xs -> go (x :: acc) xs + | Error err :: _ -> Error err + | [] -> Ok (List.rev acc) + in + go [] list - This is a short-circuiting fold, if we get something that isn't a valid alignment then bail and return it. + (* NOTE: (@FayCarsons) When we get something that doesn't look like an align at all, we check to see if we've gotten any valid aligns, if so we assume that the cell being considered is supposed to be an align and treat it as an error, otherwise we assume the row is not supposed to be an align row *) let valid_align_row (row : Ast.inline_element Loc.with_location list list): (Ast.alignment option list, align_error) result = let (align, not_align) = List.map valid_align_cell row |> List.partition (function Ok _ | Error Invalid_align -> true | _ -> false) in - let total_len = List.length row in - let rec invert acc : (Ast.alignment option, align_error) result list -> (Ast.alignment option list, align_error) result = function - | Ok align :: rest -> invert (align :: acc) rest - | Error err :: _ -> Error err - | [] -> Ok (List.rev acc) - in - match not_align with - | [] -> invert [] align - | _ :: _ when List.length align > (total_len / 2) -> Error Invalid_align + match align, not_align with + | _ :: _, _ :: _ -> Error Invalid_align + | _ :: _, [] -> sequence align | _ -> Error Not_align - - (* Wrap a list of words in a paragraph, used for 'light' table headers *) let to_paragraph : Ast.inline_element Loc.with_location list -> Ast.nestable_block_element Loc.with_location list = fun words -> let span = Loc.span @@ List.map Loc.location words in @@ -140,7 +126,6 @@ (* Merges inline elements within a cell into a single paragraph element, and tags cells w/ tag *) let merged_tagged_row tag : Ast.inline_element Loc.with_location list list -> Ast.nestable_block_element Ast.row = List.map (fun elts -> to_paragraph elts, tag) - let as_data = merged_tagged_row `Data let as_header = merged_tagged_row `Header @@ -202,8 +187,8 @@ %token Ref_with_replacement %token Simple_link %token Link_with_replacement -%token <(Parser_aux.media * Parser_aux.media_target)> Media -%token <(Parser_aux.media * Parser_aux.media_target * string)> Media_with_replacement +%token Media +%token Media_with_replacement %token Verbatim %token END @@ -221,8 +206,6 @@ let main := | ~ = located(toplevel)+; END; <> | _ = whitespace; END; { [] } | END; { [] } - (* TODO: replace w/ real error handling *) - | error; { raise @@ exn_location ~only_for_debugging:$loc ~failed_on:Top_level_error } let toplevel := | ~ = tag; whitespace*; <> @@ -233,26 +216,18 @@ let whitespace := | SPACE; { `Space " " } | NEWLINE; { `Space "\n" } | ~ = Space; <`Space> - | ~ = Blank_line; <`Space> | ~ = Single_newline; <`Space> +let heading := + | (num, title) = Section_heading; children = list(located(inline_element)); RIGHT_BRACE; + { `Heading (num, title, children) } + +let tag := + | inner_tag = Tag; children = located(nestable_block_element)+; { tag_with_element children inner_tag } + | inner_tag = Tag; { tag_bare $loc inner_tag } + (* INLINE ELEMENTS *) -(* TODO: enforce invariant: - -(* Convenient abbreviation for use in patterns. *) -type token_that_always_begins_an_inline_element = - [ `Word of string - | `Code_span of string - | `Raw_markup of string option * string - | `Begin_style of style - | `Simple_reference of string - | `Begin_reference_with_replacement_text of string - | `Simple_link of string - | `Begin_link_with_replacement_text of string - | `Math_span of string ] - -*) let inline_element := | ~ = whitespace; <> | ~ = Word; <`Word> @@ -267,7 +242,9 @@ let ref := | ref_body = located(Simple_ref); children = located(inline_element)*; { `Reference (`Simple, ref_body, children) } - | ref_body = located(Ref_with_replacement); children = located(inline_element)*; + | ref_body = located(Ref_with_replacement); + children = located(inline_element)*; + RIGHT_BRACE; { `Reference (`With_text, ref_body, children) } let link := @@ -283,7 +260,7 @@ let list_light := | PLUS; ordered_items = separated_list(NEWLINE; SPACE?; PLUS, located(nestable_block_element)); { `List (`Ordered, `Light, [ ordered_items ]) } -(* `List_item` is [ `Li | `Dash ], not sure how that's useful though. Can't find '{li' syntax in Odoc docs *) +(* NOTE: (@FayCarsons) `List_item` is [ `Li | `Dash ], not sure how that's useful though. Can't find '{li' syntax in Odoc docs *) let item_heavy == _ = List_item; whitespace*; ~ = located(nestable_block_element)*; whitespace*; RIGHT_BRACE; whitespace*; <> let list_heavy := | list_kind = List; whitespace*; items = item_heavy*; whitespace*; RIGHT_BRACE; @@ -297,28 +274,26 @@ let list_element := let cell_heavy := cell_kind = Table_cell; whitespace?; children = located(nestable_block_element)*; whitespace?; RIGHT_BRACE; whitespace?; { (children, cell_kind) } let row_heavy == TABLE_ROW; whitespace?; cells = list(cell_heavy); RIGHT_BRACE; whitespace?; { cells } -let table_heavy == TABLE_HEAVY; whitespace?; grid = row_heavy*; RIGHT_BRACE; { - (* Convert into an 'abstract table' which can be either a light or heavy syntax table. - We know this is a heavy table, which cannot have alignment, however, so the alignment field is `None` *) - let abstract : Ast.nestable_block_element Ast.abstract_table = (grid, None) in +let table_heavy == TABLE_HEAVY; whitespace?; grid = row_heavy*; RIGHT_BRACE; + { + let abstract = (grid, None) in (abstract, `Heavy) } let cell_light == BAR?; ~ = located(inline_element)+; <> (* Ast.cell *) let row_light := ~ = cell_light+; BAR?; NEWLINE; <> (* Ast.row *) -(* NOTE: (@FayCarsons) Presently, behavior is to 'recover' when we have an invalid align row. Is this what we want? *) let table_light := (* If the first row is the alignment row then the rest should be data *) | TABLE_LIGHT; align = row_light; data = row_light+; RIGHT_BRACE; { let data = List.map as_data data in match valid_align_row align with - | Ok alignment -> (data, Some alignment), `Light - | Error Invalid_align -> (data, None), `Light + | Ok alignment -> ((data, Some alignment), `Light) + | Error Invalid_align -> ((data, None), `Light) | Error Not_align -> - let align_as_data = as_data align in - (align_as_data :: data, None), `Light + let rows = as_data align :: data in + ((rows, None), `Light) } (* Otherwise the first should be the headers, the second align, and the rest data *) @@ -331,8 +306,8 @@ let table_light := | Error Invalid_align -> (header :: data, None), `Light | Error Not_align -> - let align_as_data = as_data align in - (header :: align_as_data :: data, None), `Light + let rows = header :: as_data align :: data in + (rows, None), `Light } (* If there's only one row and it's not the align row, then it's data *) @@ -340,7 +315,8 @@ let table_light := { (List.map as_data data, None), `Light } (* If there's nothing inside, return an empty table *) - | TABLE_LIGHT; SPACE*; RIGHT_BRACE; { ([[]], None), `Light } + | TABLE_LIGHT; SPACE*; RIGHT_BRACE; + { ([[]], None), `Light } let table := | ~ = table_heavy; <`Table> @@ -370,14 +346,3 @@ let nestable_block_element := | ~ = media; <> | ~ = Math_block; <`Math_block> -let heading := - | (num, title) = Section_heading; children = list(located(inline_element)); RIGHT_BRACE; { - `Heading (num, title, children) :> Ast.block_element - } - -let tag := - | inner_tag = Tag; children = located(nestable_block_element)+; { tag_with_element children inner_tag } - | inner_tag = Tag; { tag_bare $loc inner_tag } - -let style := ~ = Style; <> -let paragraph_style := ~ = Paragraph_style; <> From f3704c09d5e423d9e13e9c9b58d787307971e7f1 Mon Sep 17 00:00:00 2001 From: faycarsons Date: Thu, 17 Oct 2024 14:41:58 -0400 Subject: [PATCH 057/150] Enforce invariants in parser Enforce "no block elements after tags" invariant. Remove optional content for block elements which must have content. --- src/parser/parser.mly | 39 +++++++++++++++++++++------------------ 1 file changed, 21 insertions(+), 18 deletions(-) diff --git a/src/parser/parser.mly b/src/parser/parser.mly index 73a85bb900..80b73907fc 100644 --- a/src/parser/parser.mly +++ b/src/parser/parser.mly @@ -202,22 +202,29 @@ let located(rule) == inner = rule; { wrap_location $sloc inner } (* ENTRY-POINT *) +(* A comment can either contain block elements, block elements and then tags, or + just tags, but not tags and then block elements *) let main := - | ~ = located(toplevel)+; END; <> - | _ = whitespace; END; { [] } + | ~ = located(toplevel)+; whitespace*; END; <> + | elements = located(toplevel)+; tags = located(tag)+; whitespace*; END; + { elements @ tags } + | ~ = located(tag)+; whitespace*; END; <> + | whitespace; END; { [] } | END; { [] } let toplevel := - | ~ = tag; whitespace*; <> - | block = nestable_block_element; whitespace*; { (block :> Ast.block_element) } - | ~ = heading; whitespace*; <> + | block = nestable_block_element; { (block :> Ast.block_element) } + | ~ = heading; <> -let whitespace := +let horizontal_whitespace := | SPACE; { `Space " " } - | NEWLINE; { `Space "\n" } | ~ = Space; <`Space> | ~ = Single_newline; <`Space> +let whitespace := + | horizontal_whitespace; {} + | NEWLINE; { () } + let heading := | (num, title) = Section_heading; children = list(located(inline_element)); RIGHT_BRACE; { `Heading (num, title, children) } @@ -229,11 +236,11 @@ let tag := (* INLINE ELEMENTS *) let inline_element := - | ~ = whitespace; <> + | ~ = Space; <`Space> | ~ = Word; <`Word> | ~ = Code_span; <`Code_span> | ~ = Raw_markup; <`Raw_markup> - | style = Style; inner = located(inline_element)*; RIGHT_BRACE; { `Styled (style, inner) } + | style = Style; inner = located(inline_element)+; RIGHT_BRACE; { `Styled (style, inner) } | ~ = Math_span; <`Math_span> | ~ = ref; <> | ~ = link; <> @@ -243,7 +250,7 @@ let ref := { `Reference (`Simple, ref_body, children) } | ref_body = located(Ref_with_replacement); - children = located(inline_element)*; + children = located(inline_element)+; RIGHT_BRACE; { `Reference (`With_text, ref_body, children) } @@ -254,17 +261,17 @@ let link := (* LIST *) let list_light := - | MINUS; unordered_items = separated_list(NEWLINE; SPACE?; MINUS, located(nestable_block_element)); + | MINUS; unordered_items = separated_list(NEWLINE; MINUS, located(nestable_block_element)); { `List (`Unordered, `Light, [ unordered_items ]) } | PLUS; ordered_items = separated_list(NEWLINE; SPACE?; PLUS, located(nestable_block_element)); { `List (`Ordered, `Light, [ ordered_items ]) } (* NOTE: (@FayCarsons) `List_item` is [ `Li | `Dash ], not sure how that's useful though. Can't find '{li' syntax in Odoc docs *) -let item_heavy == _ = List_item; whitespace*; ~ = located(nestable_block_element)*; whitespace*; RIGHT_BRACE; whitespace*; <> +let item_heavy == _ = List_item; whitespace*; ~ = located(nestable_block_element)+; whitespace*; RIGHT_BRACE; whitespace*; <> let list_heavy := | list_kind = List; whitespace*; items = item_heavy*; whitespace*; RIGHT_BRACE; - { `List (list_kind, `Heavy, items) } + { `List (list_kind, `Heavy, items) } let list_element := | ~ = list_light; <> @@ -274,11 +281,7 @@ let list_element := let cell_heavy := cell_kind = Table_cell; whitespace?; children = located(nestable_block_element)*; whitespace?; RIGHT_BRACE; whitespace?; { (children, cell_kind) } let row_heavy == TABLE_ROW; whitespace?; cells = list(cell_heavy); RIGHT_BRACE; whitespace?; { cells } -let table_heavy == TABLE_HEAVY; whitespace?; grid = row_heavy*; RIGHT_BRACE; - { - let abstract = (grid, None) in - (abstract, `Heavy) - } +let table_heavy == TABLE_HEAVY; whitespace?; grid = row_heavy+; RIGHT_BRACE; { ((grid, None), `Heavy) } let cell_light == BAR?; ~ = located(inline_element)+; <> (* Ast.cell *) let row_light := ~ = cell_light+; BAR?; NEWLINE; <> (* Ast.row *) From f1488166159a943869dc7c7f4c1a23070235f341 Mon Sep 17 00:00:00 2001 From: faycarsons Date: Mon, 21 Oct 2024 13:45:25 -0400 Subject: [PATCH 058/150] Refactor tag parsing --- src/parser/describe.ml | 108 ++++++++++++++++++--------------------- src/parser/lexer.mll | 38 +++++++------- src/parser/parser.mly | 94 +++++++++++++++------------------- src/parser/parser_aux.ml | 38 +++++++++----- 4 files changed, 134 insertions(+), 144 deletions(-) diff --git a/src/parser/describe.ml b/src/parser/describe.ml index 408e7c5147..19e34b7a47 100644 --- a/src/parser/describe.ml +++ b/src/parser/describe.ml @@ -1,17 +1,15 @@ open Parser let media_description ref_kind media_kind = - let open Parser_aux in - let media_kind = match media_kind with + let open Parser_aux in + let media_kind = + match media_kind with | Audio -> "audio" | Video -> "video" - | Image -> "image" - in - let ref_kind = match ref_kind with - | Reference _ -> "!" - | Link _ -> ":" - in - ref_kind, media_kind + | Image -> "image" + in + let ref_kind = match ref_kind with Reference _ -> "!" | Link _ -> ":" in + (ref_kind, media_kind) let print : Parser.token -> string = function | SPACE | Space _ -> "\t" @@ -22,16 +20,14 @@ let print : Parser.token -> string = function | Simple_link _ -> "{:" | Link_with_replacement _ -> "{{:" | Modules _ -> "{!modules:" - | Media (ref_kind, media_kind) -> - let (ref_kind, media_kind) = media_description ref_kind media_kind in - Printf.sprintf "{%s%s" media_kind ref_kind - | Media_with_replacement (ref_kind, media_kind, _) -> - let (ref_kind, media_kind) = media_description ref_kind media_kind in - Printf.sprintf "{{%s%s" media_kind ref_kind - | Math_span _ -> - "{m" - | Math_block _ -> - "{math" + | Media (ref_kind, media_kind) -> + let ref_kind, media_kind = media_description ref_kind media_kind in + Printf.sprintf "{%s%s" media_kind ref_kind + | Media_with_replacement (ref_kind, media_kind, _) -> + let ref_kind, media_kind = media_description ref_kind media_kind in + Printf.sprintf "{{%s%s" media_kind ref_kind + | Math_span _ -> "{m" + | Math_block _ -> "{math" | Code_span _ -> "[" | Code_block _ -> "{[" | Word w -> w @@ -61,20 +57,20 @@ let print : Parser.token -> string = function | Section_heading (level, label) -> let label = match label with None -> "" | Some label -> ":" ^ label in Printf.sprintf "'{%i%s'" level label - | Tag (Author _) -> "'@author'" - | Tag Deprecated -> "'@deprecated'" - | Tag (Param _) -> "'@param'" - | Tag (Raise _) -> "'@raise'" - | Tag Return -> "'@return'" - | Tag (See _) -> "'@see'" - | Tag (Since _) -> "'@since'" - | Tag (Before _) -> "'@before'" - | Tag (Version _) -> "'@version'" - | Tag (Canonical _) -> "'@canonical'" - | Tag Inline -> "'@inline'" - | Tag Open -> "'@open'" - | Tag Closed -> "'@closed'" - | Tag Hidden -> "'@hidden" + | Author _ -> "'@author'" + | DEPRECATED -> "'@deprecated'" + | Param _ -> "'@param'" + | Raise _ -> "'@raise'" + | RETURN -> "'@return'" + | See _ -> "'@see'" + | Since _ -> "'@since'" + | Before _ -> "'@before'" + | Version _ -> "'@version'" + | Canonical _ -> "'@canonical'" + | INLINE -> "'@inline'" + | OPEN -> "'@open'" + | CLOSED -> "'@closed'" + | HIDDEN -> "'@hidden" | Raw_markup (None, _) -> "'{%...%}'" | Raw_markup (Some target, _) -> "'{%" ^ target ^ ":...%}'" | END -> "EOI" @@ -84,12 +80,12 @@ let print : Parser.token -> string = function [`Minus] and [`Plus] should always be plausibly list item bullets. *) let describe : Parser.token -> string = function | Space _ -> "(horizontal space)" - | Media (ref_kind, media_kind) -> - let (ref_kind, media_kind) = media_description ref_kind media_kind in - Printf.sprintf "{%s%s" media_kind ref_kind - | Media_with_replacement (ref_kind, media_kind, _) -> - let (ref_kind, media_kind) = media_description ref_kind media_kind in - Printf.sprintf "{{%s%s" media_kind ref_kind + | Media (ref_kind, media_kind) -> + let ref_kind, media_kind = media_description ref_kind media_kind in + Printf.sprintf "{%s%s" media_kind ref_kind + | Media_with_replacement (ref_kind, media_kind, _) -> + let ref_kind, media_kind = media_description ref_kind media_kind in + Printf.sprintf "{{%s%s" media_kind ref_kind | Word w -> Printf.sprintf "'%s'" w | Code_span _ -> "'[...]' (code)" | Raw_markup _ -> "'{%...%}' (raw markup)" @@ -104,8 +100,7 @@ let describe : Parser.token -> string = function | Math_span _ -> "'{m ...}' (math span)" | Math_block _ -> "'{math ...}' (math block)" | Simple_ref _ -> "'{!...}' (cross-reference)" - | Ref_with_replacement _ -> - "'{{!...} ...}' (cross-reference)" + | Ref_with_replacement _ -> "'{{!...} ...}' (cross-reference)" | Simple_link _ -> "'{:...} (external link)'" | Link_with_replacement _ -> "'{{:...} ...}' (external link)" | END -> "end of text" @@ -131,25 +126,24 @@ let describe : Parser.token -> string = function | BAR -> "'|'" | Section_heading (level, _) -> Printf.sprintf "'{%i ...}' (section heading)" level - | Tag (Author _) -> "'@author'" - | Tag Deprecated -> "'@deprecated'" - | Tag (Param _) -> "'@param'" - | Tag (Raise _) -> "'@raise'" - | Tag ( Return ) -> "'@return'" - | Tag (See _) -> "'@see'" - | Tag (Since _) -> "'@since'" - | Tag (Before _) -> "'@before'" - | Tag (Version _) -> "'@version'" - | Tag (Canonical _) -> "'@canonical'" - | Tag Inline -> "'@inline'" - | Tag Open -> "'@open'" - | Tag Closed -> "'@closed'" - | Tag Hidden -> "'@hidden" + | Author _ -> "'@author'" + | DEPRECATED -> "'@deprecated'" + | Param _ -> "'@param'" + | Raise _ -> "'@raise'" + | RETURN -> "'@return'" + | See _ -> "'@see'" + | Since _ -> "'@since'" + | Before _ -> "'@before'" + | Version _ -> "'@version'" + | Canonical _ -> "'@canonical'" + | INLINE -> "'@inline'" + | OPEN -> "'@open'" + | CLOSED -> "'@closed'" + | HIDDEN -> "'@hidden" (* NOTE : (@faycarsons) Should this be in Ast.ml? This takes an Ast.t no? *) let describe_element = function | `Reference (`Simple, _, _) -> describe (Simple_ref "") - | `Reference (`With_text, _, _) -> - describe (Ref_with_replacement "") + | `Reference (`With_text, _, _) -> describe (Ref_with_replacement "") | `Link _ -> describe (Link_with_replacement "") | `Heading (level, _, _) -> describe (Section_heading (level, None)) diff --git a/src/parser/lexer.mll b/src/parser/lexer.mll index d4c1675710..7562a4aa06 100644 --- a/src/parser/lexer.mll +++ b/src/parser/lexer.mll @@ -525,52 +525,52 @@ and token input = parse { emit lexbuf input (Section_heading (heading_level lexbuf input level, None)) } | "@author" ((horizontal_space+ [^ '\r' '\n']*)? as author) - { emit lexbuf input (Tag (Author author)) } + { emit lexbuf input (Author author) } | "@deprecated" - { emit lexbuf input (Tag Deprecated) } + { emit lexbuf input DEPRECATED } | "@param" horizontal_space+ ((_ # space_char)+ as name) - { emit lexbuf input (Tag (Param name)) } + { emit lexbuf input (Param name) } | ("@raise" | "@raises") horizontal_space+ ((_ # space_char)+ as name) - { emit lexbuf input (Tag (Raise name)) } + { emit lexbuf input (Raise name) } | ("@return" | "@returns") - { emit lexbuf input (Tag Return) } + { emit lexbuf input RETURN } | "@see" horizontal_space* '<' ([^ '>']* as url) '>' - { emit lexbuf input (Tag (See (`Url, url))) } + { emit lexbuf input (See (`Url, url)) } | "@see" horizontal_space* '\'' ([^ '\'']* as filename) '\'' - { emit lexbuf input (Tag (See (`File, filename))) } + { emit lexbuf input (See (`File, filename)) } | "@see" horizontal_space* '"' ([^ '"']* as name) '"' - { emit lexbuf input (Tag (See (`Document, name))) } + { emit lexbuf input (See (`Document, name)) } | "@since" ((horizontal_space+ [^ '\r' '\n']*)? as version) - { emit lexbuf input (Tag (Since version)) } + { emit lexbuf input (Since version) } | "@before" horizontal_space+ ((_ # space_char)+ as version) - { emit lexbuf input (Tag (Before version)) } + { emit lexbuf input (Before version) } | "@version" ((horizontal_space+ [^ '\r' '\n']*)? as version) - { emit lexbuf input (Tag (Version version)) } + { emit lexbuf input (Version version) } | "@canonical" ((horizontal_space+ [^ '\r' '\n']*)? as identifier) - { emit lexbuf input (Tag (Canonical identifier)) } + { emit lexbuf input (Canonical identifier) } | "@inline" - { emit lexbuf input (Tag Inline) } + { emit lexbuf input INLINE } | "@open" - { emit lexbuf input (Tag Open) } + { emit lexbuf input OPEN } | "@closed" - { emit lexbuf input (Tag Closed) } + { emit lexbuf input CLOSED } | "@hidden" - { emit lexbuf input (Tag Hidden) } + { emit lexbuf input HIDDEN } | "]}" { emit lexbuf input RIGHT_CODE_DELIMITER} @@ -590,15 +590,15 @@ and token input = parse | "@param" { warning lexbuf input Parse_error.truncated_param; - emit lexbuf input (Tag (Param "")) } + emit lexbuf input (Param "") } | ("@raise" | "@raises") as tag { warning lexbuf input (Parse_error.truncated_raise tag); - emit lexbuf input (Tag (Raise "")) } + emit lexbuf input (Raise "") } | "@before" { warning lexbuf input Parse_error.truncated_before; - emit lexbuf input (Tag (Before "")) } + emit lexbuf input (Before "") } | "@see" { warning lexbuf input Parse_error.truncated_see; diff --git a/src/parser/parser.mly b/src/parser/parser.mly index 80b73907fc..fa47424362 100644 --- a/src/parser/parser.mly +++ b/src/parser/parser.mly @@ -1,59 +1,10 @@ %{ open Parser_aux - let point_of_position Lexing.{ pos_lnum; pos_cnum; pos_bol; _ } = - Loc.{ line = pos_lnum; column = pos_cnum - pos_bol } - - type lexspan = (Lexing.position * Lexing.position) - let to_location : lexspan -> Loc.span = - fun (start, end_) -> - let open Loc in - let start_point = point_of_position start - and end_point = point_of_position end_ in - { file = start.pos_fname; start = start_point; end_ = end_point } - let wrap_location : lexspan -> 'a -> 'a Loc.with_location = fun loc value -> - let location = to_location loc in + let location = Parser_aux.to_location loc in { location; value } - let pp_tag : Parser_aux.tag -> string = function - | Author _ -> "@author" - | Deprecated -> "@deprecated" - | Param _ -> "@param" - | Raise _ -> "@raise/@raises" - | Return -> "@return" - | See _ -> "@see" - | Since _ -> "@since" - | Before _ -> "@before" - | Version _ -> "@version" - | Canonical _ -> "@canonical" - | Inline | Open | Closed | Hidden -> "" - - exception No_children of string Loc.with_location - - let tag : Ast.tag -> Ast.block_element = fun tag -> `Tag tag - - let tag_with_element (children : Ast.nestable_block_element Loc.with_location list) : Parser_aux.tag -> Ast.block_element = function - | Before version -> tag @@ `Before (version, children) - | Deprecated -> tag @@ `Deprecated children - | Return -> tag @@ `Return children - | Param param_name -> tag @@ `Param (param_name, children) - | Raise exn -> tag @@ `Raise (exn, children) - | See (kind, href) -> tag @@ `See (kind, href, children) - | _ -> assert false (* Unreachable *) - - let tag_bare loc = function - | Version version -> tag @@ `Version version - | Since version -> tag @@ `Since version - | Canonical implementation -> tag @@ `Canonical (wrap_location loc implementation) - | Author author -> tag @@ `Author author - | Inline -> `Tag `Inline - | Open -> `Tag `Open - | Closed -> `Tag `Closed - | Hidden -> `Tag `Hidden - (* NOTE: (@FayCarsons) This should be unreachable, remove after dev *) - | tag -> raise @@ No_children (wrap_location loc @@ Printf.sprintf "Tag %s expects children" (pp_tag tag)) - type align_error = | Invalid_align (* An invalid align cell *) | Not_align (* Not an align cell *) @@ -181,7 +132,18 @@ %token Section_heading -%token Tag +(* Tags *) +%token Author +%token DEPRECATED +%token Param +%token Raise +%token RETURN +%token <[ `Url | `File | `Document ] * string> See +%token Since +%token Before +%token Version +%token Canonical +%token INLINE OPEN CLOSED HIDDEN %token Simple_ref %token Ref_with_replacement @@ -229,9 +191,33 @@ let heading := | (num, title) = Section_heading; children = list(located(inline_element)); RIGHT_BRACE; { `Heading (num, title, children) } -let tag := - | inner_tag = Tag; children = located(nestable_block_element)+; { tag_with_element children inner_tag } - | inner_tag = Tag; { tag_bare $loc inner_tag } +let tag == + | ~ = tag_with_content; <`Tag> + | ~ = tag_bare; <`Tag> + +let tag_with_content := + | version = Before; children = located(nestable_block_element)+; + { `Before (version, children) } + | DEPRECATED; children = located(nestable_block_element)+; + { `Deprecated children } + | RETURN; children = located(nestable_block_element)+; + { `Return children } + | ident = Param; children = located(nestable_block_element)+; + { `Param (ident, children) } + | exn = Raise; children = located(nestable_block_element)+; + { `Raise (exn, children) } + | (kind, href) = See; children = located(nestable_block_element)+; + { `See (kind, href, children) } + +let tag_bare := + | version = Version; { `Version version } + | version = Since; { `Since version } + | impl = located(Canonical); { `Canonical impl } + | version = Author; { `Author version } + | OPEN; { `Open } + | INLINE; { `Inline } + | CLOSED; { `Closed } + | HIDDEN; { `Hidden } (* INLINE ELEMENTS *) diff --git a/src/parser/parser_aux.ml b/src/parser/parser_aux.ml index 6859b3dbbc..a0f00308d9 100644 --- a/src/parser/parser_aux.ml +++ b/src/parser/parser_aux.ml @@ -1,19 +1,29 @@ -type media = Reference of string | Link of string +type media = Reference of string | Link of string type media_target = Audio | Video | Image - -type tag = - Author of string +type tag = + | Author of string | Deprecated - | Param of string - | Raise of string + | Param of string + | Raise of string | Return | See of [ `Url | `File | `Document ] * string - | Since of string - | Before of string - | Version of string - | Canonical of string - | Inline - | Open - | Closed - | Hidden + | Since of string + | Before of string + | Version of string + | Canonical of string + | Inline + | Open + | Closed + | Hidden + +let point_of_position Lexing.{ pos_lnum; pos_cnum; _ } = + Loc.{ line = pos_lnum; column = pos_cnum } + +type lexspan = Lexing.position * Lexing.position +let to_location : lexspan -> Loc.span = + fun (start, end_) -> + let open Loc in + let start_point = point_of_position start + and end_point = point_of_position end_ in + { file = start.pos_fname; start = start_point; end_ = end_point } From 54de5e8a6d8551885e67f5c7a4c0fc594eab83f0 Mon Sep 17 00:00:00 2001 From: faycarsons Date: Fri, 15 Nov 2024 14:21:22 -0500 Subject: [PATCH 059/150] intermediate tree and unpack fn --- src/parser/intermediate.ml | 178 +++++++++++++++++++++++++++++++++++++ src/parser/odoc_parser.ml | 30 ++++--- src/parser/parser.mly | 17 ++-- 3 files changed, 207 insertions(+), 18 deletions(-) create mode 100644 src/parser/intermediate.ml diff --git a/src/parser/intermediate.ml b/src/parser/intermediate.ml new file mode 100644 index 0000000000..7a5f19bb1b --- /dev/null +++ b/src/parser/intermediate.ml @@ -0,0 +1,178 @@ +(** Abstract syntax tree representing ocamldoc comments *) + +type +'a warning = [ `Warning of 'a * Warning.t ] + +type inline_element = + [ `Space of string + | `Word of string + | `Code_span of string + | `Raw_markup of string option * string + | `Styled of Ast.style * inline_element Ast.with_location list + | `Reference of + Ast.reference_kind + * string Ast.with_location + * inline_element Ast.with_location list + | `Link of string * inline_element Ast.with_location list + | `Math_span of string (** @since 2.0.0 *) + | Ast.inline_element warning ] + +type 'a cell = 'a Ast.with_location list * [ `Header | `Data ] +type 'a row = 'a cell list +type 'a grid = 'a row list +type 'a abstract_table = 'a grid * Ast.alignment option list option + +and nestable_block_element = + [ `Paragraph of inline_element Ast.with_location list + | `Code_block of Ast.code_block + | `Verbatim of string + | `Modules of string Ast.with_location list + | `List of + Ast.list_kind + * Ast.list_syntax + * nestable_block_element Ast.with_location list list + | `Table of table + | `Math_block of string (** @since 2.0.0 *) + | `Media of + Ast.reference_kind * Ast.media_href Ast.with_location * string * Ast.media + | Ast.nestable_block_element warning ] + +and table = nestable_block_element abstract_table * [ `Light | `Heavy ] + +type internal_tag = + [ `Canonical of string Ast.with_location + | `Inline + | `Open + | `Closed + | `Hidden ] +(** Internal tags are used to exercise fine control over the output of odoc. They + are never rendered in the output *) + +type ocamldoc_tag = + [ `Author of string + | `Deprecated of nestable_block_element Ast.with_location list + | `Param of string * nestable_block_element Ast.with_location list + | `Raise of string * nestable_block_element Ast.with_location list + | `Return of nestable_block_element Ast.with_location list + | `See of + [ `Url | `File | `Document ] + * string + * nestable_block_element Ast.with_location list + | `Since of string + | `Before of string * nestable_block_element Ast.with_location list + | `Version of string ] + +type tag = [ ocamldoc_tag | internal_tag | Ast.tag warning ] +type heading = int * string option * inline_element Ast.with_location list + +type block_element = + [ nestable_block_element | `Heading of heading | `Tag of tag ] + +type t = block_element Ast.with_location list + +module Unpack = struct + let children f = + List.fold_left + (fun (elts, warnings) lelt -> + let elt, elt_warnings = f lelt.Loc.value in + ( Loc.{ value = elt; location = lelt.Loc.location } :: elts, + elt_warnings @ warnings )) + ([], []) + + let rec inline : inline_element -> Ast.inline_element * Warning.t list = + function + | `Warning (node, warning) -> (node, [ warning ]) + | `Code_span s -> (`Code_span s, []) + | `Link (s, child_elts) -> + let children, warnings = children inline child_elts in + (`Link (s, children), warnings) + | `Math_span s -> (`Math_span s, []) + | `Raw_markup (s1, s2) -> (`Raw_markup (s1, s2), []) + | `Reference (k, s, child_elts) -> + let children, warnings = children inline child_elts in + (`Reference (k, s, children), warnings) + | `Space s -> (`Space s, []) + | `Styled (s, child_elts) -> + let children, warnings = children inline child_elts in + (`Styled (s, children), warnings) + | `Word s -> (`Word s, []) + + let rec nestable_block : + nestable_block_element -> Ast.nestable_block_element * Warning.t list = + function + | `Warning (node, warning) -> (node, [ warning ]) + | `Paragraph child_elts -> + let child_elts, warnings = children inline child_elts in + (`Paragraph child_elts, warnings) + | `List (kind, syntax, child_elts) -> + let child_elts, warnings = + List.map (children nestable_block) child_elts |> List.split + in + (`List (kind, syntax, child_elts), List.flatten warnings) + | `Table _ -> assert false + | (`Code_block _ | `Verbatim _ | `Modules _ | `Math_block _ | `Media _) as + node -> + (node, []) + + and table : table -> Ast.table * Warning.t list = + fun ((grid, alignment), syntax) -> + let rec go row_acc warning_acc : + nestable_block_element grid -> + Ast.nestable_block_element grid * Warning.t list = function + | row :: rows -> + let row, warnings = + List.map + (fun (cell : nestable_block_element cell) -> + let cell, tag = cell in + let cell, warnings = children nestable_block cell in + ((cell, tag), warnings)) + row + |> List.split + in + go (row :: row_acc) (List.flatten warnings @ warning_acc) rows + | [] -> (row_acc, warning_acc) + in + let grid, warnings = go [] [] grid in + (((grid, alignment), syntax), warnings) + + let tag : tag -> Ast.tag * Warning.t list = function + | `Warning (node, warning) -> (node, [ warning ]) + | `Deprecated child_elts -> + let child_elts, warnings = children nestable_block child_elts in + (`Deprecated child_elts, warnings) + | ( `Param (name, child_elts) + | `Raise (name, child_elts) + | `Before (name, child_elts) ) as node -> ( + let child_elts, warnings = children nestable_block child_elts in + match node with + | `Param _ -> (`Param (name, child_elts), warnings) + | `Raise _ -> (`Raise (name, child_elts), warnings) + | `Before _ -> (`Before (name, child_elts), warnings)) + | `Return child_elts -> + let child_elts, warnings = children nestable_block child_elts in + (`Return child_elts, warnings) + | `See (ref_kind, ref, child_elts) -> + let child_elts, warnings = children nestable_block child_elts in + (`See (ref_kind, ref, child_elts), warnings) + | ( `Author _ | `Since _ | `Version _ | `Closed | `Open | `Hidden + | `Canonical _ | `Inline ) as t -> + (t, []) +end + +let unpack : t -> Ast.t * Warning.t list = + fun self -> + let open Loc in + let go (nodes, warnings) : + block_element with_location -> Ast.t * Warning.t list = function + | { value = `Heading (n, s, child_elts); _ } as loc -> + let child_elts, warnings = Unpack.children Unpack.inline child_elts in + let heading = `Heading (n, s, child_elts) in + ({ loc with value = heading } :: nodes, warnings) + | { value = `Tag tag; _ } as loc -> + let tag, tag_warnings = Unpack.tag tag in + ({ loc with value = `Tag tag } :: nodes, tag_warnings @ warnings) + | { value = #nestable_block_element as node; _ } as loc -> + let node, node_warnings = Unpack.nestable_block node in + let node = (node :> Ast.block_element) in + ({ loc with value = node } :: nodes, node_warnings @ warnings) + in + List.fold_left go ([], []) self diff --git a/src/parser/odoc_parser.ml b/src/parser/odoc_parser.ml index 6e11781082..5ae833d0aa 100644 --- a/src/parser/odoc_parser.ml +++ b/src/parser/odoc_parser.ml @@ -103,20 +103,30 @@ let position_of_point : t -> Loc.point -> Lexing.position = let parse_comment ~location ~text = let reversed_newlines = reversed_newlines ~input:text in let lexbuf = Lexing.from_string text in - (* We cannot directly pass parameters to Menhir without converting our parser + (* We cannot directly pass parameters to Menhir without converting our parser to a module functor. So we pass our current filename to the lexbuf here *) - lexbuf.lex_curr_p <- { lexbuf.lex_curr_p with pos_fname = Lexing.(location.pos_fname) }; - let lexer_state = - Lexer.{ warnings = [] - ; offset_to_location = offset_to_location ~reversed_newlines ~comment_location:location - ; file = Lexing.(location.pos_fname) } + lexbuf.lex_curr_p <- + { lexbuf.lex_curr_p with pos_fname = Lexing.(location.pos_fname) }; + let lexer_state = + Lexer. + { + warnings = []; + offset_to_location = + offset_to_location ~reversed_newlines ~comment_location:location; + file = Lexing.(location.pos_fname); + } in (* Remove the `Loc.with_location` wrapping our token because Menhir cannot handle that *) - let unwrapped_token lexbuf = - Lexer.token lexer_state lexbuf |> Loc.value + let unwrapped_token lexbuf = Lexer.token lexer_state lexbuf |> Loc.value in + let ast, parser_warnings = + Parser.main unwrapped_token lexbuf |> Intermediate.unpack in - let ast = Parser.main unwrapped_token lexbuf in - { ast; warnings = lexer_state.warnings; reversed_newlines; original_pos = location } + { + ast; + warnings = parser_warnings @ lexer_state.warnings; + reversed_newlines; + original_pos = location; + } (* Accessor functions, as [t] is opaque *) let warnings t = t.warnings diff --git a/src/parser/parser.mly b/src/parser/parser.mly index fa47424362..529e4b9db4 100644 --- a/src/parser/parser.mly +++ b/src/parser/parser.mly @@ -11,7 +11,7 @@ (* This could be made a bit more specific by allowing Space elements only at the beginning and end *) - let valid_elements (cell : Ast.inline_element list): string option = + let valid_elements (cell : Intermediate.inline_element list): string option = let rec go acc = function | `Word _ :: _ when Option.is_some acc -> None | `Word word :: rest -> @@ -62,20 +62,21 @@ When we get something that doesn't look like an align at all, we check to see if we've gotten any valid aligns, if so we assume that the cell being considered is supposed to be an align and treat it as an error, otherwise we assume the row is not supposed to be an align row *) - let valid_align_row (row : Ast.inline_element Loc.with_location list list): (Ast.alignment option list, align_error) result = + let valid_align_row (row : Intermediate.inline_element Loc.with_location list + list): (Ast.alignment option list, align_error) result = let (align, not_align) = List.map valid_align_cell row |> List.partition (function Ok _ | Error Invalid_align -> true | _ -> false) in match align, not_align with | _ :: _, _ :: _ -> Error Invalid_align | _ :: _, [] -> sequence align | _ -> Error Not_align - let to_paragraph : Ast.inline_element Loc.with_location list -> Ast.nestable_block_element Loc.with_location list + let to_paragraph : Intermediate.inline_element Loc.with_location list -> Intermediate.nestable_block_element Loc.with_location list = fun words -> let span = Loc.span @@ List.map Loc.location words in [ Loc.at span (`Paragraph words) ] (* Merges inline elements within a cell into a single paragraph element, and tags cells w/ tag *) - let merged_tagged_row tag : Ast.inline_element Loc.with_location list list -> Ast.nestable_block_element Ast.row + let merged_tagged_row tag : Intermediate.inline_element Loc.with_location list list -> Intermediate.nestable_block_element Intermediate.row = List.map (fun elts -> to_paragraph elts, tag) let as_data = merged_tagged_row `Data let as_header = merged_tagged_row `Header @@ -155,7 +156,7 @@ %token END -%start main +%start main %% @@ -175,7 +176,7 @@ let main := | END; { [] } let toplevel := - | block = nestable_block_element; { (block :> Ast.block_element) } + | block = nestable_block_element; { ( block :> Intermediate.block_element) } | ~ = heading; <> let horizontal_whitespace := @@ -269,8 +270,8 @@ let cell_heavy := cell_kind = Table_cell; whitespace?; children = located(nestab let row_heavy == TABLE_ROW; whitespace?; cells = list(cell_heavy); RIGHT_BRACE; whitespace?; { cells } let table_heavy == TABLE_HEAVY; whitespace?; grid = row_heavy+; RIGHT_BRACE; { ((grid, None), `Heavy) } -let cell_light == BAR?; ~ = located(inline_element)+; <> (* Ast.cell *) -let row_light := ~ = cell_light+; BAR?; NEWLINE; <> (* Ast.row *) +let cell_light == BAR?; ~ = located(inline_element)+; <> (* Intermediate.cell *) +let row_light := ~ = cell_light+; BAR?; NEWLINE; <> (* Intermediate.row *) let table_light := (* If the first row is the alignment row then the rest should be data *) From ab1dc0cbd0991732bd36197c1063b43c53a29699 Mon Sep 17 00:00:00 2001 From: faycarsons Date: Fri, 15 Nov 2024 16:17:47 -0500 Subject: [PATCH 060/150] Tokens working, warning being created in parser --- src/parser/dune | 2 +- src/parser/intermediate.ml | 25 +++--- src/parser/lexer.mll | 37 +++++---- src/parser/odoc_parser.ml | 3 +- src/parser/parser.mly | 25 ++++-- src/parser/parser_aux.ml | 14 ++-- src/parser/token.ml | 106 -------------------------- src/parser/{describe.ml => tokens.ml} | 72 ++++++++++++++--- 8 files changed, 123 insertions(+), 161 deletions(-) delete mode 100644 src/parser/token.ml rename src/parser/{describe.ml => tokens.ml} (76%) diff --git a/src/parser/dune b/src/parser/dune index 95dcaf6c6e..06471b9775 100644 --- a/src/parser/dune +++ b/src/parser/dune @@ -2,7 +2,7 @@ (menhir (modules parser) - (flags --table --inspection --dump)) + (flags --table --external-tokens Tokens)) (library (name odoc_parser) diff --git a/src/parser/intermediate.ml b/src/parser/intermediate.ml index 7a5f19bb1b..29cbf04694 100644 --- a/src/parser/intermediate.ml +++ b/src/parser/intermediate.ml @@ -1,6 +1,7 @@ (** Abstract syntax tree representing ocamldoc comments *) -type +'a warning = [ `Warning of 'a * Warning.t ] +type partial_warning = filename:string -> Warning.t +type +'a warning = [ `Warning of 'a * partial_warning ] type inline_element = [ `Space of string @@ -78,7 +79,7 @@ module Unpack = struct elt_warnings @ warnings )) ([], []) - let rec inline : inline_element -> Ast.inline_element * Warning.t list = + let rec inline : inline_element -> Ast.inline_element * partial_warning list = function | `Warning (node, warning) -> (node, [ warning ]) | `Code_span s -> (`Code_span s, []) @@ -97,8 +98,8 @@ module Unpack = struct | `Word s -> (`Word s, []) let rec nestable_block : - nestable_block_element -> Ast.nestable_block_element * Warning.t list = - function + nestable_block_element -> + Ast.nestable_block_element * partial_warning list = function | `Warning (node, warning) -> (node, [ warning ]) | `Paragraph child_elts -> let child_elts, warnings = children inline child_elts in @@ -113,11 +114,11 @@ module Unpack = struct node -> (node, []) - and table : table -> Ast.table * Warning.t list = + and table : table -> Ast.table * partial_warning list = fun ((grid, alignment), syntax) -> let rec go row_acc warning_acc : nestable_block_element grid -> - Ast.nestable_block_element grid * Warning.t list = function + Ast.nestable_block_element grid * partial_warning list = function | row :: rows -> let row, warnings = List.map @@ -134,7 +135,7 @@ module Unpack = struct let grid, warnings = go [] [] grid in (((grid, alignment), syntax), warnings) - let tag : tag -> Ast.tag * Warning.t list = function + let tag : tag -> Ast.tag * partial_warning list = function | `Warning (node, warning) -> (node, [ warning ]) | `Deprecated child_elts -> let child_elts, warnings = children nestable_block child_elts in @@ -158,11 +159,11 @@ module Unpack = struct (t, []) end -let unpack : t -> Ast.t * Warning.t list = - fun self -> +let unpack : filename:string -> t -> Ast.t * Warning.t list = + fun ~filename self -> let open Loc in let go (nodes, warnings) : - block_element with_location -> Ast.t * Warning.t list = function + block_element with_location -> Ast.t * partial_warning list = function | { value = `Heading (n, s, child_elts); _ } as loc -> let child_elts, warnings = Unpack.children Unpack.inline child_elts in let heading = `Heading (n, s, child_elts) in @@ -175,4 +176,6 @@ let unpack : t -> Ast.t * Warning.t list = let node = (node :> Ast.block_element) in ({ loc with value = node } :: nodes, node_warnings @ warnings) in - List.fold_left go ([], []) self + let ast, warnings = List.fold_left go ([], []) self in + let warnings = List.map (fun mk_warning -> mk_warning ~filename) warnings in + (ast, warnings) diff --git a/src/parser/lexer.mll b/src/parser/lexer.mll index 7562a4aa06..46a365f8a4 100644 --- a/src/parser/lexer.mll +++ b/src/parser/lexer.mll @@ -1,6 +1,6 @@ { -open Parser +open Tokens let unescape_word : string -> string = fun s -> (* The common case is that there are no escape sequences. *) @@ -182,7 +182,7 @@ let warning = with_location_adjustments (fun input location error -> input.warnings <- (error location) :: input.warnings) -let reference_token media start target input lexbuf = +let reference_token media start ( target : string ) input lexbuf = match start with | "{!" -> Simple_ref target | "{{!" -> Ref_with_replacement target @@ -198,7 +198,6 @@ let reference_token media start target input lexbuf = | _ -> let target, kind = - let open Parser_aux in match start with | "{{image!" -> Reference target, Image | "{{image:" -> Link target, Image @@ -208,7 +207,7 @@ let reference_token media start target input lexbuf = | "{{video:" -> Link target, Video | _ -> assert false in - let token_descr = Describe.describe (Media_with_replacement (target, kind, "")) in + let token_descr = Tokens.describe (Media_with_replacement (target, kind, "")) in let content = media token_descr (Buffer.create 1024) 0 (Lexing.lexeme_start lexbuf) input lexbuf in Media_with_replacement (target, kind, content) @@ -486,8 +485,8 @@ and token input = parse input ~start_offset:(Lexing.lexeme_end lexbuf) (Parse_error.not_allowed - ~what:(Describe.describe END) - ~in_what:(Describe.describe token)); + ~what:(Tokens.describe END) + ~in_what:(Tokens.describe token)); emit lexbuf input token } | "{ul" @@ -622,8 +621,8 @@ and token input = parse input ~start_offset:(Lexing.lexeme_end lexbuf) (Parse_error.not_allowed - ~what:(Describe.describe END) - ~in_what:(Describe.describe (Modules ""))); + ~what:(Tokens.describe END) + ~in_what:(Tokens.describe (Modules ""))); emit lexbuf input (Modules modules) } and code_span buffer nesting_level start_offset input = parse @@ -648,8 +647,8 @@ and code_span buffer nesting_level start_offset input = parse lexbuf input (Parse_error.not_allowed - ~what:(Describe.describe (Blank_line "\n\n")) - ~in_what:(Describe.describe (Code_span ""))); + ~what:(Tokens.describe (Blank_line "\n\n")) + ~in_what:(Tokens.describe (Code_span ""))); Buffer.add_char buffer ' '; code_span buffer nesting_level start_offset input lexbuf } | newline horizontal_space* @@ -661,8 +660,8 @@ and code_span buffer nesting_level start_offset input = parse lexbuf input (Parse_error.not_allowed - ~what:(Describe.describe END) - ~in_what:(Describe.describe (Code_span ""))); + ~what:(Tokens.describe END) + ~in_what:(Tokens.describe (Code_span ""))); emit lexbuf input (Code_span (Buffer.contents buffer)) ~start_offset } | _ as c @@ -692,8 +691,8 @@ and math kind buffer nesting_level start_offset input = parse lexbuf input (Parse_error.not_allowed - ~what:(Describe.describe (Blank_line "\n")) - ~in_what:(Describe.describe (math_constr kind ""))); + ~what:(Tokens.describe (Blank_line "\n")) + ~in_what:(Tokens.describe (math_constr kind ""))); Buffer.add_char buffer '\n'; math kind buffer nesting_level start_offset input lexbuf | Block -> @@ -705,8 +704,8 @@ and math kind buffer nesting_level start_offset input = parse lexbuf input (Parse_error.not_allowed - ~what:(Describe.describe END) - ~in_what:(Describe.describe (math_constr kind ""))); + ~what:(Tokens.describe END) + ~in_what:(Tokens.describe (math_constr kind ""))); emit lexbuf input (math_constr kind (Buffer.contents buffer)) ~start_offset } | _ as c { Buffer.add_char buffer c; @@ -732,7 +731,7 @@ and media tok_descr buffer nesting_level start_offset input = parse lexbuf input (Parse_error.not_allowed - ~what:(Describe.describe END) + ~what:(Tokens.describe END) ~in_what:tok_descr); Buffer.contents buffer} | (newline) @@ -759,8 +758,8 @@ and verbatim buffer last_false_terminator start_offset input = parse lexbuf input (Parse_error.not_allowed - ~what:(Describe.describe END) - ~in_what:(Describe.describe (Verbatim ""))) + ~what:(Tokens.describe END) + ~in_what:(Tokens.describe (Verbatim ""))) | Some location -> warning lexbuf diff --git a/src/parser/odoc_parser.ml b/src/parser/odoc_parser.ml index 5ae833d0aa..5ec70787a5 100644 --- a/src/parser/odoc_parser.ml +++ b/src/parser/odoc_parser.ml @@ -119,7 +119,8 @@ let parse_comment ~location ~text = (* Remove the `Loc.with_location` wrapping our token because Menhir cannot handle that *) let unwrapped_token lexbuf = Lexer.token lexer_state lexbuf |> Loc.value in let ast, parser_warnings = - Parser.main unwrapped_token lexbuf |> Intermediate.unpack + Parser.main unwrapped_token lexbuf + |> Intermediate.unpack ~filename:lexer_state.Lexer.file in { ast; diff --git a/src/parser/parser.mly b/src/parser/parser.mly index 529e4b9db4..3ccb79b34f 100644 --- a/src/parser/parser.mly +++ b/src/parser/parser.mly @@ -81,12 +81,16 @@ let as_data = merged_tagged_row `Data let as_header = merged_tagged_row `Header - let media_kind_of_target = function + let media_kind_of_target = + let open Tokens in + function | Audio -> `Audio | Video -> `Video | Image -> `Image - let href_of_media = function + let href_of_media = + let open Tokens in + function | Reference name -> `Reference name | Link uri -> `Link uri @@ -95,7 +99,6 @@ let split_replacement_media Loc.{ location; value = (media, target, content) } = (Loc.at location media, target, content) - %} %token SPACE NEWLINE @@ -150,12 +153,11 @@ %token Ref_with_replacement %token Simple_link %token Link_with_replacement -%token Media -%token Media_with_replacement +%token Media +%token Media_with_replacement %token Verbatim %token END - %start main %% @@ -228,6 +230,17 @@ let inline_element := | ~ = Code_span; <`Code_span> | ~ = Raw_markup; <`Raw_markup> | style = Style; inner = located(inline_element)+; RIGHT_BRACE; { `Styled (style, inner) } + | style = Style; RIGHT_BRACE; + { + let location = Parser_aux.to_location $sloc in + let node = `Styled (style, [Loc.at location (`Word "")]) in + let what = Tokens.describe @@ Style style in + let warning = fun ~filename -> + let span = Parser_aux.to_location ~filename $sloc in + Parse_error.should_not_be_empty ~what span + in + `Warning (node, warning) + } | ~ = Math_span; <`Math_span> | ~ = ref; <> | ~ = link; <> diff --git a/src/parser/parser_aux.ml b/src/parser/parser_aux.ml index a0f00308d9..1d3a3f721d 100644 --- a/src/parser/parser_aux.ml +++ b/src/parser/parser_aux.ml @@ -1,6 +1,3 @@ -type media = Reference of string | Link of string -type media_target = Audio | Video | Image - type tag = | Author of string | Deprecated @@ -21,9 +18,14 @@ let point_of_position Lexing.{ pos_lnum; pos_cnum; _ } = Loc.{ line = pos_lnum; column = pos_cnum } type lexspan = Lexing.position * Lexing.position -let to_location : lexspan -> Loc.span = - fun (start, end_) -> + +let to_location : ?filename:string -> lexspan -> Loc.span = + fun ?filename (start, end_) -> let open Loc in let start_point = point_of_position start and end_point = point_of_position end_ in - { file = start.pos_fname; start = start_point; end_ = end_point } + { + file = Option.value ~default:start.pos_fname filename; + start = start_point; + end_ = end_point; + } diff --git a/src/parser/token.ml b/src/parser/token.ml deleted file mode 100644 index ebc2846c99..0000000000 --- a/src/parser/token.ml +++ /dev/null @@ -1,106 +0,0 @@ -(* NOTE : (@faycarsons) keep as reference for the moment. - Should probably become utilities for Menhir-defined tokens *) - -(* This module contains the token type, emitted by the lexer, and consumed by - the comment syntax parser. It also contains two functions that format tokens - for error messages. *) - -type section_heading = [ `Begin_section_heading of int * string option ] -type style = [ `Bold | `Italic | `Emphasis | `Superscript | `Subscript ] -type paragraph_style = [ `Left | `Center | `Right ] - - -type tag = - [ `Tag of - [ `Author of string - | `Deprecated - | `Param of string - | `Raise of string - | `Return - | `See of [ `Url | `File | `Document ] * string - | `Since of string - | `Before of string - | `Version of string - | `Canonical of string - | `Inline - | `Open - | `Closed - | `Hidden ] ] - -type media = [ `Audio | `Video | `Image ] -type media_href = [ `Reference of string | `Link of string ] - -type media_markup = - [ `Simple_media of media_href * media - | `Media_with_replacement_text of media_href * media * string ] - -let s_of_media kind media = - match (kind, media) with - | `Simple, `Audio -> "{audio!" - | `Simple, `Video -> "{video!" - | `Simple, `Image -> "{image!" - | `Replaced, `Audio -> "{{audio!" - | `Replaced, `Video -> "{{video!" - | `Replaced, `Image -> "{{image!" - -type t = - [ (* End of input. *) - `End - | (* Runs of whitespace. [Blank_line] is any run of whitespace that contains two - or more newline characters. [Single_newline] is any run of whitespace that - contains exactly one newline character. [Space] is any run of whitespace - that contains no newline characters. - - It is an important invariant in the parser that no adjacent whitespace - tokens are emitted by the lexer. Otherwise, there would be the need for - unbounded lookahead, a (co-?)ambiguity between - [Single_newline Single_newline] and [Blank_line], and other problems. *) - `Space of - string - | `Single_newline of string - | `Blank_line of string - | (* A right curly brace ([}]), i.e. end of markup. *) - `Right_brace - | `Right_code_delimiter - | (* Words are anything that is not whitespace or markup. Markup symbols can be - be part of words if escaped. - - Words can contain plus and minus symbols, but those are emitted as [Plus] - and [Minus] tokens. The parser combines plus and minus into words, except - when they appear first on a line, in which case the tokens are list item - bullets. *) - `Word of - string - | `Code_span of string - | `Raw_markup of string option * string - | `Math_span of string - | `Math_block of string - | `Begin_style of style - | `Begin_paragraph_style of paragraph_style - | (* Other inline element markup. *) - `Simple_reference of string - | `Begin_reference_with_replacement_text of string - | `Simple_link of string - | `Begin_link_with_replacement_text of string - | media_markup - | (* Leaf block element markup. *) - `Code_block of - (string Loc.with_location * string Loc.with_location option) option - * string - * string Loc.with_location - * bool - | `Verbatim of string - | `Modules of string - | (* List markup. *) - `Begin_list of [ `Unordered | `Ordered ] - | `Begin_list_item of [ `Li | `Dash ] - | (* Table markup. *) - `Begin_table_light - | `Begin_table_heavy - | `Begin_table_row - | `Begin_table_cell of [ `Header | `Data ] - | `Minus - | `Plus - | `Bar - | section_heading - | tag ] diff --git a/src/parser/describe.ml b/src/parser/tokens.ml similarity index 76% rename from src/parser/describe.ml rename to src/parser/tokens.ml index 19e34b7a47..0339b732db 100644 --- a/src/parser/describe.ml +++ b/src/parser/tokens.ml @@ -1,7 +1,64 @@ -open Parser +type ref_kind = Simple | With_replacement + +type media = Reference of string | Link of string +type media_target = Audio | Video | Image + +type paragraph_style = [ `Left | `Center | `Right ] + +type style = [ `Bold | `Italic | `Emphasis | `Superscript | `Subscript ] +type table_cell_kind = [ `Header | `Data ] + +type token = + | SPACE + | Space of string + | NEWLINE + | Single_newline of string + | Blank_line of string + | Simple_ref of string + | Ref_with_replacement of string + | Simple_link of string + | Link_with_replacement of string + | Modules of string + | Media of (media * media_target) + | Media_with_replacement of (media * media_target * string) + | Math_span of string + | Math_block of string + | Code_span of string + | Code_block of Ast.code_block + | Word of string + | Verbatim of string + | RIGHT_CODE_DELIMITER + | RIGHT_BRACE + | Paragraph_style of paragraph_style + | Style of style + | List of Ast.list_kind + | List_item of Ast.list_item + | TABLE_LIGHT + | TABLE_HEAVY + | TABLE_ROW + | Table_cell of Ast.table_cell_kind + | MINUS + | PLUS + | BAR + | Section_heading of (int * string option) + | Author of string + | DEPRECATED + | Param of string + | Raise of string + | RETURN + | See of ([ `Url | `File | `Document ] * string) + | Since of string + | Before of string + | Version of string + | Canonical of string + | INLINE + | OPEN + | CLOSED + | HIDDEN + | Raw_markup of (string option * string) + | END let media_description ref_kind media_kind = - let open Parser_aux in let media_kind = match media_kind with | Audio -> "audio" @@ -11,7 +68,7 @@ let media_description ref_kind media_kind = let ref_kind = match ref_kind with Reference _ -> "!" | Link _ -> ":" in (ref_kind, media_kind) -let print : Parser.token -> string = function +let print : token -> string = function | SPACE | Space _ -> "\t" | NEWLINE | Single_newline _ -> "\n" | Blank_line _ -> "\n\n" @@ -78,7 +135,7 @@ let print : Parser.token -> string = function (* [`Minus] and [`Plus] are interpreted as if they start list items. Therefore, for error messages based on [Token.describe] to be accurate, formatted [`Minus] and [`Plus] should always be plausibly list item bullets. *) -let describe : Parser.token -> string = function +let describe : token -> string = function | Space _ -> "(horizontal space)" | Media (ref_kind, media_kind) -> let ref_kind, media_kind = media_description ref_kind media_kind in @@ -140,10 +197,3 @@ let describe : Parser.token -> string = function | OPEN -> "'@open'" | CLOSED -> "'@closed'" | HIDDEN -> "'@hidden" - -(* NOTE : (@faycarsons) Should this be in Ast.ml? This takes an Ast.t no? *) -let describe_element = function - | `Reference (`Simple, _, _) -> describe (Simple_ref "") - | `Reference (`With_text, _, _) -> describe (Ref_with_replacement "") - | `Link _ -> describe (Link_with_replacement "") - | `Heading (level, _, _) -> describe (Section_heading (level, None)) From 6595f2c854782b915fb6e9f650a6a06d423c60fe Mon Sep 17 00:00:00 2001 From: faycarsons Date: Mon, 25 Nov 2024 10:36:08 -0500 Subject: [PATCH 061/150] Squash merge tester-original into intermediate --- src/parser/intermediate.ml | 181 ----------- src/parser/lexer.mll | 5 +- src/parser/loc.ml | 6 + src/parser/loc.mli | 3 + src/parser/odoc_parser.ml | 25 +- src/parser/odoc_parser.mli | 10 + src/parser/parser.mly | 494 ++++++++++++++++++++----------- src/parser/test/test.ml | 2 + src/parser/test/test.mli | 1 + src/parser/test_driver/dune | 5 + src/parser/test_driver/tester.ml | 161 ++++++++++ src/parser/tokens.ml | 14 +- src/parser/writer.ml | 50 ++++ 13 files changed, 590 insertions(+), 367 deletions(-) delete mode 100644 src/parser/intermediate.ml create mode 100644 src/parser/test_driver/dune create mode 100644 src/parser/test_driver/tester.ml create mode 100644 src/parser/writer.ml diff --git a/src/parser/intermediate.ml b/src/parser/intermediate.ml deleted file mode 100644 index 29cbf04694..0000000000 --- a/src/parser/intermediate.ml +++ /dev/null @@ -1,181 +0,0 @@ -(** Abstract syntax tree representing ocamldoc comments *) - -type partial_warning = filename:string -> Warning.t -type +'a warning = [ `Warning of 'a * partial_warning ] - -type inline_element = - [ `Space of string - | `Word of string - | `Code_span of string - | `Raw_markup of string option * string - | `Styled of Ast.style * inline_element Ast.with_location list - | `Reference of - Ast.reference_kind - * string Ast.with_location - * inline_element Ast.with_location list - | `Link of string * inline_element Ast.with_location list - | `Math_span of string (** @since 2.0.0 *) - | Ast.inline_element warning ] - -type 'a cell = 'a Ast.with_location list * [ `Header | `Data ] -type 'a row = 'a cell list -type 'a grid = 'a row list -type 'a abstract_table = 'a grid * Ast.alignment option list option - -and nestable_block_element = - [ `Paragraph of inline_element Ast.with_location list - | `Code_block of Ast.code_block - | `Verbatim of string - | `Modules of string Ast.with_location list - | `List of - Ast.list_kind - * Ast.list_syntax - * nestable_block_element Ast.with_location list list - | `Table of table - | `Math_block of string (** @since 2.0.0 *) - | `Media of - Ast.reference_kind * Ast.media_href Ast.with_location * string * Ast.media - | Ast.nestable_block_element warning ] - -and table = nestable_block_element abstract_table * [ `Light | `Heavy ] - -type internal_tag = - [ `Canonical of string Ast.with_location - | `Inline - | `Open - | `Closed - | `Hidden ] -(** Internal tags are used to exercise fine control over the output of odoc. They - are never rendered in the output *) - -type ocamldoc_tag = - [ `Author of string - | `Deprecated of nestable_block_element Ast.with_location list - | `Param of string * nestable_block_element Ast.with_location list - | `Raise of string * nestable_block_element Ast.with_location list - | `Return of nestable_block_element Ast.with_location list - | `See of - [ `Url | `File | `Document ] - * string - * nestable_block_element Ast.with_location list - | `Since of string - | `Before of string * nestable_block_element Ast.with_location list - | `Version of string ] - -type tag = [ ocamldoc_tag | internal_tag | Ast.tag warning ] -type heading = int * string option * inline_element Ast.with_location list - -type block_element = - [ nestable_block_element | `Heading of heading | `Tag of tag ] - -type t = block_element Ast.with_location list - -module Unpack = struct - let children f = - List.fold_left - (fun (elts, warnings) lelt -> - let elt, elt_warnings = f lelt.Loc.value in - ( Loc.{ value = elt; location = lelt.Loc.location } :: elts, - elt_warnings @ warnings )) - ([], []) - - let rec inline : inline_element -> Ast.inline_element * partial_warning list = - function - | `Warning (node, warning) -> (node, [ warning ]) - | `Code_span s -> (`Code_span s, []) - | `Link (s, child_elts) -> - let children, warnings = children inline child_elts in - (`Link (s, children), warnings) - | `Math_span s -> (`Math_span s, []) - | `Raw_markup (s1, s2) -> (`Raw_markup (s1, s2), []) - | `Reference (k, s, child_elts) -> - let children, warnings = children inline child_elts in - (`Reference (k, s, children), warnings) - | `Space s -> (`Space s, []) - | `Styled (s, child_elts) -> - let children, warnings = children inline child_elts in - (`Styled (s, children), warnings) - | `Word s -> (`Word s, []) - - let rec nestable_block : - nestable_block_element -> - Ast.nestable_block_element * partial_warning list = function - | `Warning (node, warning) -> (node, [ warning ]) - | `Paragraph child_elts -> - let child_elts, warnings = children inline child_elts in - (`Paragraph child_elts, warnings) - | `List (kind, syntax, child_elts) -> - let child_elts, warnings = - List.map (children nestable_block) child_elts |> List.split - in - (`List (kind, syntax, child_elts), List.flatten warnings) - | `Table _ -> assert false - | (`Code_block _ | `Verbatim _ | `Modules _ | `Math_block _ | `Media _) as - node -> - (node, []) - - and table : table -> Ast.table * partial_warning list = - fun ((grid, alignment), syntax) -> - let rec go row_acc warning_acc : - nestable_block_element grid -> - Ast.nestable_block_element grid * partial_warning list = function - | row :: rows -> - let row, warnings = - List.map - (fun (cell : nestable_block_element cell) -> - let cell, tag = cell in - let cell, warnings = children nestable_block cell in - ((cell, tag), warnings)) - row - |> List.split - in - go (row :: row_acc) (List.flatten warnings @ warning_acc) rows - | [] -> (row_acc, warning_acc) - in - let grid, warnings = go [] [] grid in - (((grid, alignment), syntax), warnings) - - let tag : tag -> Ast.tag * partial_warning list = function - | `Warning (node, warning) -> (node, [ warning ]) - | `Deprecated child_elts -> - let child_elts, warnings = children nestable_block child_elts in - (`Deprecated child_elts, warnings) - | ( `Param (name, child_elts) - | `Raise (name, child_elts) - | `Before (name, child_elts) ) as node -> ( - let child_elts, warnings = children nestable_block child_elts in - match node with - | `Param _ -> (`Param (name, child_elts), warnings) - | `Raise _ -> (`Raise (name, child_elts), warnings) - | `Before _ -> (`Before (name, child_elts), warnings)) - | `Return child_elts -> - let child_elts, warnings = children nestable_block child_elts in - (`Return child_elts, warnings) - | `See (ref_kind, ref, child_elts) -> - let child_elts, warnings = children nestable_block child_elts in - (`See (ref_kind, ref, child_elts), warnings) - | ( `Author _ | `Since _ | `Version _ | `Closed | `Open | `Hidden - | `Canonical _ | `Inline ) as t -> - (t, []) -end - -let unpack : filename:string -> t -> Ast.t * Warning.t list = - fun ~filename self -> - let open Loc in - let go (nodes, warnings) : - block_element with_location -> Ast.t * partial_warning list = function - | { value = `Heading (n, s, child_elts); _ } as loc -> - let child_elts, warnings = Unpack.children Unpack.inline child_elts in - let heading = `Heading (n, s, child_elts) in - ({ loc with value = heading } :: nodes, warnings) - | { value = `Tag tag; _ } as loc -> - let tag, tag_warnings = Unpack.tag tag in - ({ loc with value = `Tag tag } :: nodes, tag_warnings @ warnings) - | { value = #nestable_block_element as node; _ } as loc -> - let node, node_warnings = Unpack.nestable_block node in - let node = (node :> Ast.block_element) in - ({ loc with value = node } :: nodes, node_warnings @ warnings) - in - let ast, warnings = List.fold_left go ([], []) self in - let warnings = List.map (fun mk_warning -> mk_warning ~filename) warnings in - (ast, warnings) diff --git a/src/parser/lexer.mll b/src/parser/lexer.mll index 46a365f8a4..f6b35297b2 100644 --- a/src/parser/lexer.mll +++ b/src/parser/lexer.mll @@ -144,6 +144,7 @@ type input = { mutable warnings : Warning.t list; } +(* TODO: Rewrite to add location inside tokens *) let with_location_adjustments k lexbuf input ?start_offset ?adjust_start_by ?end_offset ?adjust_end_by value = @@ -496,10 +497,10 @@ and token input = parse { emit lexbuf input (List `Ordered) } | "{li" - { emit lexbuf input (List_item `Li) } + { emit lexbuf input LI } | "{-" - { emit lexbuf input (List_item `Dash) } + { emit lexbuf input DASH } | "{table" { emit lexbuf input TABLE_HEAVY } diff --git a/src/parser/loc.ml b/src/parser/loc.ml index 46875a1f6f..892376eb02 100644 --- a/src/parser/loc.ml +++ b/src/parser/loc.ml @@ -26,6 +26,12 @@ let nudge_start offset span = let nudge_end offset span = { span with end_ = { span.end_ with column = span.end_.column - offset } } +let nudge_map_start offset loc = + { loc with location = nudge_start offset loc.location } + +let nudge_map_end offset loc = + { loc with location = nudge_end offset loc.location } + let spans_multiple_lines = function | { location = diff --git a/src/parser/loc.mli b/src/parser/loc.mli index ad8dedb77a..599f8d91a5 100644 --- a/src/parser/loc.mli +++ b/src/parser/loc.mli @@ -28,6 +28,9 @@ val nudge_end : int -> span -> span type +'a with_location = { location : span; value : 'a } (** Describes values located at a particular span *) +val nudge_map_start : int -> 'a with_location -> 'a with_location +val nudge_map_end : int -> 'a with_location -> 'a with_location + val at : span -> 'a -> 'a with_location (** Constructor for {!with_location} *) diff --git a/src/parser/odoc_parser.ml b/src/parser/odoc_parser.ml index 5ec70787a5..2a6699fcd2 100644 --- a/src/parser/odoc_parser.ml +++ b/src/parser/odoc_parser.ml @@ -79,6 +79,24 @@ let offset_to_location : in scan_to_last_newline reversed_newlines +module Tester = struct + include Parser + let is_EOI = function Tokens.END -> true | _ -> false + let pp_warning = Warning.to_string + let run = Writer.run + let token = + let dummy_loc = + Lexer. + { + warnings = []; + file = "f.ml"; + offset_to_location = Fun.const Loc.{ line = 1; column = 0 }; + } + in + Lexer.token dummy_loc + let string_of_token = Tokens.describe +end + (* Given a Loc.point and the result of [parse_comment], this function returns a valid Lexing.position *) let position_of_point : t -> Loc.point -> Lexing.position = @@ -118,13 +136,12 @@ let parse_comment ~location ~text = in (* Remove the `Loc.with_location` wrapping our token because Menhir cannot handle that *) let unwrapped_token lexbuf = Lexer.token lexer_state lexbuf |> Loc.value in - let ast, parser_warnings = - Parser.main unwrapped_token lexbuf - |> Intermediate.unpack ~filename:lexer_state.Lexer.file + let ast, warnings = + Writer.run ~filename:lexer_state.file @@ Parser.main unwrapped_token lexbuf in { ast; - warnings = parser_warnings @ lexer_state.warnings; + warnings = warnings @ lexer_state.warnings; reversed_newlines; original_pos = location; } diff --git a/src/parser/odoc_parser.mli b/src/parser/odoc_parser.mli index 5dbd4a0815..dc98f35618 100644 --- a/src/parser/odoc_parser.mli +++ b/src/parser/odoc_parser.mli @@ -19,6 +19,16 @@ val parse_comment : location:Lexing.position -> text:string -> t module Ast = Ast module Loc = Loc +module Tester : sig + type token + val is_EOI : token -> bool + val pp_warning : Warning.t -> string + val run : filename:string -> Ast.t Writer.t -> Ast.t * Warning.t list + val token : Lexing.lexbuf -> token Loc.with_location + val main : (Lexing.lexbuf -> token) -> Lexing.lexbuf -> Ast.t Writer.t + val string_of_token : token -> string +end + (** Warnings produced during parsing. *) module Warning : sig type t = Warning.t = { location : Loc.span; message : string } diff --git a/src/parser/parser.mly b/src/parser/parser.mly index 3ccb79b34f..fdb1d14329 100644 --- a/src/parser/parser.mly +++ b/src/parser/parser.mly @@ -1,104 +1,144 @@ %{ open Parser_aux - let wrap_location : lexspan -> 'a -> 'a Loc.with_location = fun loc value -> - let location = Parser_aux.to_location loc in + open Writer.Prelude + let ( let+ ) = Fun.flip Writer.map + let wrap_location : lexspan -> 'a -> 'a Loc.with_location = + fun loc value -> + let location = Parser_aux.to_location loc in { location; value } - type align_error = - | Invalid_align (* An invalid align cell *) - | Not_align (* Not an align cell *) + type align_error = + | Invalid_align (* An invalid align cell *) + | Not_align (* Not an align cell *) (* This could be made a bit more specific by allowing Space elements only at - the beginning and end *) - let valid_elements (cell : Intermediate.inline_element list): string option = + the beginning and end *) + let valid_elements (cell : Ast.inline_element list) : string option = let rec go acc = function - | `Word _ :: _ when Option.is_some acc -> None - | `Word word :: rest -> - go (Some word) rest - | `Space _ :: rest -> - go acc rest - | _ :: _ -> None + | `Word _ :: _ when Option.is_some acc -> None + | `Word word :: rest -> go (Some word) rest + | `Space _ :: rest -> go acc rest + | _ :: _ -> None | [] -> acc - in + in go None cell let valid_word word = match String.length word with | 0 -> Ok None - | 1 -> - begin + | 1 -> ( match word.[0] with | ':' -> Ok (Some `Center) | '-' -> Ok None - | _ -> Error Not_align - end - | len -> - if String.for_all (Char.equal '-') (String.sub word 1 (len - 2)) then - match word.[0], word.[pred len] with - | ':', '-' -> Ok (Some `Left) - | '-', ':' -> Ok (Some `Right) - | ':', ':' -> Ok (Some `Center) - | '-', '-' -> Ok None - | _ -> Error Invalid_align - else Error Not_align - - let valid_align_cell cell = + | _ -> Error Not_align) + | len -> + if String.for_all (Char.equal '-') (String.sub word 1 (len - 2)) then + match (word.[0], word.[pred len]) with + | ':', '-' -> Ok (Some `Left) + | '-', ':' -> Ok (Some `Right) + | ':', ':' -> Ok (Some `Center) + | '-', '-' -> Ok None + | _ -> Error Invalid_align + else Error Not_align + + let valid_align_cell (cell : Ast.inline_element Loc.with_location list) = match List.map Loc.value cell |> valid_elements with - | Some word -> valid_word word + | Some word -> valid_word word | None -> Error Not_align let sequence : ('elt, 'err) result list -> ('elt list, 'err) result = - fun list -> + fun list -> let rec go acc : ('elt, 'err) result list -> ('elt list, 'err) result = - function + function | Ok x :: xs -> go (x :: acc) xs | Error err :: _ -> Error err - | [] -> Ok (List.rev acc) - in + | [] -> Ok (List.rev acc) + in go [] list - (* NOTE: (@FayCarsons) - When we get something that doesn't look like an align at all, we check to see if we've gotten - any valid aligns, if so we assume that the cell being considered is supposed to be an align and treat it as an error, - otherwise we assume the row is not supposed to be an align row *) - let valid_align_row (row : Intermediate.inline_element Loc.with_location list - list): (Ast.alignment option list, align_error) result = - let (align, not_align) = List.map valid_align_cell row |> List.partition (function Ok _ | Error Invalid_align -> true | _ -> false) in - match align, not_align with + (* NOTE: (@FayCarsons) + When we get something that doesn't look like an align at all, we check to see if we've gotten + any valid aligns, if so we assume that the cell being considered is supposed to be an align and treat it as an error, + otherwise we assume the row is not supposed to be an align row *) + let valid_align_row (row : Ast.inline_element Loc.with_location list list) : + (Ast.alignment option list, align_error) result = + let align, not_align = + List.map valid_align_cell row + |> List.partition (function + | Ok _ | Error Invalid_align -> true + | _ -> false) + in + match (align, not_align) with | _ :: _, _ :: _ -> Error Invalid_align | _ :: _, [] -> sequence align | _ -> Error Not_align - let to_paragraph : Intermediate.inline_element Loc.with_location list -> Intermediate.nestable_block_element Loc.with_location list - = fun words -> - let span = Loc.span @@ List.map Loc.location words in - [ Loc.at span (`Paragraph words) ] + let to_paragraph : + Ast.inline_element Loc.with_location list -> + Ast.nestable_block_element Loc.with_location = + fun words -> + let span = Loc.span @@ List.map Loc.location words in + Loc.at span (`Paragraph words) (* Merges inline elements within a cell into a single paragraph element, and tags cells w/ tag *) - let merged_tagged_row tag : Intermediate.inline_element Loc.with_location list list -> Intermediate.nestable_block_element Intermediate.row - = List.map (fun elts -> to_paragraph elts, tag) + let merged_tagged_row tag : 'a Loc.with_location list list -> 'b = + List.map (fun elts -> ([ to_paragraph elts ], tag)) let as_data = merged_tagged_row `Data let as_header = merged_tagged_row `Header - let media_kind_of_target = - let open Tokens in - function - | Audio -> `Audio - | Video -> `Video - | Image -> `Image - - let href_of_media = - let open Tokens in - function - | Reference name -> `Reference name - | Link uri -> `Link uri - - let split_simple_media Loc.{ location; value = (media, target) } = + let mktable data header align = + match valid_align_row align with + | Ok alignment -> `Table ((header :: data, Some alignment), `Light) + | Error Invalid_align -> `Table ((header :: data, None), `Light) + | Error Not_align -> + let rows = header :: as_data align :: data in + `Table ((rows, None), `Light) + + let construct_table : + ?header:Ast.inline_element Loc.with_location list list Writer.t -> + ?align:Ast.inline_element Loc.with_location list list Writer.t -> + Ast.inline_element Loc.with_location list list list Writer.t -> + Ast.nestable_block_element Writer.t = + fun ?header ?align data -> + let* data = Writer.map (List.map as_data) data in + let header = Option.value ~default:(Writer.return []) header in + let* header = Writer.map as_header header in + match align with + | Some align -> + let* table = Writer.map (mktable data header) align in + Writer.return table + | None -> Writer.return @@ mktable data header [] + + let unclosed_table + ?(data : + Ast.inline_element Loc.with_location list list list Writer.t option) + warning : Ast.nestable_block_element Writer.t = + let node = + match data with + | Some data -> + Writer.map + (fun data -> `Table ((List.map as_data data, None), `Light)) + data + | None -> Writer.return @@ `Table (([], None), `Light) + in + Writer.warning warning node + + let media_kind_of_target = + let open Tokens in + function Audio -> `Audio | Video -> `Video | Image -> `Image + + let href_of_media = + let open Tokens in + function Reference name -> `Reference name | Link uri -> `Link uri + + let split_simple_media Loc.{ location; value = media, target } = (Loc.at location media, target) - let split_replacement_media Loc.{ location; value = (media, target, content) } = + let split_replacement_media Loc.{ location; value = media, target, content } = (Loc.at location media, target, content) + + let tag t = `Tag t %} %token SPACE NEWLINE @@ -127,7 +167,7 @@ %token Code_span %token List -%token List_item +%token LI DASH %token TABLE_LIGHT %token TABLE_HEAVY @@ -158,120 +198,219 @@ %token Verbatim %token END -%start main +%start main %% (* Utility which wraps the return value of a producer in `Loc.with_location` *) -let located(rule) == inner = rule; { wrap_location $sloc inner } +let locatedM(rule) == inner = rule; { Writer.map (wrap_location $loc) inner } +let located(rule) == | inner = rule; { wrap_location $sloc inner } + +let sequence(rule) == xs = rule*; { Writer.sequence xs } +let sequence_nonempty(rule) == xs = rule+; { Writer.sequence xs } +let separated_sequence_nonempty(separator, X) := + | x = X; { let* x = x in return [ x ] } + | x = X; separator; xs = separated_sequence_nonempty(separator, X); + { + let* x = x in + let* xs = xs in + return @@ x :: xs + } (* ENTRY-POINT *) (* A comment can either contain block elements, block elements and then tags, or just tags, but not tags and then block elements *) let main := - | ~ = located(toplevel)+; whitespace*; END; <> - | elements = located(toplevel)+; tags = located(tag)+; whitespace*; END; - { elements @ tags } - | ~ = located(tag)+; whitespace*; END; <> - | whitespace; END; { [] } - | END; { [] } + | nodes = sequence_nonempty(locatedM(toplevel)); whitespace?; END; { nodes } + | whitespace?; END; { Writer.return @@ [] } let toplevel := - | block = nestable_block_element; { ( block :> Intermediate.block_element) } + | block = nestable_block_element; { Writer.map (fun b -> (b :> Ast.block_element) ) block } + | ~ = tag; <> | ~ = heading; <> let horizontal_whitespace := | SPACE; { `Space " " } | ~ = Space; <`Space> - | ~ = Single_newline; <`Space> + +let newline := NEWLINE; {} | Single_newline; {} let whitespace := | horizontal_whitespace; {} - | NEWLINE; { () } + | newline; {} let heading := - | (num, title) = Section_heading; children = list(located(inline_element)); RIGHT_BRACE; - { `Heading (num, title, children) } + | (num, title) = Section_heading; children = sequence_nonempty(locatedM(inline_element)); RIGHT_BRACE; + { + let* children = children in + return @@ `Heading (num, title, children) + } let tag == - | ~ = tag_with_content; <`Tag> - | ~ = tag_bare; <`Tag> + | t = tag_with_content; { Writer.map tag t } + | t = tag_bare; { Writer.map tag t } let tag_with_content := - | version = Before; children = located(nestable_block_element)+; - { `Before (version, children) } - | DEPRECATED; children = located(nestable_block_element)+; - { `Deprecated children } - | RETURN; children = located(nestable_block_element)+; - { `Return children } - | ident = Param; children = located(nestable_block_element)+; - { `Param (ident, children) } - | exn = Raise; children = located(nestable_block_element)+; - { `Raise (exn, children) } - | (kind, href) = See; children = located(nestable_block_element)+; - { `See (kind, href, children) } + | version = Before; children = sequence_nonempty(locatedM(nestable_block_element)); + { Writer.map (fun c -> `Before (version, c)) children } + | DEPRECATED; children = sequence_nonempty(locatedM(nestable_block_element)); + { Writer.map (fun c -> `Deprecated c) children } + | RETURN; children = sequence_nonempty(locatedM(nestable_block_element)); + { Writer.map (fun c -> `Return c) children } + | ident = Param; children = sequence_nonempty(locatedM(nestable_block_element)); + { Writer.map (fun c -> `Param (ident, c)) children } + | exn = Raise; children = sequence_nonempty(locatedM(nestable_block_element)); + { Writer.map (fun c -> `Raise (exn, c)) children } + | (kind, href) = See; horizontal_whitespace?; children = sequence_nonempty(locatedM(nestable_block_element)); + { + let* children = children in + return @@ `See (kind, href, children) + } let tag_bare := - | version = Version; { `Version version } - | version = Since; { `Since version } - | impl = located(Canonical); { `Canonical impl } - | version = Author; { `Author version } - | OPEN; { `Open } - | INLINE; { `Inline } - | CLOSED; { `Closed } - | HIDDEN; { `Hidden } + | version = Version; { return @@ `Version version } + | version = Since; { return @@ `Since version } + | impl = located(Canonical); { return @@ `Canonical impl } + | version = Author; { return @@ `Author version } + | OPEN; { return `Open } + | INLINE; { return `Inline } + | CLOSED; { return `Closed } + | HIDDEN; { return `Hidden } (* INLINE ELEMENTS *) let inline_element := - | ~ = Space; <`Space> - | ~ = Word; <`Word> - | ~ = Code_span; <`Code_span> - | ~ = Raw_markup; <`Raw_markup> - | style = Style; inner = located(inline_element)+; RIGHT_BRACE; { `Styled (style, inner) } + | s = horizontal_whitespace; { return s } + | w = Word; { return @@ `Word w } + | c = Code_span; { return @@ `Code_span c } + | m = Raw_markup; { return @@ `Raw_markup m } + | style = Style; children = sequence_nonempty(locatedM(inline_element)); RIGHT_BRACE; + { Writer.map (fun c -> `Styled (style, c)) children } | style = Style; RIGHT_BRACE; { let location = Parser_aux.to_location $sloc in - let node = `Styled (style, [Loc.at location (`Word "")]) in + let node = `Styled (style, [ Loc.at location (`Word "") ]) in let what = Tokens.describe @@ Style style in let warning = fun ~filename -> let span = Parser_aux.to_location ~filename $sloc in Parse_error.should_not_be_empty ~what span in - `Warning (node, warning) + Writer.with_warning node warning } - | ~ = Math_span; <`Math_span> - | ~ = ref; <> + | m = Math_span; { return @@ `Math_span m } + | ~ = reference; <> | ~ = link; <> -let ref := - | ref_body = located(Simple_ref); children = located(inline_element)*; - { `Reference (`Simple, ref_body, children) } - - | ref_body = located(Ref_with_replacement); - children = located(inline_element)+; - RIGHT_BRACE; - { `Reference (`With_text, ref_body, children) } +let reference := + | ref_body = located(Simple_ref); children = sequence(locatedM(inline_element)); + { + let+ children = children in + let ref_body = Loc.nudge_map_start (String.length "{!") ref_body in + `Reference (`Simple, ref_body, children) + } + | ref_body = located(Ref_with_replacement); children = sequence_nonempty(locatedM(inline_element)); RIGHT_BRACE; + { + let+ children = children in + let ref_body = Loc.nudge_map_start (String.length "{{!") ref_body in + `Reference (`With_text, ref_body, children) + } + | ref_body = located(Ref_with_replacement); RIGHT_BRACE; + { + let span = + Parser_aux.to_location $sloc + |> Loc.nudge_start (String.length "{{!") + in + let node = `Reference (`With_text, ref_body, []) in + let what = Tokens.describe @@ Ref_with_replacement (Loc.value ref_body) in + let warning = fun ~filename:_f -> + Parse_error.should_not_be_empty ~what span + in + Writer.with_warning node warning + } -let link := - | link_body = Simple_link; children = located(inline_element)+; RIGHT_BRACE; { `Link (link_body, children) } - | link_body = Link_with_replacement; children = located(inline_element)+; RIGHT_BRACE; { `Link (link_body, children) } +let link := | link_body = Simple_link; RIGHT_BRACE; + { + let node = `Link (link_body, []) in + let url = String.trim link_body in + if "" = url then + let what = Tokens.describe @@ Simple_link link_body in + let span = Parser_aux.to_location $sloc in + let warning = fun ~filename:_f -> + Parse_error.should_not_be_empty ~what span + in + Writer.with_warning node warning + else + return node + } + | link_body = Link_with_replacement; children = sequence_nonempty(locatedM(inline_element)); RIGHT_BRACE; + { + let* c = children in + let node = `Link (link_body, c) in + if "" = link_body then + let what = Tokens.describe @@ Link_with_replacement link_body in + let span = Parser_aux.to_location $sloc in + let warning = fun ~filename:_f -> + Parse_error.should_not_be_empty ~what span + in + Writer.with_warning node warning + else + return node + } + | link_body = Link_with_replacement; whitespace?; RIGHT_BRACE; + { + let span = + Parser_aux.to_location $sloc + |> Loc.nudge_start (String.length "{{!") + in + let node = `Link (link_body, []) in + let what = Tokens.describe @@ Link_with_replacement link_body in + let warning = fun ~filename:_f -> + Parse_error.should_not_be_empty ~what span + in + Writer.with_warning node warning + } (* LIST *) -let list_light := - | MINUS; unordered_items = separated_list(NEWLINE; MINUS, located(nestable_block_element)); - { `List (`Unordered, `Light, [ unordered_items ]) } +let list_light_item_unordered == + | MINUS; ~ = locatedM(nestable_block_element); <> + | horizontal_whitespace; MINUS; item = locatedM(nestable_block_element); + { + let warning = fun ~filename -> + let span = Parser_aux.to_location ~filename $sloc in + Parse_error.should_begin_on_its_own_line ~what:(Tokens.describe MINUS) span + in + Writer.warning warning item + } + +let list_light_item_ordered == + | PLUS; ~ = locatedM(nestable_block_element); <> + | horizontal_whitespace; PLUS; item = locatedM(nestable_block_element); + { + let warning = fun ~filename -> + let span = Parser_aux.to_location ~filename $sloc in + Parse_error.should_begin_on_its_own_line ~what:(Tokens.describe MINUS) span + in + Writer.warning warning item + } - | PLUS; ordered_items = separated_list(NEWLINE; SPACE?; PLUS, located(nestable_block_element)); - { `List (`Ordered, `Light, [ ordered_items ]) } +let list_light := + | children = separated_nonempty_list(newline, list_light_item_ordered); + { let* children = Writer.sequence children in return @@ `List (`Ordered, `Light, [ children ]) } + | children = separated_nonempty_list(newline, list_light_item_unordered); + { let* children = Writer.sequence children in return @@ `List (`Unordered, `Light, [ children ]) } + +(* TODO: Refactor `List_item` into two tokens - Li and Dash - so that we can + handle the case where '{li' is *not* followed by whitespace in our Menhir + rule as opposed to it's semantic action *) +let item_heavy == + | LI; whitespace*; ~ = sequence_nonempty(locatedM(nestable_block_element)); whitespace?; RIGHT_BRACE; whitespace?; <> + | DASH; whitespace*; ~ = sequence_nonempty(locatedM(nestable_block_element)); whitespace?; RIGHT_BRACE; whitespace?; <> -(* NOTE: (@FayCarsons) `List_item` is [ `Li | `Dash ], not sure how that's useful though. Can't find '{li' syntax in Odoc docs *) -let item_heavy == _ = List_item; whitespace*; ~ = located(nestable_block_element)+; whitespace*; RIGHT_BRACE; whitespace*; <> let list_heavy := - | list_kind = List; whitespace*; items = item_heavy*; whitespace*; RIGHT_BRACE; - { `List (list_kind, `Heavy, items) } + | list_kind = List; whitespace?; items = sequence(item_heavy); whitespace?; RIGHT_BRACE; + { Writer.map (fun items -> `List (list_kind, `Heavy, items)) items } let list_element := | ~ = list_light; <> @@ -279,73 +418,80 @@ let list_element := (* TABLES *) -let cell_heavy := cell_kind = Table_cell; whitespace?; children = located(nestable_block_element)*; whitespace?; RIGHT_BRACE; whitespace?; { (children, cell_kind) } -let row_heavy == TABLE_ROW; whitespace?; cells = list(cell_heavy); RIGHT_BRACE; whitespace?; { cells } -let table_heavy == TABLE_HEAVY; whitespace?; grid = row_heavy+; RIGHT_BRACE; { ((grid, None), `Heavy) } - -let cell_light == BAR?; ~ = located(inline_element)+; <> (* Intermediate.cell *) -let row_light := ~ = cell_light+; BAR?; NEWLINE; <> (* Intermediate.row *) +let cell_heavy := cell_kind = Table_cell; whitespace?; children = sequence(locatedM(nestable_block_element)); whitespace?; RIGHT_BRACE; + { Writer.map (fun c -> (c, cell_kind)) children } +let row_heavy == TABLE_ROW; whitespace?; ~ = sequence_nonempty(cell_heavy); RIGHT_BRACE; <> +let table_heavy == TABLE_HEAVY; whitespace?; grid = sequence_nonempty(row_heavy); RIGHT_BRACE; + { Writer.map (fun g -> `Table ((g, None), `Heavy)) grid } +let cell_light := ~ = sequence_nonempty(locatedM(inline_element)); <> +let row_light := BAR?; cells = separated_sequence_nonempty(BAR, cell_light); BAR?; <> +let rows_light := ~ = separated_sequence_nonempty(newline, row_light); <> let table_light := (* If the first row is the alignment row then the rest should be data *) - | TABLE_LIGHT; align = row_light; data = row_light+; RIGHT_BRACE; - { - let data = List.map as_data data in - match valid_align_row align with - | Ok alignment -> ((data, Some alignment), `Light) - | Error Invalid_align -> ((data, None), `Light) - | Error Not_align -> - let rows = as_data align :: data in - ((rows, None), `Light) - } - + | TABLE_LIGHT; align = row_light; newline; data = rows_light; whitespace?; RIGHT_BRACE; + { construct_table ~align data } (* Otherwise the first should be the headers, the second align, and the rest data *) - | TABLE_LIGHT; header = row_light; align = row_light; data = row_light+; RIGHT_BRACE; - { - let data = List.map as_data data - and header = as_header header in - match valid_align_row align with - | Ok alignment -> (header :: data, Some alignment), `Light - | Error Invalid_align -> - (header :: data, None), `Light - | Error Not_align -> - let rows = header :: as_data align :: data in - (rows, None), `Light - } - + | TABLE_LIGHT; header = row_light; newline; align = row_light; newline; data = rows_light; whitespace?; RIGHT_BRACE; + { construct_table ~header ~align data } (* If there's only one row and it's not the align row, then it's data *) - | TABLE_LIGHT; data = row_light+; RIGHT_BRACE; - { (List.map as_data data, None), `Light } - + | TABLE_LIGHT; data = rows_light; whitespace?; RIGHT_BRACE; { construct_table data } (* If there's nothing inside, return an empty table *) - | TABLE_LIGHT; SPACE*; RIGHT_BRACE; - { ([[]], None), `Light } + | TABLE_LIGHT; whitespace?; RIGHT_BRACE; + { return @@ `Table (([[]], None), `Light) } + | TABLE_LIGHT; whitespace?; END; + { + let in_what = Tokens.describe TABLE_LIGHT in + let warning = fun ~filename -> + let span = Parser_aux.to_location ~filename $sloc in + Parse_error.end_not_allowed ~in_what span + in + unclosed_table warning + } + | TABLE_LIGHT; data = rows_light; whitespace?; END; + { + let in_what = Tokens.describe TABLE_LIGHT in + let warning = fun ~filename -> + let span = Parser_aux.to_location ~filename $sloc in + Parse_error.end_not_allowed ~in_what span + in + unclosed_table ~data warning + } let table := - | ~ = table_heavy; <`Table> - | ~ = table_light; <`Table> + | ~ = table_heavy; <> + | ~ = table_light; <> (* MEDIA *) +(* TODO: Naming needs some work here, multiple fields labeled as `kind` which + doesn't make sense *) let media := | media = located(Media); whitespace*; - { let (located_media_kind, media_href) = split_simple_media media in + { + let (located_media_kind, media_href) = split_simple_media media in let wrapped_located_kind = Loc.map href_of_media located_media_kind in - `Media (`Simple, wrapped_located_kind, "", media_kind_of_target media_href) } + let kind = media_kind_of_target media_href in + return @@ `Media (`Simple, wrapped_located_kind, "", kind) + } | media = located(Media_with_replacement); whitespace*; - { let (located_media_kind, media_href, content) = split_replacement_media media in + { + let (located_media_kind, media_href, content) = split_replacement_media media in let wrapped_located_kind = Loc.map href_of_media located_media_kind in - `Media (`With_text, wrapped_located_kind, content, media_kind_of_target media_href) } + let kind = media_kind_of_target media_href in + return @@ `Media (`With_text, wrapped_located_kind, content, kind) + } (* TOP-LEVEL ELEMENTS *) let nestable_block_element := - | ~ = Verbatim; <`Verbatim> - | ~ = located(inline_element)+; <`Paragraph> - | ~ = Code_block; <`Code_block> - | ~ = located(Modules)+; <`Modules> + | v = Verbatim; { return (`Verbatim v) } + | items = sequence_nonempty(locatedM(inline_element)); + { Writer.map (fun i -> `Paragraph i) items } + | c = Code_block; { return (`Code_block c) } + | modules = located(Modules)+; { return (`Modules modules) } | ~ = list_element; <> | ~ = table; <> | ~ = media; <> - | ~ = Math_block; <`Math_block> + | m = Math_block; { return (`Math_block m) } diff --git a/src/parser/test/test.ml b/src/parser/test/test.ml index 74b068a1d8..b618635800 100644 --- a/src/parser/test/test.ml +++ b/src/parser/test/test.ml @@ -1,6 +1,8 @@ open Odoc_parser open Serialize +module Serialize = Serialize + let error err = Atom (Odoc_parser.Warning.to_string err) let parser_output formatter v = diff --git a/src/parser/test/test.mli b/src/parser/test/test.mli index 4f83ff600f..bad1943eff 100644 --- a/src/parser/test/test.mli +++ b/src/parser/test/test.mli @@ -1 +1,2 @@ val test : ?location:Odoc_parser.Loc.point -> string -> unit +module Serialize = Serialize diff --git a/src/parser/test_driver/dune b/src/parser/test_driver/dune new file mode 100644 index 0000000000..b567c975bf --- /dev/null +++ b/src/parser/test_driver/dune @@ -0,0 +1,5 @@ +(executable + (name tester) + (public_name tester) + (package odoc) + (libraries odoc_parser odoc_parser_test)) diff --git a/src/parser/test_driver/tester.ml b/src/parser/test_driver/tester.ml new file mode 100644 index 0000000000..a47e9b03f1 --- /dev/null +++ b/src/parser/test_driver/tester.ml @@ -0,0 +1,161 @@ +open! Odoc_parser_test + +module Loc = Odoc_parser.Loc + +(* + NOTE (@FayCarsons): for anyone working on this parser - this is probably + the easiest way to check if something is working. The tests are numerous + and difficult to parse. A test suite will fail in its entirety if one case + throws an exception. + + If you need to test a specific parser rule or test case, add it here and + `dune exec tester -- {test cases you want to run}` +*) + +let code_cases = + [ + ("basic", "{[foo]}"); + ("empty", "{[]}"); + ("whitespace only", "{[ ]}"); + ("blank line only", "{[\n \n]}"); + ("whitespace", "{[foo bar]}"); + ("newline", "{[foo\nbar]}"); + ("carriage return", "{[foo\r\nbar]}"); + ("contains blank line", "{[foo\n\nbar]}"); + ("leading whitespace", "{[ foo]}"); + ] + +let error_recovery = + [ + ("Empty Italic", "{i}"); + ("Empty bold", "{b}"); + ("Light\nlist", "- foo\n - bar\n- baz"); + ("Empty ref w/ replacement", "{{!https://ocaml.org}}"); + ] + +let see = [ ("See", "@see bar baz quux\n") ] + +(* Cases (mostly) taken from the 'odoc for library authors' document *) +let documentation_cases = + [ + ("Light list", "- foo\n- bar"); + ("Heavy list", "{ul {li foo} {li bar} {li baz}}"); + ("Simple ref", "{!Stdlib.Buffer}"); + ("Ref w/ replacement", "{{: https://ocaml.org/ }the OCaml website}"); + ("Modules", "{!modules: Foo Bar Baz}"); + ("Block tag", "@see bar baz quux\n"); + ("Inline tag", "@author Fay Carsons"); + ("Simple tag", "@open"); + ("Math", "{math \\sum_{i=0}^n x^i%}"); + ("Inline_math", "{m x + 4}"); + ( "Heavy table", + "{table\n\ + \ {tr\n\ + \ {th Header 1}\n\ + \ {th Header 2}\n\ + \ {th Header 3}\n\ + \ }\n\ + \ {tr\n\ + \ {td Cell 1}\n\ + \ {td Cell with {e emphasized content}}\n\ + \ {td {v a block v} }\n\ + \ }\n\ + \ }" ); + ( "Light table", + {|{t\n\ + | Header 1 | Header 2 | Header 3 | Header 4|\n + | :------: | --------:|:---------|---------|\n + | centered | right | left | default |\n + omitted | bar at | start and| finish\n + | {e emph} | and | unaligned | bars |\n}|} + ); + ("Styled", "{i italicized}"); + ("Inline code", "[fmap Fun.id None]"); + ("Code block", "{[\n let foo = 0 \n]}"); + ] + +type failure = { + label : string; + case : string; + tokens : Odoc_parser.Tester.token list; + failed_at : int; +} + +let tokens_string tokens = + List.fold_left + (fun acc token -> acc ^ "\n" ^ token) + "" + (List.map Odoc_parser.Tester.string_of_token tokens) + +open Test.Serialize + +let error err = Atom (Odoc_parser.Warning.to_string err) +let parser_output formatter (ast, warnings) = + let value = Ast_to_sexp.(docs loc_at ast) in + let warnings = List (List.map error warnings) in + let output = + List [ List [ Atom "output"; value ]; List [ Atom "warnings"; warnings ] ] + in + Sexplib0.Sexp.pp_hum formatter output; + Format.pp_print_flush formatter () + +let run_test (failed : failure list) (label, case) = + let module Parser = Odoc_parser.Tester in + let unwrap_token lexbuf = + let open Parser in + let Loc.{ value; _ } = token lexbuf in + value + in + let rec get_tokens lexbuf acc = + let token = unwrap_token lexbuf in + if Odoc_parser.Tester.is_EOI token then List.rev (token :: acc) + else get_tokens lexbuf (token :: acc) + in + let lexbuf = Lexing.from_string case in + let tokens_cached = get_tokens lexbuf [] in + let idx = ref @@ -1 in + try + let tokens = ref tokens_cached in + let intermediate = + Odoc_parser.Tester.main + (fun _ -> + match !tokens with + | t :: ts -> + incr idx; + tokens := ts; + t + | [] -> failwith "No more tokens") + lexbuf + in + let ast, warnings = Parser.run ~filename:"" intermediate in + Format.printf "%a\n" parser_output (ast, warnings); + print_newline (); + Printf.printf "Got %d warnings \n%!" (List.length warnings); + Printf.printf "Warnings:\n%s\n%!" + (List.fold_left + (fun acc warning -> acc ^ "\n" ^ Parser.pp_warning warning) + "" warnings); + failed + with _ -> + { case; label; tokens = tokens_cached; failed_at = !idx } :: failed + +let () = + let cases = + if Array.length Sys.argv > 1 then ( + match Sys.argv.(1) with + | "code" -> code_cases + | "recovery" | "r" -> error_recovery + | "docs" | "d" -> documentation_cases + | "see" | "s" -> see + | _ -> + print_endline "unrecognized argument - running documentation_cases"; + documentation_cases) + else documentation_cases + in + List.fold_left run_test [] cases + |> List.iter (fun { label; case; tokens; failed_at } -> + Printf.printf + "Failure: %s\nInput:\n%s\nOffending token:\n%d: %s\nTokens: %s\n\n" + label case failed_at + (List.nth tokens failed_at |> Odoc_parser.Tester.string_of_token) + (tokens_string tokens)) diff --git a/src/parser/tokens.ml b/src/parser/tokens.ml index 0339b732db..76195799a2 100644 --- a/src/parser/tokens.ml +++ b/src/parser/tokens.ml @@ -32,7 +32,8 @@ type token = | Paragraph_style of paragraph_style | Style of style | List of Ast.list_kind - | List_item of Ast.list_item + | LI + | DASH | TABLE_LIGHT | TABLE_HEAVY | TABLE_ROW @@ -101,8 +102,8 @@ let print : token -> string = function | Style `Subscript -> "'{_'" | List `Ordered -> "{ol" | List `Unordered -> "{ul" - | List_item `Li -> "'{li ...}'" - | List_item `Dash -> "'{- ...}'" + | LI -> "'{li ...}'" + | DASH -> "'{- ...}'" | TABLE_LIGHT -> "{t" | TABLE_HEAVY -> "{table" | TABLE_ROW -> "'{tr'" @@ -162,7 +163,8 @@ let describe : token -> string = function | Link_with_replacement _ -> "'{{:...} ...}' (external link)" | END -> "end of text" | SPACE -> "whitespace" - | Single_newline _ | NEWLINE -> "line break" + | Single_newline _ -> "newline" + | NEWLINE -> "line break" | Blank_line _ -> "blank line" | RIGHT_BRACE -> "'}'" | RIGHT_CODE_DELIMITER -> "']}'" @@ -171,8 +173,8 @@ let describe : token -> string = function | Modules _ -> "'{!modules ...}'" | List `Unordered -> "'{ul ...}' (bulleted list)" | List `Ordered -> "'{ol ...}' (numbered list)" - | List_item `Li -> "'{li ...}' (list item)" - | List_item `Dash -> "'{- ...}' (list item)" + | LI -> "'{li ...}' (list item)" + | DASH -> "'{- ...}' (list item)" | TABLE_LIGHT -> "'{t ...}' (table)" | TABLE_HEAVY -> "'{table ...}' (table)" | TABLE_ROW -> "'{tr ...}' (table row)" diff --git a/src/parser/writer.ml b/src/parser/writer.ml new file mode 100644 index 0000000000..609a12a206 --- /dev/null +++ b/src/parser/writer.ml @@ -0,0 +1,50 @@ +type partial_warning = filename:string -> Warning.t +type +'a t = Writer of ('a * partial_warning list) + +let return : 'a -> 'a t = fun x -> Writer (x, []) + +let bind : 'a t -> ('a -> 'b t) -> 'b t = + fun (Writer (node, warnings)) f -> + let (Writer (next, next_warnings)) = f node in + Writer (next, warnings @ next_warnings) + +let map : ('a -> 'b) -> 'a t -> 'b t = fun f w -> bind w (fun x -> return (f x)) + +let seq_right : 'a t -> 'b t -> 'b t = + fun (Writer (_, ws)) (Writer (x, ws2)) -> Writer (x, ws @ ws2) + +let seq_left : 'a t -> 'b t -> 'a t = + fun (Writer (x, ws)) (Writer (_, ws2)) -> Writer (x, ws @ ws2) + +module Prelude = struct + let return = return + let ( >>= ) = bind + let ( let* ) = bind + let ( and* ) = bind + let ( let+ ) = map + let ( >|= ) = map + let ( *> ) = seq_right + let ( <* ) = seq_left +end + +let warning warning (Writer (n, ws)) = Writer (n, warning :: ws) + +let sequence : 'a t list -> 'a list t = + fun xs -> + let rec go nodes warnings = function + | Writer (n, ws) :: xs -> go (n :: nodes) (ws @ warnings) xs + | [] -> Writer (nodes, warnings) + in + go [] [] xs + +let map2 : ('a -> 'b -> 'c) -> 'a t -> 'b t -> 'c t = + fun f (Writer (a, ws)) (Writer (b, wsb)) -> Writer (f a b, wsb @ ws) + +let traverse : ('a -> 'b t) -> 'a list -> 'b list t = + fun f xs -> sequence (List.map f xs) + +let with_warning node warning = Writer (node, [ warning ]) + +let run : filename:string -> Ast.t t -> Ast.t * Warning.t list = + fun ~filename (Writer (tree, warnings)) -> + (tree, List.map (fun f -> f ~filename) warnings) From c04e42c199de7958252008efaf6adb3414a01544 Mon Sep 17 00:00:00 2001 From: faycarsons Date: Mon, 25 Nov 2024 11:05:08 -0500 Subject: [PATCH 062/150] Add heavy list item warnings --- src/parser/parser.mly | 65 +++++++++++++++++++++++++++++++++++++++++-- src/parser/writer.ml | 4 +++ 2 files changed, 67 insertions(+), 2 deletions(-) diff --git a/src/parser/parser.mly b/src/parser/parser.mly index fdb1d14329..a24ffd5773 100644 --- a/src/parser/parser.mly +++ b/src/parser/parser.mly @@ -8,6 +8,8 @@ let location = Parser_aux.to_location loc in { location; value } + let not_empty : 'a list -> bool = function _ :: _ -> true | _ -> false + type align_error = | Invalid_align (* An invalid align cell *) | Not_align (* Not an align cell *) @@ -405,8 +407,67 @@ let list_light := handle the case where '{li' is *not* followed by whitespace in our Menhir rule as opposed to it's semantic action *) let item_heavy == - | LI; whitespace*; ~ = sequence_nonempty(locatedM(nestable_block_element)); whitespace?; RIGHT_BRACE; whitespace?; <> - | DASH; whitespace*; ~ = sequence_nonempty(locatedM(nestable_block_element)); whitespace?; RIGHT_BRACE; whitespace?; <> + | LI; whitespace; items = sequence(locatedM(nestable_block_element)); RIGHT_BRACE; whitespace?; + { + let warning = fun ~filename -> + let span = Parser_aux.to_location ~filename $sloc in + Parse_error.should_not_be_empty ~what:(Tokens.describe LI) span + in + Writer.ensure not_empty warning items + } + | LI; items = sequence(locatedM(nestable_block_element)); RIGHT_BRACE; whitespace?; + { + let warning = fun ~filename -> + let span = Parser_aux.to_location ~filename $sloc in + Parse_error.should_be_followed_by_whitespace ~what:(Tokens.describe LI) span + in + let writer = + let warning = fun ~filename -> + let span = Parser_aux.to_location ~filename $sloc in + Parse_error.should_not_be_empty ~what:(Tokens.describe LI) span + in + Writer.ensure not_empty warning items + in + Writer.warning warning writer + } + | DASH; whitespace?; items = sequence(locatedM(nestable_block_element)); RIGHT_BRACE; whitespace?; + { + let warning = fun ~filename -> + let span = Parser_aux.to_location ~filename $sloc in + Parse_error.should_not_be_empty ~what:(Tokens.describe LI) span + in + Writer.ensure not_empty warning items + } + | LI; whitespace; items = sequence(locatedM(nestable_block_element)); END; + { + let writer = + let warning = fun ~filename -> + let span = Parser_aux.to_location ~filename $sloc in + Parse_error.should_not_be_empty ~what:(Tokens.describe LI) span + in + Writer.ensure not_empty warning items + in + let warning = fun ~filename -> + let span = Parser_aux.to_location ~filename $sloc in + Parse_error.end_not_allowed ~in_what:(Tokens.describe LI) span + in + Writer.warning warning writer + } + | DASH; whitespace; items = sequence(locatedM(nestable_block_element)); END; + { + let writer = + let warning = fun ~filename -> + let span = Parser_aux.to_location ~filename $sloc in + Parse_error.should_not_be_empty ~what:(Tokens.describe DASH) span + in + Writer.ensure not_empty warning items + in + let warning = fun ~filename -> + let span = Parser_aux.to_location ~filename $sloc in + Parse_error.end_not_allowed ~in_what:(Tokens.describe DASH) span + in + Writer.warning warning writer + } let list_heavy := | list_kind = List; whitespace?; items = sequence(item_heavy); whitespace?; RIGHT_BRACE; diff --git a/src/parser/writer.ml b/src/parser/writer.ml index 609a12a206..39e7c87d47 100644 --- a/src/parser/writer.ml +++ b/src/parser/writer.ml @@ -45,6 +45,10 @@ let traverse : ('a -> 'b t) -> 'a list -> 'b list t = let with_warning node warning = Writer (node, [ warning ]) +let ensure : ('a -> bool) -> partial_warning -> 'a t -> 'a t = + fun pred warning (Writer (x, ws) as self) -> + if pred x then self else Writer (x, warning :: ws) + let run : filename:string -> Ast.t t -> Ast.t * Warning.t list = fun ~filename (Writer (tree, warnings)) -> (tree, List.map (fun f -> f ~filename) warnings) From b1b3e5da1efae381ed4dfeeea74b511d0a372030 Mon Sep 17 00:00:00 2001 From: faycarsons Date: Mon, 25 Nov 2024 11:16:28 -0500 Subject: [PATCH 063/150] Handle EOI in heavy list item parsing --- src/parser/parser.mly | 44 +++++++++++++++++++++++++------------------ 1 file changed, 26 insertions(+), 18 deletions(-) diff --git a/src/parser/parser.mly b/src/parser/parser.mly index a24ffd5773..216f913a08 100644 --- a/src/parser/parser.mly +++ b/src/parser/parser.mly @@ -438,35 +438,43 @@ let item_heavy == in Writer.ensure not_empty warning items } - | LI; whitespace; items = sequence(locatedM(nestable_block_element)); END; + | LI; whitespace?; items = sequence(locatedM(nestable_block_element))?; END; { - let writer = - let warning = fun ~filename -> - let span = Parser_aux.to_location ~filename $sloc in - Parse_error.should_not_be_empty ~what:(Tokens.describe LI) span - in - Writer.ensure not_empty warning items - in let warning = fun ~filename -> let span = Parser_aux.to_location ~filename $sloc in Parse_error.end_not_allowed ~in_what:(Tokens.describe LI) span in - Writer.warning warning writer + match items with + | Some items -> + let writer = + let warning = fun ~filename -> + let span = Parser_aux.to_location ~filename $sloc in + Parse_error.should_not_be_empty ~what:(Tokens.describe LI) span + in + Writer.ensure not_empty warning items + in + Writer.warning warning writer + | None -> + Writer.with_warning [] warning } - | DASH; whitespace; items = sequence(locatedM(nestable_block_element)); END; + | DASH; whitespace?; items = sequence(locatedM(nestable_block_element))?; END; { - let writer = - let warning = fun ~filename -> - let span = Parser_aux.to_location ~filename $sloc in - Parse_error.should_not_be_empty ~what:(Tokens.describe DASH) span - in - Writer.ensure not_empty warning items - in let warning = fun ~filename -> let span = Parser_aux.to_location ~filename $sloc in Parse_error.end_not_allowed ~in_what:(Tokens.describe DASH) span in - Writer.warning warning writer + match items with + | Some items -> + let writer = + let warning = fun ~filename -> + let span = Parser_aux.to_location ~filename $sloc in + Parse_error.should_not_be_empty ~what:(Tokens.describe DASH) span + in + Writer.ensure not_empty warning items + in + Writer.warning warning writer + | None -> + Writer.with_warning [] warning } let list_heavy := From 69dba7f5e2fba8c2885be7f1324b61b9fe5870fe Mon Sep 17 00:00:00 2001 From: faycarsons Date: Mon, 25 Nov 2024 16:15:35 -0500 Subject: [PATCH 064/150] Add all currently handled error cases to test code, fix `sequence` --- src/parser/parser.mly | 67 +++++++++++++++++++++++++++++--- src/parser/test_driver/tester.ml | 33 +++++----------- src/parser/writer.ml | 8 ++-- 3 files changed, 74 insertions(+), 34 deletions(-) diff --git a/src/parser/parser.mly b/src/parser/parser.mly index 216f913a08..681c37339c 100644 --- a/src/parser/parser.mly +++ b/src/parser/parser.mly @@ -205,20 +205,22 @@ %% (* Utility which wraps the return value of a producer in `Loc.with_location` *) -let locatedM(rule) == inner = rule; { Writer.map (wrap_location $loc) inner } -let located(rule) == | inner = rule; { wrap_location $sloc inner } +let locatedM(rule) == inner = rule; { Writer.map (wrap_location $sloc) inner } +let located(rule) == inner = rule; { wrap_location $sloc inner } let sequence(rule) == xs = rule*; { Writer.sequence xs } let sequence_nonempty(rule) == xs = rule+; { Writer.sequence xs } -let separated_sequence_nonempty(separator, X) := +let separated_sequence_nonempty_inner(separator, X) := | x = X; { let* x = x in return [ x ] } - | x = X; separator; xs = separated_sequence_nonempty(separator, X); + | xs = separated_sequence_nonempty_inner(separator, X); separator; x = X; { let* x = x in let* xs = xs in return @@ x :: xs } +let separated_sequence_nonempty(sep, rule) == xs = separated_sequence_nonempty_inner(sep, rule); { Writer.map List.rev xs } + (* ENTRY-POINT *) (* A comment can either contain block elements, block elements and then tags, or @@ -227,6 +229,57 @@ let main := | nodes = sequence_nonempty(locatedM(toplevel)); whitespace?; END; { nodes } | whitespace?; END; { Writer.return @@ [] } +let any == + | SPACE; { SPACE } + | NEWLINE; { NEWLINE } + | RIGHT_BRACE; { RIGHT_BRACE } + | RIGHT_CODE_DELIMITER; { RIGHT_CODE_DELIMITER } + | b = Blank_line; { Blank_line b } + | s = Single_newline; { Single_newline s } + | s = Space; { Space s } + | w = Word; { Word w } + | MINUS; { MINUS } + | PLUS; { PLUS } + | BAR; { BAR } + | s = Style; { Style s } + | p = Paragraph_style; { Paragraph_style p } + | m = Modules; { Modules m } + | m = Math_span; { Math_span m } + | m = Math_block; { Math_block m } + | r = Raw_markup; { Raw_markup r } + | c = Code_block; { Code_block c } + | c = Code_span; { Code_span c } + | l = List; { List l } + | LI; { LI } + | DASH; { DASH } + | TABLE_LIGHT; { TABLE_LIGHT } + | TABLE_HEAVY; { TABLE_HEAVY } + | TABLE_ROW; { TABLE_ROW } + | t = Table_cell; { Table_cell t } + | s = Section_heading; { Section_heading s } + | a = Author; { Author a } + | DEPRECATED; { DEPRECATED } + | p = Param; { Param p } + | r = Raise; { Raise r } + | RETURN; { RETURN } + | s = See; { See s } + | s = Since; { Since s } + | b = Before; { Before b } + | v = Version; { Version v } + | c = Canonical; { Canonical c } + | INLINE; { INLINE } + | OPEN; { OPEN } + | CLOSED; { CLOSED } + | HIDDEN; { HIDDEN } + | r = Simple_ref; { Simple_ref r } + | r = Ref_with_replacement; { Ref_with_replacement r } + | l = Simple_link; { Simple_link l } + | l = Link_with_replacement; { Link_with_replacement l } + | m = Media; { Media m } + | m = Media_with_replacement; { Media_with_replacement m } + | v = Verbatim; { Verbatim v } + | END; { END } + let toplevel := | block = nestable_block_element; { Writer.map (fun b -> (b :> Ast.block_element) ) block } | ~ = tag; <> @@ -553,9 +606,11 @@ let media := (* TOP-LEVEL ELEMENTS *) -let nestable_block_element := +let nestable_block_element := ~ = nestable_block_element_inner; newline?; <> + +let nestable_block_element_inner := | v = Verbatim; { return (`Verbatim v) } - | items = sequence_nonempty(locatedM(inline_element)); + | items = sequence_nonempty(locatedM(inline_element)); { Writer.map (fun i -> `Paragraph i) items } | c = Code_block; { return (`Code_block c) } | modules = located(Modules)+; { return (`Modules modules) } diff --git a/src/parser/test_driver/tester.ml b/src/parser/test_driver/tester.ml index a47e9b03f1..181e0107c6 100644 --- a/src/parser/test_driver/tester.ml +++ b/src/parser/test_driver/tester.ml @@ -27,10 +27,13 @@ let code_cases = let error_recovery = [ - ("Empty Italic", "{i}"); - ("Empty bold", "{b}"); - ("Light\nlist", "- foo\n - bar\n- baz"); - ("Empty ref w/ replacement", "{{!https://ocaml.org}}"); + ("Empty style", "{i}"); + ("Empty ref", "{{!www.google.com}}"); + ("Empty link", "{{:www.google.com}}"); + ("List item not at beginning of line", "- foo\n - bar\n- baz"); + ("Empty list item", "{ol {li} }"); + ("'{li' not followed by whitespace", "{ol {lifoo bar baz} }"); + ("End not allowed in table", "{t "); ] let see = [ ("See", "@see bar baz quux\n") ] @@ -106,26 +109,11 @@ let run_test (failed : failure list) (label, case) = let Loc.{ value; _ } = token lexbuf in value in - let rec get_tokens lexbuf acc = - let token = unwrap_token lexbuf in - if Odoc_parser.Tester.is_EOI token then List.rev (token :: acc) - else get_tokens lexbuf (token :: acc) - in let lexbuf = Lexing.from_string case in - let tokens_cached = get_tokens lexbuf [] in - let idx = ref @@ -1 in + Lexing.set_filename lexbuf "TESTER"; try - let tokens = ref tokens_cached in let intermediate = - Odoc_parser.Tester.main - (fun _ -> - match !tokens with - | t :: ts -> - incr idx; - tokens := ts; - t - | [] -> failwith "No more tokens") - lexbuf + Odoc_parser.Tester.main (fun lexbuf -> unwrap_token lexbuf) lexbuf in let ast, warnings = Parser.run ~filename:"" intermediate in Format.printf "%a\n" parser_output (ast, warnings); @@ -136,8 +124,7 @@ let run_test (failed : failure list) (label, case) = (fun acc warning -> acc ^ "\n" ^ Parser.pp_warning warning) "" warnings); failed - with _ -> - { case; label; tokens = tokens_cached; failed_at = !idx } :: failed + with _ -> { case; label; tokens = []; failed_at = 0 } :: failed let () = let cases = diff --git a/src/parser/writer.ml b/src/parser/writer.ml index 39e7c87d47..ea4bd519b4 100644 --- a/src/parser/writer.ml +++ b/src/parser/writer.ml @@ -31,11 +31,9 @@ let warning warning (Writer (n, ws)) = Writer (n, warning :: ws) let sequence : 'a t list -> 'a list t = fun xs -> - let rec go nodes warnings = function - | Writer (n, ws) :: xs -> go (n :: nodes) (ws @ warnings) xs - | [] -> Writer (nodes, warnings) - in - go [] [] xs + let go (ns, ws) (Writer (n, w)) = (n :: ns, w @ ws) in + let xs, ws = List.fold_left go ([], []) xs in + Writer (List.rev xs, ws) let map2 : ('a -> 'b -> 'c) -> 'a t -> 'b t -> 'c t = fun f (Writer (a, ws)) (Writer (b, wsb)) -> Writer (f a b, wsb @ ws) From cedeb7d8d2022d04f154d2645b4cb738de13316b Mon Sep 17 00:00:00 2001 From: faycarsons Date: Mon, 25 Nov 2024 16:26:39 -0500 Subject: [PATCH 065/150] Remove `any` rule --- src/parser/parser.mly | 52 +------------------------------------------ 1 file changed, 1 insertion(+), 51 deletions(-) diff --git a/src/parser/parser.mly b/src/parser/parser.mly index 681c37339c..d64994e70a 100644 --- a/src/parser/parser.mly +++ b/src/parser/parser.mly @@ -229,57 +229,6 @@ let main := | nodes = sequence_nonempty(locatedM(toplevel)); whitespace?; END; { nodes } | whitespace?; END; { Writer.return @@ [] } -let any == - | SPACE; { SPACE } - | NEWLINE; { NEWLINE } - | RIGHT_BRACE; { RIGHT_BRACE } - | RIGHT_CODE_DELIMITER; { RIGHT_CODE_DELIMITER } - | b = Blank_line; { Blank_line b } - | s = Single_newline; { Single_newline s } - | s = Space; { Space s } - | w = Word; { Word w } - | MINUS; { MINUS } - | PLUS; { PLUS } - | BAR; { BAR } - | s = Style; { Style s } - | p = Paragraph_style; { Paragraph_style p } - | m = Modules; { Modules m } - | m = Math_span; { Math_span m } - | m = Math_block; { Math_block m } - | r = Raw_markup; { Raw_markup r } - | c = Code_block; { Code_block c } - | c = Code_span; { Code_span c } - | l = List; { List l } - | LI; { LI } - | DASH; { DASH } - | TABLE_LIGHT; { TABLE_LIGHT } - | TABLE_HEAVY; { TABLE_HEAVY } - | TABLE_ROW; { TABLE_ROW } - | t = Table_cell; { Table_cell t } - | s = Section_heading; { Section_heading s } - | a = Author; { Author a } - | DEPRECATED; { DEPRECATED } - | p = Param; { Param p } - | r = Raise; { Raise r } - | RETURN; { RETURN } - | s = See; { See s } - | s = Since; { Since s } - | b = Before; { Before b } - | v = Version; { Version v } - | c = Canonical; { Canonical c } - | INLINE; { INLINE } - | OPEN; { OPEN } - | CLOSED; { CLOSED } - | HIDDEN; { HIDDEN } - | r = Simple_ref; { Simple_ref r } - | r = Ref_with_replacement; { Ref_with_replacement r } - | l = Simple_link; { Simple_link l } - | l = Link_with_replacement; { Link_with_replacement l } - | m = Media; { Media m } - | m = Media_with_replacement; { Media_with_replacement m } - | v = Verbatim; { Verbatim v } - | END; { END } - let toplevel := | block = nestable_block_element; { Writer.map (fun b -> (b :> Ast.block_element) ) block } | ~ = tag; <> @@ -558,6 +507,7 @@ let table_light := { construct_table ~header ~align data } (* If there's only one row and it's not the align row, then it's data *) | TABLE_LIGHT; data = rows_light; whitespace?; RIGHT_BRACE; { construct_table data } + (* If there's nothing inside, return an empty table *) | TABLE_LIGHT; whitespace?; RIGHT_BRACE; { return @@ `Table (([[]], None), `Light) } From 6a04998c82b40a0dbfddb19e9dbdc03f7493cc07 Mon Sep 17 00:00:00 2001 From: faycarsons Date: Mon, 25 Nov 2024 17:04:59 -0500 Subject: [PATCH 066/150] Describe Ast nodes like tokens for error recovery --- src/parser/ast.ml | 24 +++++++++++++++---- src/parser/tokens.ml | 57 ++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 76 insertions(+), 5 deletions(-) diff --git a/src/parser/ast.ml b/src/parser/ast.ml index e4c9dd54c2..e8ca9984ce 100644 --- a/src/parser/ast.ml +++ b/src/parser/ast.ml @@ -20,7 +20,6 @@ type alignment = [ `Left | `Center | `Right ] type reference_kind = [ `Simple | `With_text ] (** References in doc comments can be of two kinds: [{!simple}] or [{{!ref}With text}]. *) - type inline_element = [ `Space of string | `Word of string @@ -63,9 +62,7 @@ and nestable_block_element = | `Verbatim of string | `Modules of string with_location list | `List of - list_kind - * list_syntax - * nestable_block_element with_location list list + list_kind * list_syntax * nestable_block_element with_location list list | `Table of table | `Math_block of string (** @since 2.0.0 *) | `Media of reference_kind * media_href with_location * string * media @@ -76,9 +73,26 @@ and nestable_block_element = {{:https://ocaml.org/releases/4.12/htmlman/ocamldoc.html#sss:ocamldoc-list}manual}). *) - and table = nestable_block_element abstract_table * [ `Light | `Heavy ] +let empty_code_block = + { + meta = None; + delimiter = None; + content = + Loc. + { + value = ""; + location = + { + file = ""; + start = { line = 0; column = 0 }; + end_ = { line = 0; column = 0 }; + }; + }; + output = None; + } + type internal_tag = [ `Canonical of string with_location | `Inline | `Open | `Closed | `Hidden ] (** Internal tags are used to exercise fine control over the output of odoc. They diff --git a/src/parser/tokens.ml b/src/parser/tokens.ml index 76195799a2..5a96baf3a1 100644 --- a/src/parser/tokens.ml +++ b/src/parser/tokens.ml @@ -199,3 +199,60 @@ let describe : token -> string = function | OPEN -> "'@open'" | CLOSED -> "'@closed'" | HIDDEN -> "'@hidden" + +let empty_code_block = + Ast. + { + meta = None; + delimiter = None; + content = + Loc. + { + value = ""; + location = + { + file = ""; + start = { line = 0; column = 0 }; + end_ = { line = 0; column = 0 }; + }; + }; + output = None; + } + +let describe_inline : Ast.inline_element -> string = function + | `Word w -> describe @@ Word w + | `Space _ -> describe SPACE + | `Styled (style, _) -> describe @@ Style style + | `Code_span _ -> describe @@ Code_span "" + | `Math_span _ -> describe @@ Math_span "" + | `Raw_markup x -> describe @@ Raw_markup x + | `Link (l, []) -> describe @@ Simple_link l + | `Link (l, _ :: _) -> describe @@ Link_with_replacement l + | `Reference (`Simple, { value; _ }, _) -> describe @@ Simple_ref value + | `Reference (`With_text, { value; _ }, _) -> + describe @@ Ref_with_replacement value + +let of_href = function `Reference s -> Reference s | `Link s -> Link s + +let of_media_kind = function + | `Audio -> Audio + | `Image -> Image + | `Video -> Video + +let of_media = function + | `Media (_, Loc.{ value; _ }, _, kind) -> + Media (of_href value, of_media_kind kind) + +let describe_nestable_block : Ast.nestable_block_element -> string = function + | `Paragraph ws -> ( + match ws with + | Loc.{ value; _ } :: _ -> describe_inline value + | [] -> describe @@ Word "") + | `Code_block _ -> describe @@ Code_block empty_code_block + | `Verbatim _ -> describe @@ Verbatim "" + | `Modules _ -> describe @@ Modules "" (* NOTE: Fix list *) + | `List (_, kind, _) -> describe @@ if kind = `Light then MINUS else DASH + | `Table (_, kind) -> + describe @@ if kind = `Light then TABLE_LIGHT else TABLE_HEAVY + | `Math_block _ -> describe @@ Math_block "" + | `Media _ as media -> describe @@ of_media media From fd6d260ac409413e130eb78ff1395321ce4057d1 Mon Sep 17 00:00:00 2001 From: faycarsons Date: Wed, 27 Nov 2024 09:41:58 -0500 Subject: [PATCH 067/150] remove empty_code_block from ast.ml --- src/parser/ast.ml | 18 ------------------ src/parser/parser.mly | 13 +++++++------ 2 files changed, 7 insertions(+), 24 deletions(-) diff --git a/src/parser/ast.ml b/src/parser/ast.ml index e8ca9984ce..28a3508168 100644 --- a/src/parser/ast.ml +++ b/src/parser/ast.ml @@ -75,24 +75,6 @@ and nestable_block_element = and table = nestable_block_element abstract_table * [ `Light | `Heavy ] -let empty_code_block = - { - meta = None; - delimiter = None; - content = - Loc. - { - value = ""; - location = - { - file = ""; - start = { line = 0; column = 0 }; - end_ = { line = 0; column = 0 }; - }; - }; - output = None; - } - type internal_tag = [ `Canonical of string with_location | `Inline | `Open | `Closed | `Hidden ] (** Internal tags are used to exercise fine control over the output of odoc. They diff --git a/src/parser/parser.mly b/src/parser/parser.mly index d64994e70a..a159bfe4c9 100644 --- a/src/parser/parser.mly +++ b/src/parser/parser.mly @@ -333,7 +333,8 @@ let reference := Writer.with_warning node warning } -let link := | link_body = Simple_link; RIGHT_BRACE; +let link := + | link_body = Simple_link; RIGHT_BRACE; { let node = `Link (link_body, []) in let url = String.trim link_body in @@ -405,9 +406,6 @@ let list_light := | children = separated_nonempty_list(newline, list_light_item_unordered); { let* children = Writer.sequence children in return @@ `List (`Unordered, `Light, [ children ]) } -(* TODO: Refactor `List_item` into two tokens - Li and Dash - so that we can - handle the case where '{li' is *not* followed by whitespace in our Menhir - rule as opposed to it's semantic action *) let item_heavy == | LI; whitespace; items = sequence(locatedM(nestable_block_element)); RIGHT_BRACE; whitespace?; { @@ -491,8 +489,11 @@ let list_element := let cell_heavy := cell_kind = Table_cell; whitespace?; children = sequence(locatedM(nestable_block_element)); whitespace?; RIGHT_BRACE; { Writer.map (fun c -> (c, cell_kind)) children } -let row_heavy == TABLE_ROW; whitespace?; ~ = sequence_nonempty(cell_heavy); RIGHT_BRACE; <> -let table_heavy == TABLE_HEAVY; whitespace?; grid = sequence_nonempty(row_heavy); RIGHT_BRACE; + +let row_heavy := + | TABLE_ROW; whitespace?; ~ = sequence_nonempty(cell_heavy); RIGHT_BRACE; <> + +let table_heavy := TABLE_HEAVY; whitespace?; grid = sequence_nonempty(row_heavy); RIGHT_BRACE; { Writer.map (fun g -> `Table ((g, None), `Heavy)) grid } let cell_light := ~ = sequence_nonempty(locatedM(inline_element)); <> From 90176a5ea59e74ca3cdd9b943a674067f437754d Mon Sep 17 00:00:00 2001 From: faycarsons Date: Wed, 27 Nov 2024 10:27:59 -0500 Subject: [PATCH 068/150] Add empty tag warnings --- src/parser/loc.ml | 1 + src/parser/loc.mli | 3 +++ src/parser/parser.mly | 45 +++++++++++++++++++++++++++++--- src/parser/test_driver/tester.ml | 27 +++++-------------- 4 files changed, 52 insertions(+), 24 deletions(-) diff --git a/src/parser/loc.ml b/src/parser/loc.ml index 892376eb02..0c6d0679e7 100644 --- a/src/parser/loc.ml +++ b/src/parser/loc.ml @@ -6,6 +6,7 @@ let at location value = { location; value } let location { location; _ } = location let value { value; _ } = value let map f annotated = { annotated with value = f annotated.value } +let is predicate { value; _ } = predicate value let same annotated value = { annotated with value } let span spans = diff --git a/src/parser/loc.mli b/src/parser/loc.mli index 599f8d91a5..59bbfcfa5e 100644 --- a/src/parser/loc.mli +++ b/src/parser/loc.mli @@ -43,6 +43,9 @@ val value : 'a with_location -> 'a val map : ('a -> 'b) -> 'a with_location -> 'b with_location (** Map over a located value without changing its location *) +val is : ('a -> bool) -> 'a with_location -> bool +(** Test a located value with a predicate *) + val same : _ with_location -> 'b -> 'b with_location (** [same x y] retuns the value y wrapped in a {!with_location} whose location is that of [x] *) diff --git a/src/parser/parser.mly b/src/parser/parser.mly index a159bfe4c9..3b74fc29ac 100644 --- a/src/parser/parser.mly +++ b/src/parser/parser.mly @@ -9,6 +9,7 @@ { location; value } let not_empty : 'a list -> bool = function _ :: _ -> true | _ -> false + let has_content : string -> bool = fun s -> String.length s > 0 type align_error = | Invalid_align (* An invalid align cell *) @@ -273,10 +274,46 @@ let tag_with_content := } let tag_bare := - | version = Version; { return @@ `Version version } - | version = Since; { return @@ `Since version } - | impl = located(Canonical); { return @@ `Canonical impl } - | version = Author; { return @@ `Author version } + | version = Version; + { + let what = Tokens.describe (Version version) in + let warning = fun ~filename -> + let span = Parser_aux.to_location ~filename $sloc in + Parse_error.should_not_be_empty ~what span + in + Writer.ensure has_content warning (return version) + |> Writer.map (fun v -> `Version v) + } + | version = Since; + { + let what = Tokens.describe (Since version) in + let warning = fun ~filename -> + let span = Parser_aux.to_location ~filename $sloc in + Parse_error.should_not_be_empty ~what span + in + Writer.ensure has_content warning (return version) + |> Writer.map (fun v -> `Since v) + } + | impl = located(Canonical); + { + let what = Tokens.describe @@ Canonical "" in + let warning = fun ~filename -> + let span = Parser_aux.to_location ~filename $sloc in + Parse_error.should_not_be_empty ~what span + in + Writer.ensure (Loc.is has_content) warning @@ return impl + |> Writer.map (fun v -> `Canonical v) + } + | author = Author; + { + let what = Tokens.describe @@ Author author in + let warning = fun ~filename -> + let span = Parser_aux.to_location ~filename $sloc in + Parse_error.should_not_be_empty ~what span + in + Writer.ensure has_content warning @@ return author + |> Writer.map (fun a -> `Author a) + } | OPEN; { return `Open } | INLINE; { return `Inline } | CLOSED; { return `Closed } diff --git a/src/parser/test_driver/tester.ml b/src/parser/test_driver/tester.ml index 181e0107c6..a9ceb302c5 100644 --- a/src/parser/test_driver/tester.ml +++ b/src/parser/test_driver/tester.ml @@ -34,6 +34,8 @@ let error_recovery = ("Empty list item", "{ol {li} }"); ("'{li' not followed by whitespace", "{ol {lifoo bar baz} }"); ("End not allowed in table", "{t "); + ("Empty @author", "@author"); + ("Empty @version" , "@version") ] let see = [ ("See", "@see bar baz quux\n") ] @@ -77,19 +79,6 @@ let documentation_cases = ("Code block", "{[\n let foo = 0 \n]}"); ] -type failure = { - label : string; - case : string; - tokens : Odoc_parser.Tester.token list; - failed_at : int; -} - -let tokens_string tokens = - List.fold_left - (fun acc token -> acc ^ "\n" ^ token) - "" - (List.map Odoc_parser.Tester.string_of_token tokens) - open Test.Serialize let error err = Atom (Odoc_parser.Warning.to_string err) @@ -102,7 +91,7 @@ let parser_output formatter (ast, warnings) = Sexplib0.Sexp.pp_hum formatter output; Format.pp_print_flush formatter () -let run_test (failed : failure list) (label, case) = +let run_test (failed : ( string * string ) list) (label, case) = let module Parser = Odoc_parser.Tester in let unwrap_token lexbuf = let open Parser in @@ -124,7 +113,7 @@ let run_test (failed : failure list) (label, case) = (fun acc warning -> acc ^ "\n" ^ Parser.pp_warning warning) "" warnings); failed - with _ -> { case; label; tokens = []; failed_at = 0 } :: failed + with _ -> ( case, label ) :: failed let () = let cases = @@ -140,9 +129,7 @@ let () = else documentation_cases in List.fold_left run_test [] cases - |> List.iter (fun { label; case; tokens; failed_at } -> + |> List.iter (fun (label, case) -> Printf.printf - "Failure: %s\nInput:\n%s\nOffending token:\n%d: %s\nTokens: %s\n\n" - label case failed_at - (List.nth tokens failed_at |> Odoc_parser.Tester.string_of_token) - (tokens_string tokens)) + "Failure: %s\nInput:\n%s\n%!" + label case) From 885802917d44deeb485082312b606db32c13d6a4 Mon Sep 17 00:00:00 2001 From: faycarsons Date: Wed, 27 Nov 2024 12:04:15 -0500 Subject: [PATCH 069/150] add non-empty warnings for nestable block elements --- src/parser/parser.mly | 62 +++++++++++++++++++++++++++++++++++-------- 1 file changed, 51 insertions(+), 11 deletions(-) diff --git a/src/parser/parser.mly b/src/parser/parser.mly index 3b74fc29ac..dd75607f86 100644 --- a/src/parser/parser.mly +++ b/src/parser/parser.mly @@ -358,13 +358,13 @@ let reference := } | ref_body = located(Ref_with_replacement); RIGHT_BRACE; { - let span = - Parser_aux.to_location $sloc - |> Loc.nudge_start (String.length "{{!") - in let node = `Reference (`With_text, ref_body, []) in let what = Tokens.describe @@ Ref_with_replacement (Loc.value ref_body) in - let warning = fun ~filename:_f -> + let warning = fun ~filename -> + let span = + Parser_aux.to_location ~filename $sloc + |> Loc.nudge_start (String.length "{{!") + in Parse_error.should_not_be_empty ~what span in Writer.with_warning node warning @@ -377,8 +377,8 @@ let link := let url = String.trim link_body in if "" = url then let what = Tokens.describe @@ Simple_link link_body in - let span = Parser_aux.to_location $sloc in - let warning = fun ~filename:_f -> + let warning = fun ~filename -> + let span = Parser_aux.to_location ~filename $sloc in Parse_error.should_not_be_empty ~what span in Writer.with_warning node warning @@ -516,7 +516,15 @@ let item_heavy == let list_heavy := | list_kind = List; whitespace?; items = sequence(item_heavy); whitespace?; RIGHT_BRACE; - { Writer.map (fun items -> `List (list_kind, `Heavy, items)) items } + { + let warning = fun ~filename -> + let what = Tokens.describe @@ List list_kind in + let span = Parser_aux.to_location ~filename $sloc in + Parse_error.should_not_be_empty ~what span + in + Writer.ensure not_empty warning items + |> Writer.map (fun items -> `List (list_kind, `Heavy, items)) + } let list_element := | ~ = list_light; <> @@ -597,13 +605,45 @@ let media := let nestable_block_element := ~ = nestable_block_element_inner; newline?; <> let nestable_block_element_inner := - | v = Verbatim; { return (`Verbatim v) } + | v = Verbatim; + { + let what = Tokens.describe @@ Verbatim v in + let warning = fun ~filename -> + let span = Parser_aux.to_location ~filename $sloc in + Parse_error.should_not_be_empty ~what span + in + Writer.ensure has_content warning (return v) + |> Writer.map (fun v -> `Verbatim v) + } | items = sequence_nonempty(locatedM(inline_element)); { Writer.map (fun i -> `Paragraph i) items } | c = Code_block; { return (`Code_block c) } - | modules = located(Modules)+; { return (`Modules modules) } + (* TODO: HANDLE THIS IN LEXER - `Modules` should be a series of tokens *) + | modules = located(Modules); + { + let what = Tokens.describe @@ Modules (Loc.value modules) in + let warning = fun ~filename -> + let span = Parser_aux.to_location ~filename $sloc in + Parse_error.should_not_be_empty ~what span + in + let Loc.{ value; location } = modules in + Astring.String.fields value + |> List.map (Loc.at location) + |> return + |> Writer.ensure not_empty warning + |> Writer.map (fun ms -> `Modules ms) + } | ~ = list_element; <> | ~ = table; <> | ~ = media; <> - | m = Math_block; { return (`Math_block m) } + | m = Math_block; + { + let what = Tokens.describe @@ Math_block m in + let warning = fun ~filename -> + let span = Parser_aux.to_location ~filename $sloc in + Parse_error.should_not_be_empty ~what span + in + Writer.ensure has_content warning (return m) + |> Writer.map (fun m -> `Math_block m) + } From 4f1379cb6d1e830e36181fccc26e2c167a48c1b4 Mon Sep 17 00:00:00 2001 From: faycarsons Date: Wed, 27 Nov 2024 17:56:19 -0500 Subject: [PATCH 070/150] Squash merge modules into intermediate --- src/parser/lexer.mll | 56 ++++++++++++++++++++++++++++++------------- src/parser/parser.mly | 35 ++++++++++++++------------- src/parser/tokens.ml | 7 +++--- 3 files changed, 62 insertions(+), 36 deletions(-) diff --git a/src/parser/lexer.mll b/src/parser/lexer.mll index f6b35297b2..b4c390e290 100644 --- a/src/parser/lexer.mll +++ b/src/parser/lexer.mll @@ -144,9 +144,11 @@ type input = { mutable warnings : Warning.t list; } -(* TODO: Rewrite to add location inside tokens *) -let with_location_adjustments - k lexbuf input ?start_offset ?adjust_start_by ?end_offset ?adjust_end_by value = +let with_location_adjustments + = + fun + k lexbuf input ?start_offset ?adjust_start_by ?end_offset ?adjust_end_by + value -> let start = match start_offset with @@ -176,7 +178,16 @@ let with_location_adjustments in k input location value -let emit = +(* TODO: Fix this so it can take things besides tokens *) +let emit : + Lexing.lexbuf -> + input -> + ?start_offset:int -> + ?adjust_start_by:string -> + ?end_offset:int -> + ?adjust_end_by:string -> + 'a -> + 'a Loc.with_location = with_location_adjustments (fun _ -> Loc.at) let warning = @@ -429,8 +440,8 @@ and token input = parse { math Inline (Buffer.create 1024) 0 (Lexing.lexeme_start lexbuf) input lexbuf } - | "{!modules:" ([^ '}']* as modules) '}' - { emit lexbuf input (Modules modules) } + | "{!modules:" + { modules input [] lexbuf } | (media_start as start) { @@ -616,16 +627,6 @@ and token input = parse { warning lexbuf input Parse_error.stray_cr; token input lexbuf } - | "{!modules:" ([^ '}']* as modules) eof - { warning - lexbuf - input - ~start_offset:(Lexing.lexeme_end lexbuf) - (Parse_error.not_allowed - ~what:(Tokens.describe END) - ~in_what:(Tokens.describe (Modules ""))); - emit lexbuf input (Modules modules) } - and code_span buffer nesting_level start_offset input = parse | ']' { if nesting_level = 0 then @@ -712,6 +713,29 @@ and math kind buffer nesting_level start_offset input = parse { Buffer.add_char buffer c; math kind buffer nesting_level start_offset input lexbuf } +and modules input acc = parse + | delim_char* as c + { + let start = Lexing.lexeme_start lexbuf + and end_ = Lexing.lexeme_end lexbuf in + let location = Loc.{ + file = input.file; + start = input.offset_to_location start; + end_ = input.offset_to_location end_; + } in + let c = Loc.at location c in + modules input (c :: acc) lexbuf + } + | (' ' | '\t' | '\r' | '\n') + { + modules input acc lexbuf + } + | '}' + { + let ms = List.rev acc in + emit lexbuf input (Modules ms) + } + and media tok_descr buffer nesting_level start_offset input = parse | '}' { if nesting_level == 0 then diff --git a/src/parser/parser.mly b/src/parser/parser.mly index dd75607f86..e0b3061a60 100644 --- a/src/parser/parser.mly +++ b/src/parser/parser.mly @@ -159,7 +159,7 @@ %token Style %token Paragraph_style -%token Modules +%token Modules %token Math_span %token Math_block @@ -618,25 +618,14 @@ let nestable_block_element_inner := | items = sequence_nonempty(locatedM(inline_element)); { Writer.map (fun i -> `Paragraph i) items } | c = Code_block; { return (`Code_block c) } - (* TODO: HANDLE THIS IN LEXER - `Modules` should be a series of tokens *) - | modules = located(Modules); - { - let what = Tokens.describe @@ Modules (Loc.value modules) in - let warning = fun ~filename -> - let span = Parser_aux.to_location ~filename $sloc in - Parse_error.should_not_be_empty ~what span - in - let Loc.{ value; location } = modules in - Astring.String.fields value - |> List.map (Loc.at location) - |> return - |> Writer.ensure not_empty warning - |> Writer.map (fun ms -> `Modules ms) - } + | ~ = modules; <> | ~ = list_element; <> | ~ = table; <> | ~ = media; <> - | m = Math_block; + | ~ = math_block; <> + + + let math_block := m = Math_block; { let what = Tokens.describe @@ Math_block m in let warning = fun ~filename -> @@ -647,3 +636,15 @@ let nestable_block_element_inner := |> Writer.map (fun m -> `Math_block m) } + let modules := modules = Modules; + { + let what = Tokens.describe @@ Modules [] in + let warning = fun ~filename -> + let span = Parser_aux.to_location ~filename $sloc in + Parse_error.should_not_be_empty ~what span + in + return modules + |> Writer.ensure not_empty warning + |> Writer.map (fun ms -> `Modules ms) + } + diff --git a/src/parser/tokens.ml b/src/parser/tokens.ml index 5a96baf3a1..989a0a906b 100644 --- a/src/parser/tokens.ml +++ b/src/parser/tokens.ml @@ -18,7 +18,7 @@ type token = | Ref_with_replacement of string | Simple_link of string | Link_with_replacement of string - | Modules of string + | Modules of string Loc.with_location list | Media of (media * media_target) | Media_with_replacement of (media * media_target * string) | Math_span of string @@ -243,6 +243,7 @@ let of_media = function | `Media (_, Loc.{ value; _ }, _, kind) -> Media (of_href value, of_media_kind kind) +(* NOTE: Fix list *) let describe_nestable_block : Ast.nestable_block_element -> string = function | `Paragraph ws -> ( match ws with @@ -250,8 +251,8 @@ let describe_nestable_block : Ast.nestable_block_element -> string = function | [] -> describe @@ Word "") | `Code_block _ -> describe @@ Code_block empty_code_block | `Verbatim _ -> describe @@ Verbatim "" - | `Modules _ -> describe @@ Modules "" (* NOTE: Fix list *) - | `List (_, kind, _) -> describe @@ if kind = `Light then MINUS else DASH + | `Modules _ -> describe @@ Modules [] + | `List (_, _, _) -> "List" | `Table (_, kind) -> describe @@ if kind = `Light then TABLE_LIGHT else TABLE_HEAVY | `Math_block _ -> describe @@ Math_block "" From 9c8361649244176944213490f5a58ac90c978faf Mon Sep 17 00:00:00 2001 From: faycarsons Date: Wed, 27 Nov 2024 18:15:04 -0500 Subject: [PATCH 071/150] break nestable_block and inline_element up into smaller rules --- src/parser/parser.mly | 114 +++++++++++++++++++++++------------------- 1 file changed, 63 insertions(+), 51 deletions(-) diff --git a/src/parser/parser.mly b/src/parser/parser.mly index e0b3061a60..e3f18b0fa6 100644 --- a/src/parser/parser.mly +++ b/src/parser/parser.mly @@ -323,25 +323,33 @@ let tag_bare := let inline_element := | s = horizontal_whitespace; { return s } - | w = Word; { return @@ `Word w } - | c = Code_span; { return @@ `Code_span c } - | m = Raw_markup; { return @@ `Raw_markup m } - | style = Style; children = sequence_nonempty(locatedM(inline_element)); RIGHT_BRACE; - { Writer.map (fun c -> `Styled (style, c)) children } - | style = Style; RIGHT_BRACE; - { - let location = Parser_aux.to_location $sloc in - let node = `Styled (style, [ Loc.at location (`Word "") ]) in - let what = Tokens.describe @@ Style style in + | ~ = code_span; <> + | ~ = raw_markup; <> + | ~ = word; <> + | ~ = style; <> + | ~ = math_span; <> + | ~ = reference; <> + | ~ = link; <> + + +let code_span := c = Code_span; { return @@ `Code_span c } +let raw_markup := m = Raw_markup; { return @@ `Raw_markup m } +let word := w = Word; { return @@ `Word w } + +let style := style = Style; children = sequence(locatedM(inline_element)); RIGHT_BRACE; + { let warning = fun ~filename -> let span = Parser_aux.to_location ~filename $sloc in + let what = Tokens.describe @@ Style style in Parse_error.should_not_be_empty ~what span in - Writer.with_warning node warning - } - | m = Math_span; { return @@ `Math_span m } - | ~ = reference; <> - | ~ = link; <> + Writer.ensure not_empty warning children + |> Writer.map (fun c -> `Styled (style, c)) + } + +let math_span := m = Math_span; { return @@ `Math_span m } + +(* LINKS + REFS *) let reference := | ref_body = located(Simple_ref); children = sequence(locatedM(inline_element)); @@ -605,46 +613,50 @@ let media := let nestable_block_element := ~ = nestable_block_element_inner; newline?; <> let nestable_block_element_inner := - | v = Verbatim; - { - let what = Tokens.describe @@ Verbatim v in - let warning = fun ~filename -> - let span = Parser_aux.to_location ~filename $sloc in - Parse_error.should_not_be_empty ~what span - in - Writer.ensure has_content warning (return v) - |> Writer.map (fun v -> `Verbatim v) - } - | items = sequence_nonempty(locatedM(inline_element)); - { Writer.map (fun i -> `Paragraph i) items } - | c = Code_block; { return (`Code_block c) } + | ~ = verbatim; <> + | ~ = code_block; <> | ~ = modules; <> | ~ = list_element; <> | ~ = table; <> | ~ = media; <> | ~ = math_block; <> - - let math_block := m = Math_block; - { - let what = Tokens.describe @@ Math_block m in - let warning = fun ~filename -> - let span = Parser_aux.to_location ~filename $sloc in - Parse_error.should_not_be_empty ~what span - in - Writer.ensure has_content warning (return m) - |> Writer.map (fun m -> `Math_block m) - } - - let modules := modules = Modules; - { - let what = Tokens.describe @@ Modules [] in - let warning = fun ~filename -> - let span = Parser_aux.to_location ~filename $sloc in - Parse_error.should_not_be_empty ~what span - in - return modules - |> Writer.ensure not_empty warning - |> Writer.map (fun ms -> `Modules ms) - } +let verbatim := v = Verbatim; + { + let what = Tokens.describe @@ Verbatim v in + let warning = fun ~filename -> + let span = Parser_aux.to_location ~filename $sloc in + Parse_error.should_not_be_empty ~what span + in + Writer.ensure has_content warning (return v) + |> Writer.map (fun v -> `Verbatim v) + } + +let paragraph := items = sequence_nonempty(locatedM(inline_element)); + { Writer.map (fun i -> `Paragraph i) items } + +let code_block := c = Code_block; { return (`Code_block c) } + +let math_block := m = Math_block; + { + let what = Tokens.describe @@ Math_block m in + let warning = fun ~filename -> + let span = Parser_aux.to_location ~filename $sloc in + Parse_error.should_not_be_empty ~what span + in + Writer.ensure has_content warning (return m) + |> Writer.map (fun m -> `Math_block m) + } + +let modules := modules = Modules; + { + let what = Tokens.describe @@ Modules [] in + let warning = fun ~filename -> + let span = Parser_aux.to_location ~filename $sloc in + Parse_error.should_not_be_empty ~what span + in + return modules + |> Writer.ensure not_empty warning + |> Writer.map (fun ms -> `Modules ms) + } From eb62ed35599ffc2060e52bb1c6030f6e78741a66 Mon Sep 17 00:00:00 2001 From: faycarsons Date: Mon, 2 Dec 2024 12:10:22 -0500 Subject: [PATCH 072/150] Explain and pretty print failures in tester --- src/parser/parser.mly | 60 ++++++++++------ src/parser/test_driver/tester.ml | 114 ++++++++++++++++++++++++------- 2 files changed, 128 insertions(+), 46 deletions(-) diff --git a/src/parser/parser.mly b/src/parser/parser.mly index e3f18b0fa6..58a3a01423 100644 --- a/src/parser/parser.mly +++ b/src/parser/parser.mly @@ -222,10 +222,18 @@ let separated_sequence_nonempty_inner(separator, X) := let separated_sequence_nonempty(sep, rule) == xs = separated_sequence_nonempty_inner(sep, rule); { Writer.map List.rev xs } -(* ENTRY-POINT *) +let horizontal_whitespace := + | SPACE; { `Space " " } + | ~ = Space; <`Space> + +let newline := NEWLINE; { "\n" } | ~ = Single_newline; <> + +let whitespace := + | ~ = horizontal_whitespace; <> + | ~ = newline; <`Space> + +(* ENTRY *) -(* A comment can either contain block elements, block elements and then tags, or - just tags, but not tags and then block elements *) let main := | nodes = sequence_nonempty(locatedM(toplevel)); whitespace?; END; { nodes } | whitespace?; END; { Writer.return @@ [] } @@ -235,23 +243,22 @@ let toplevel := | ~ = tag; <> | ~ = heading; <> -let horizontal_whitespace := - | SPACE; { `Space " " } - | ~ = Space; <`Space> - -let newline := NEWLINE; {} | Single_newline; {} - -let whitespace := - | horizontal_whitespace; {} - | newline; {} +(* SECTION HEADING *) let heading := - | (num, title) = Section_heading; children = sequence_nonempty(locatedM(inline_element)); RIGHT_BRACE; + | (num, title) = Section_heading; children = sequence(locatedM(inline_element)); RIGHT_BRACE; { - let* children = children in - return @@ `Heading (num, title, children) + let warning = fun ~filename -> + let span = Parser_aux.to_location ~filename $sloc in + let what = Tokens.describe @@ Section_heading (num, title) in + Parse_error.should_not_be_empty ~what span + in + Writer.ensure not_empty warning children + |> Writer.map (fun c -> `Heading (num, title, c)) } +(* TAGS *) + let tag == | t = tag_with_content; { Writer.map tag t } | t = tag_bare; { Writer.map tag t } @@ -331,12 +338,12 @@ let inline_element := | ~ = reference; <> | ~ = link; <> - let code_span := c = Code_span; { return @@ `Code_span c } let raw_markup := m = Raw_markup; { return @@ `Raw_markup m } let word := w = Word; { return @@ `Word w } -let style := style = Style; children = sequence(locatedM(inline_element)); RIGHT_BRACE; +let style := + | style = Style; children = sequence(locatedM(inline_element)); RIGHT_BRACE; { let warning = fun ~filename -> let span = Parser_aux.to_location ~filename $sloc in @@ -346,6 +353,15 @@ let style := style = Style; children = sequence(locatedM(inline_element)); RIGHT Writer.ensure not_empty warning children |> Writer.map (fun c -> `Styled (style, c)) } + | style = Style; RIGHT_BRACE; + { + let warning = fun ~filename -> + let span = Parser_aux.to_location ~filename $sloc in + let what = Tokens.describe @@ Style style in + Parse_error.should_not_be_empty ~what span + in + Writer.with_warning (`Styled (style, [])) warning + } let math_span := m = Math_span; { return @@ `Math_span m } @@ -619,7 +635,8 @@ let nestable_block_element_inner := | ~ = list_element; <> | ~ = table; <> | ~ = media; <> - | ~ = math_block; <> + | ~ = math_block; <> + | ~ = paragraph; <> let verbatim := v = Verbatim; { @@ -632,8 +649,11 @@ let verbatim := v = Verbatim; |> Writer.map (fun v -> `Verbatim v) } -let paragraph := items = sequence_nonempty(locatedM(inline_element)); - { Writer.map (fun i -> `Paragraph i) items } +let paragraph := + | items = sequence_nonempty(locatedM(inline_element)); + { Writer.map (fun i -> `Paragraph i) items } + | located(error); + { return @@ `Paragraph [] } let code_block := c = Code_block; { return (`Code_block c) } diff --git a/src/parser/test_driver/tester.ml b/src/parser/test_driver/tester.ml index a9ceb302c5..6462163605 100644 --- a/src/parser/test_driver/tester.ml +++ b/src/parser/test_driver/tester.ml @@ -1,6 +1,7 @@ open! Odoc_parser_test module Loc = Odoc_parser.Loc +module Parser = Odoc_parser.Tester (* NOTE (@FayCarsons): for anyone working on this parser - this is probably @@ -35,7 +36,7 @@ let error_recovery = ("'{li' not followed by whitespace", "{ol {lifoo bar baz} }"); ("End not allowed in table", "{t "); ("Empty @author", "@author"); - ("Empty @version" , "@version") + ("Empty @version", "@version"); ] let see = [ ("See", "@see bar baz quux\n") ] @@ -91,29 +92,84 @@ let parser_output formatter (ast, warnings) = Sexplib0.Sexp.pp_hum formatter output; Format.pp_print_flush formatter () -let run_test (failed : ( string * string ) list) (label, case) = - let module Parser = Odoc_parser.Tester in - let unwrap_token lexbuf = - let open Parser in - let Loc.{ value; _ } = token lexbuf in - value - in +type failure = { + exn : string; + label : string; + offending_token : Parser.token; + area : string; +} + +let get_area Loc.{ start; end_; _ } input = + String.split_on_char '\n' input + |> List.filteri (fun idx _ -> idx >= pred start.line && idx <= pred end_.line) + |> List.fold_left ( ^ ) "" + +module TokBuf = struct + type t = { + tokens : Parser.token Loc.with_location Stream.t; + mutable idx : int; + mutable cache : Parser.token Loc.with_location list; + } + + let cache x self = self.cache <- x :: self.cache + + let create lexbuf = + let rec fill acc = + let (Loc.{ value; _ } as loc) = Parser.token lexbuf in + if Parser.is_EOI value then List.rev @@ (loc :: acc) else fill (loc :: acc) + in + + let tokens = fill [] in + { tokens = Stream.of_list tokens; idx = 0; cache = [] } + + let next self = + let (Loc.{ value = token; _ } as loc) = Stream.next self.tokens in + self.idx <- succ self.idx; + cache loc self; + token + + let failure self label input exn = + let Loc.{ value; location } = List.hd self.cache in + { exn; label; offending_token = value; area = get_area location input } +end + +let run_test (label, case) = + let open Either in let lexbuf = Lexing.from_string case in - Lexing.set_filename lexbuf "TESTER"; + Lexing.set_filename lexbuf "Tester"; + let tokens = TokBuf.create lexbuf in try - let intermediate = - Odoc_parser.Tester.main (fun lexbuf -> unwrap_token lexbuf) lexbuf + let ast, warnings = + Parser.run ~filename:"Tester" + @@ Parser.main (fun _ -> TokBuf.next tokens) lexbuf in - let ast, warnings = Parser.run ~filename:"" intermediate in - Format.printf "%a\n" parser_output (ast, warnings); - print_newline (); - Printf.printf "Got %d warnings \n%!" (List.length warnings); - Printf.printf "Warnings:\n%s\n%!" - (List.fold_left - (fun acc warning -> acc ^ "\n" ^ Parser.pp_warning warning) - "" warnings); - failed - with _ -> ( case, label ) :: failed + let output = Format.asprintf "%a" parser_output (ast, warnings) in + Left (label, output) + with e -> + let exns = Printexc.to_string e in + Right (TokBuf.failure tokens label case exns) + +let sep = String.init 80 @@ Fun.const '-' + +let format_successes = + let go acc (label, output) = + Printf.sprintf "%s%s:\n%s\n%s\n" acc label output sep + in + List.fold_left go "" + +let failure_string { exn; label; offending_token; area } = + Printf.sprintf {|%s failed with exn: %s +failed on token: +'%s' +input: +"%s"|} + label exn + (Parser.string_of_token offending_token) + area + +let format_failures = + let go acc x = acc ^ "\n" ^ sep ^ "\n" ^ x in + List.fold_left go "" let () = let cases = @@ -128,8 +184,14 @@ let () = documentation_cases) else documentation_cases in - List.fold_left run_test [] cases - |> List.iter (fun (label, case) -> - Printf.printf - "Failure: %s\nInput:\n%s\n%!" - label case) + let sucesses, failures = List.partition_map run_test cases in + let sucesses = format_successes sucesses in + let failures = format_failures @@ List.map failure_string failures in + Printf.printf + {| ----SUCCESS---- +%s + + ----FAILURE---- + %s + |} + sucesses failures From e0e0ca67174a450f011398d8754e1708c6809eaa Mon Sep 17 00:00:00 2001 From: faycarsons Date: Mon, 2 Dec 2024 12:28:07 -0500 Subject: [PATCH 073/150] Annotate tokens in parser --- src/parser/lexer.mll | 12 +++--- src/parser/parser.mly | 89 ++++++++++++++++++++++++------------------- 2 files changed, 55 insertions(+), 46 deletions(-) diff --git a/src/parser/lexer.mll b/src/parser/lexer.mll index b4c390e290..2e197f0a38 100644 --- a/src/parser/lexer.mll +++ b/src/parser/lexer.mll @@ -529,8 +529,7 @@ and token input = parse { emit lexbuf input (Table_cell `Data) } | '{' (['0'-'9']+ as level) ':' (([^ '}'] # space_char)* as label) - { emit lexbuf - input (Section_heading (heading_level lexbuf input level, Some label)) } + { emit lexbuf input (Section_heading (heading_level lexbuf input level, Some label)) } | '{' (['0'-'9']+ as level) { emit lexbuf input (Section_heading (heading_level lexbuf input level, None)) } @@ -645,12 +644,11 @@ and code_span buffer nesting_level start_offset input = parse code_span buffer nesting_level start_offset input lexbuf } | newline horizontal_space* (newline horizontal_space*)+ - { warning - lexbuf - input - (Parse_error.not_allowed + { let w = (Parse_error.not_allowed ~what:(Tokens.describe (Blank_line "\n\n")) - ~in_what:(Tokens.describe (Code_span ""))); + ~in_what:(Tokens.describe (Code_span ""))) + in + warning lexbuf input w; Buffer.add_char buffer ' '; code_span buffer nesting_level start_offset input lexbuf } | newline horizontal_space* diff --git a/src/parser/parser.mly b/src/parser/parser.mly index 58a3a01423..eed273e1b8 100644 --- a/src/parser/parser.mly +++ b/src/parser/parser.mly @@ -145,60 +145,71 @@ %} %token SPACE NEWLINE -%token RIGHT_BRACE -%token RIGHT_CODE_DELIMITER +%token RIGHT_BRACE "{" +%token RIGHT_CODE_DELIMITER "{[" %token Blank_line %token Single_newline -%token Space +%token Space " " -%token Word +%token Word (* Any space-delmited text *) -%token MINUS PLUS BAR +%token MINUS "-" +%token PLUS "+" -%token Style -%token Paragraph_style +%token Style "{i" (* or '{b' etc *) -%token Modules +(* or '{C' or '{R', but this syntax has been deprecated and is only kept around so legacy codebases don't break :p *) +%token Paragraph_style "{L" -%token Math_span -%token Math_block +%token Modules "{!modules:" -%token Raw_markup +%token Math_span "{m" +%token Math_block "{math" -%token Code_block -%token Code_span +%token Raw_markup "{%%}" -%token List -%token LI DASH +%token Code_block "{[]}" +%token Code_span "[]" -%token TABLE_LIGHT -%token TABLE_HEAVY -%token TABLE_ROW -%token Table_cell +%token List "{ol" (* or '{ul' *) +%token LI "{li" +%token DASH "{-" -%token Section_heading +%token TABLE_LIGHT "{t" +%token TABLE_HEAVY "{table" +%token TABLE_ROW "{tr" +%token BAR "|" + +%token Table_cell "{td" (* or '{th' for header *) + +(* Where N is an integer *) +%token Section_heading "{N:" (* Tags *) -%token Author -%token DEPRECATED -%token Param -%token Raise -%token RETURN -%token <[ `Url | `File | `Document ] * string> See -%token Since -%token Before -%token Version -%token Canonical -%token INLINE OPEN CLOSED HIDDEN - -%token Simple_ref -%token Ref_with_replacement -%token Simple_link -%token Link_with_replacement -%token Media -%token Media_with_replacement -%token Verbatim +%token Author "@author" +%token DEPRECATED "@deprecated" +%token Param "@param" +%token Raise "@raise(s)" +%token RETURN "@return" +%token <[ `Url | `File | `Document ] * string> See "@see" +%token Since "@since" +%token Before "@before" +%token Version "@version" +%token Canonical "@canonical" +%token INLINE "@inline" +%token OPEN "@open" +%token CLOSED "@closed" +%token HIDDEN "@hidden" + +%token Simple_ref "{!" +%token Ref_with_replacement "{{!" +%token Simple_link "{:" +%token Link_with_replacement "{{:" +%token Media "{(format)!" +(* where 'format' is audio, video, image *) +%token Media_with_replacement "{(format):" +%token Verbatim "{v" %token END %start main From e28d6dde6c1ee7d2d48f8292b70d589682efecd4 Mon Sep 17 00:00:00 2001 From: faycarsons Date: Mon, 2 Dec 2024 13:10:08 -0500 Subject: [PATCH 074/150] Tester includes lexer warnings --- src/parser/lexer.mll | 5 +++-- src/parser/odoc_parser.ml | 19 +++++++------------ src/parser/odoc_parser.mli | 18 ++++++++++++++++-- src/parser/test_driver/tester.ml | 22 ++++++++++++++++++---- 4 files changed, 44 insertions(+), 20 deletions(-) diff --git a/src/parser/lexer.mll b/src/parser/lexer.mll index 2e197f0a38..a12d991243 100644 --- a/src/parser/lexer.mll +++ b/src/parser/lexer.mll @@ -644,10 +644,11 @@ and code_span buffer nesting_level start_offset input = parse code_span buffer nesting_level start_offset input lexbuf } | newline horizontal_space* (newline horizontal_space*)+ - { let w = (Parse_error.not_allowed + { let w = Parse_error.not_allowed ~what:(Tokens.describe (Blank_line "\n\n")) - ~in_what:(Tokens.describe (Code_span ""))) + ~in_what:(Tokens.describe (Code_span "")) in + print_endline "In CODE_BLOCK_BLANK_LINE"; warning lexbuf input w; Buffer.add_char buffer ' '; code_span buffer nesting_level start_offset input lexbuf } diff --git a/src/parser/odoc_parser.ml b/src/parser/odoc_parser.ml index 2a6699fcd2..06e545b827 100644 --- a/src/parser/odoc_parser.ml +++ b/src/parser/odoc_parser.ml @@ -81,19 +81,13 @@ let offset_to_location : module Tester = struct include Parser + module Lexer = Lexer + type token = Tokens.token let is_EOI = function Tokens.END -> true | _ -> false let pp_warning = Warning.to_string let run = Writer.run - let token = - let dummy_loc = - Lexer. - { - warnings = []; - file = "f.ml"; - offset_to_location = Fun.const Loc.{ line = 1; column = 0 }; - } - in - Lexer.token dummy_loc + let reversed_newlines = reversed_newlines + let offset_to_location = offset_to_location let string_of_token = Tokens.describe end @@ -118,13 +112,14 @@ let position_of_point : t -> Loc.point -> Lexing.position = { Lexing.pos_bol; pos_lnum; pos_cnum; pos_fname } (* The main entry point for this module *) -let parse_comment ~location ~text = +let parse_comment : location:Lexing.position -> text:string -> t = + fun ~location ~text -> let reversed_newlines = reversed_newlines ~input:text in let lexbuf = Lexing.from_string text in (* We cannot directly pass parameters to Menhir without converting our parser to a module functor. So we pass our current filename to the lexbuf here *) lexbuf.lex_curr_p <- - { lexbuf.lex_curr_p with pos_fname = Lexing.(location.pos_fname) }; + { lexbuf.lex_curr_p with pos_fname = location.Lexing.pos_fname }; let lexer_state = Lexer. { diff --git a/src/parser/odoc_parser.mli b/src/parser/odoc_parser.mli index dc98f35618..5310fb2247 100644 --- a/src/parser/odoc_parser.mli +++ b/src/parser/odoc_parser.mli @@ -20,11 +20,25 @@ module Ast = Ast module Loc = Loc module Tester : sig - type token + type token = Tokens.token + module Lexer : sig + type input = { + file : string; + offset_to_location : int -> Loc.point; + mutable warnings : Warning.t list; + } + + val token : input -> Lexing.lexbuf -> Parser.token Loc.with_location + end val is_EOI : token -> bool val pp_warning : Warning.t -> string + val reversed_newlines : input:string -> (int * int) list + val offset_to_location : + reversed_newlines:(int * int) list -> + comment_location:Lexing.position -> + int -> + Loc.point val run : filename:string -> Ast.t Writer.t -> Ast.t * Warning.t list - val token : Lexing.lexbuf -> token Loc.with_location val main : (Lexing.lexbuf -> token) -> Lexing.lexbuf -> Ast.t Writer.t val string_of_token : token -> string end diff --git a/src/parser/test_driver/tester.ml b/src/parser/test_driver/tester.ml index 6462163605..3ae47f8052 100644 --- a/src/parser/test_driver/tester.ml +++ b/src/parser/test_driver/tester.ml @@ -37,6 +37,7 @@ let error_recovery = ("End not allowed in table", "{t "); ("Empty @author", "@author"); ("Empty @version", "@version"); + ("Code block w/ blank line", "[\n\n]"); ] let see = [ ("See", "@see bar baz quux\n") ] @@ -113,9 +114,9 @@ module TokBuf = struct let cache x self = self.cache <- x :: self.cache - let create lexbuf = + let create ~input ~lexbuf = let rec fill acc = - let (Loc.{ value; _ } as loc) = Parser.token lexbuf in + let (Loc.{ value; _ } as loc) = Parser.Lexer.token input lexbuf in if Parser.is_EOI value then List.rev @@ (loc :: acc) else fill (loc :: acc) in @@ -135,14 +136,27 @@ end let run_test (label, case) = let open Either in + let reversed_newlines = Parser.reversed_newlines ~input:case in let lexbuf = Lexing.from_string case in - Lexing.set_filename lexbuf "Tester"; - let tokens = TokBuf.create lexbuf in + let file = "Tester" in + lexbuf.lex_curr_p <- { lexbuf.lex_curr_p with pos_fname = file }; + let input = + Parser.Lexer. + { + warnings = []; + offset_to_location = + Parser.offset_to_location ~reversed_newlines + ~comment_location:lexbuf.lex_curr_p; + file; + } + in + let tokens = TokBuf.create ~input ~lexbuf in try let ast, warnings = Parser.run ~filename:"Tester" @@ Parser.main (fun _ -> TokBuf.next tokens) lexbuf in + let warnings = warnings @ input.warnings in let output = Format.asprintf "%a" parser_output (ast, warnings) in Left (label, output) with e -> From 2572423210816a0ac54547938d6fbed205bfd049 Mon Sep 17 00:00:00 2001 From: faycarsons Date: Mon, 2 Dec 2024 13:34:59 -0500 Subject: [PATCH 075/150] Error recovery for unexpected EOI working --- src/parser/parser.mly | 11 +++++++- src/parser/test_driver/tester.ml | 47 ++++++++++++++++++++++++-------- 2 files changed, 45 insertions(+), 13 deletions(-) diff --git a/src/parser/parser.mly b/src/parser/parser.mly index eed273e1b8..d8fea236ee 100644 --- a/src/parser/parser.mly +++ b/src/parser/parser.mly @@ -373,6 +373,15 @@ let style := in Writer.with_warning (`Styled (style, [])) warning } + | style = Style; END; + { + let warning = fun ~filename -> + let span = Parser_aux.to_location ~filename $sloc in + let in_what = Tokens.describe @@ Style style in + Parse_error.end_not_allowed ~in_what span + in + Writer.with_warning (`Styled (style, [])) warning + } let math_span := m = Math_span; { return @@ `Math_span m } @@ -592,7 +601,7 @@ let table_light := (* If there's nothing inside, return an empty table *) | TABLE_LIGHT; whitespace?; RIGHT_BRACE; { return @@ `Table (([[]], None), `Light) } - | TABLE_LIGHT; whitespace?; END; + | TABLE_LIGHT; END; { let in_what = Tokens.describe TABLE_LIGHT in let warning = fun ~filename -> diff --git a/src/parser/test_driver/tester.ml b/src/parser/test_driver/tester.ml index 3ae47f8052..0376fc9007 100644 --- a/src/parser/test_driver/tester.ml +++ b/src/parser/test_driver/tester.ml @@ -28,6 +28,9 @@ let code_cases = let error_recovery = [ + ("Empty @author", "@author"); + ("Empty @version", "@version"); + ("Code block w/ blank line", "[\n\n]"); ("Empty style", "{i}"); ("Empty ref", "{{!www.google.com}}"); ("Empty link", "{{:www.google.com}}"); @@ -35,9 +38,7 @@ let error_recovery = ("Empty list item", "{ol {li} }"); ("'{li' not followed by whitespace", "{ol {lifoo bar baz} }"); ("End not allowed in table", "{t "); - ("Empty @author", "@author"); - ("Empty @version", "@version"); - ("Code block w/ blank line", "[\n\n]"); + ("Nothing", "{i"); ] let see = [ ("See", "@see bar baz quux\n") ] @@ -98,6 +99,7 @@ type failure = { label : string; offending_token : Parser.token; area : string; + tokens : Parser.token list; } let get_area Loc.{ start; end_; _ } input = @@ -119,19 +121,34 @@ module TokBuf = struct let (Loc.{ value; _ } as loc) = Parser.Lexer.token input lexbuf in if Parser.is_EOI value then List.rev @@ (loc :: acc) else fill (loc :: acc) in - let tokens = fill [] in { tokens = Stream.of_list tokens; idx = 0; cache = [] } let next self = - let (Loc.{ value = token; _ } as loc) = Stream.next self.tokens in + print_endline "Attempting to get next token"; + let (Loc.{ value = token; _ } as loc) = + try Stream.next self.tokens with _ -> List.hd self.cache + in + print_endline @@ "Got token: " ^ Parser.string_of_token token; self.idx <- succ self.idx; cache loc self; token let failure self label input exn = let Loc.{ value; location } = List.hd self.cache in - { exn; label; offending_token = value; area = get_area location input } + let tokens = + let rec go acc = + try go @@ (Stream.next self.tokens :: acc) with _ -> acc + in + List.map Loc.value @@ List.rev @@ go [] @ self.cache + in + { + exn; + label; + offending_token = value; + tokens; + area = get_area location input; + } end let run_test (label, case) = @@ -171,15 +188,21 @@ let format_successes = in List.fold_left go "" -let failure_string { exn; label; offending_token; area } = - Printf.sprintf {|%s failed with exn: %s -failed on token: +let failure_string { exn; label; offending_token; tokens; area } = + Printf.sprintf + {|>>> Case '%s' failed with exn: <<< +%s +offending token: '%s' input: -"%s"|} +"%s" +tokens:%s|} label exn (Parser.string_of_token offending_token) area + (List.fold_left + (fun acc t -> Printf.sprintf "%s\n<%s>" acc (Parser.string_of_token t)) + "" tokens) let format_failures = let go acc x = acc ^ "\n" ^ sep ^ "\n" ^ x in @@ -202,10 +225,10 @@ let () = let sucesses = format_successes sucesses in let failures = format_failures @@ List.map failure_string failures in Printf.printf - {| ----SUCCESS---- + {|<<<<----SUCCESS---->>>> %s - ----FAILURE---- +<<<<----FAILURE---->>>> %s |} sucesses failures From e7851a6830beb428465a9c9003a5a9625b29024a Mon Sep 17 00:00:00 2001 From: faycarsons Date: Tue, 3 Dec 2024 11:09:19 -0500 Subject: [PATCH 076/150] Tester with working locations --- src/parser/test_driver/tester.ml | 71 +++++++------------------------- 1 file changed, 15 insertions(+), 56 deletions(-) diff --git a/src/parser/test_driver/tester.ml b/src/parser/test_driver/tester.ml index 0376fc9007..c214762924 100644 --- a/src/parser/test_driver/tester.ml +++ b/src/parser/test_driver/tester.ml @@ -37,7 +37,7 @@ let error_recovery = ("List item not at beginning of line", "- foo\n - bar\n- baz"); ("Empty list item", "{ol {li} }"); ("'{li' not followed by whitespace", "{ol {lifoo bar baz} }"); - ("End not allowed in table", "{t "); + ("End not allowed in table", "{t \n| -- | :--: |\n| a | b \n"); ("Nothing", "{i"); ] @@ -99,7 +99,6 @@ type failure = { label : string; offending_token : Parser.token; area : string; - tokens : Parser.token list; } let get_area Loc.{ start; end_; _ } input = @@ -107,56 +106,16 @@ let get_area Loc.{ start; end_; _ } input = |> List.filteri (fun idx _ -> idx >= pred start.line && idx <= pred end_.line) |> List.fold_left ( ^ ) "" -module TokBuf = struct - type t = { - tokens : Parser.token Loc.with_location Stream.t; - mutable idx : int; - mutable cache : Parser.token Loc.with_location list; - } - - let cache x self = self.cache <- x :: self.cache - - let create ~input ~lexbuf = - let rec fill acc = - let (Loc.{ value; _ } as loc) = Parser.Lexer.token input lexbuf in - if Parser.is_EOI value then List.rev @@ (loc :: acc) else fill (loc :: acc) - in - let tokens = fill [] in - { tokens = Stream.of_list tokens; idx = 0; cache = [] } - - let next self = - print_endline "Attempting to get next token"; - let (Loc.{ value = token; _ } as loc) = - try Stream.next self.tokens with _ -> List.hd self.cache - in - print_endline @@ "Got token: " ^ Parser.string_of_token token; - self.idx <- succ self.idx; - cache loc self; - token - - let failure self label input exn = - let Loc.{ value; location } = List.hd self.cache in - let tokens = - let rec go acc = - try go @@ (Stream.next self.tokens :: acc) with _ -> acc - in - List.map Loc.value @@ List.rev @@ go [] @ self.cache - in - { - exn; - label; - offending_token = value; - tokens; - area = get_area location input; - } -end +let mkfailure label input exn last = + let Loc.{ value; location } = last in + { offending_token = value; exn; label; area = get_area location input } let run_test (label, case) = let open Either in let reversed_newlines = Parser.reversed_newlines ~input:case in let lexbuf = Lexing.from_string case in let file = "Tester" in - lexbuf.lex_curr_p <- { lexbuf.lex_curr_p with pos_fname = file }; + Lexing.set_filename lexbuf file; let input = Parser.Lexer. { @@ -167,18 +126,22 @@ let run_test (label, case) = file; } in - let tokens = TokBuf.create ~input ~lexbuf in + let last_tok = ref None in + let get_tok lexbuf = + let tok = Parser.Lexer.token input lexbuf in + last_tok := Some tok; + Loc.value tok + in try let ast, warnings = - Parser.run ~filename:"Tester" - @@ Parser.main (fun _ -> TokBuf.next tokens) lexbuf + Parser.run ~filename:"Tester" @@ Parser.main get_tok lexbuf in let warnings = warnings @ input.warnings in let output = Format.asprintf "%a" parser_output (ast, warnings) in Left (label, output) with e -> let exns = Printexc.to_string e in - Right (TokBuf.failure tokens label case exns) + Right (mkfailure label case exns @@ Option.get !last_tok) let sep = String.init 80 @@ Fun.const '-' @@ -188,21 +151,17 @@ let format_successes = in List.fold_left go "" -let failure_string { exn; label; offending_token; tokens; area } = +let failure_string { exn; label; offending_token; area } = Printf.sprintf {|>>> Case '%s' failed with exn: <<< %s offending token: '%s' input: -"%s" -tokens:%s|} +"%s"|} label exn (Parser.string_of_token offending_token) area - (List.fold_left - (fun acc t -> Printf.sprintf "%s\n<%s>" acc (Parser.string_of_token t)) - "" tokens) let format_failures = let go acc x = acc ^ "\n" ^ sep ^ "\n" ^ x in From 7fda51ce82a2e602759d9bcff3c96d048d7fb95e Mon Sep 17 00:00:00 2001 From: faycarsons Date: Wed, 4 Dec 2024 10:26:11 -0500 Subject: [PATCH 077/150] Trim start of paragraphs, handle EOF in `{!modules: ...}` --- .ocamlformat | 2 +- src/parser/lexer.mll | 7 ++++++- src/parser/parser.mly | 14 +++++++++----- 3 files changed, 16 insertions(+), 7 deletions(-) diff --git a/.ocamlformat b/.ocamlformat index 5626d8362a..9d9e841212 100644 --- a/.ocamlformat +++ b/.ocamlformat @@ -1,3 +1,3 @@ module-item-spacing=preserve -version=0.26.1 +version=0.26.2 ocaml-version=4.02 diff --git a/src/parser/lexer.mll b/src/parser/lexer.mll index a12d991243..22c4178ccb 100644 --- a/src/parser/lexer.mll +++ b/src/parser/lexer.mll @@ -648,7 +648,6 @@ and code_span buffer nesting_level start_offset input = parse ~what:(Tokens.describe (Blank_line "\n\n")) ~in_what:(Tokens.describe (Code_span "")) in - print_endline "In CODE_BLOCK_BLANK_LINE"; warning lexbuf input w; Buffer.add_char buffer ' '; code_span buffer nesting_level start_offset input lexbuf } @@ -734,6 +733,12 @@ and modules input acc = parse let ms = List.rev acc in emit lexbuf input (Modules ms) } + | eof + { + let wg = Parse_error.end_not_allowed ~in_what:(Tokens.describe @@ Modules []) in + warning lexbuf input wg; + emit lexbuf input (Modules (List.rev acc)) + } and media tok_descr buffer nesting_level start_offset input = parse | '}' diff --git a/src/parser/parser.mly b/src/parser/parser.mly index d8fea236ee..cdfdb9ed47 100644 --- a/src/parser/parser.mly +++ b/src/parser/parser.mly @@ -142,6 +142,10 @@ (Loc.at location media, target, content) let tag t = `Tag t + + let trim_start = function + | Loc.{value = `Space _; _ } :: xs -> xs + | xs -> xs %} %token SPACE NEWLINE @@ -265,7 +269,7 @@ let heading := Parse_error.should_not_be_empty ~what span in Writer.ensure not_empty warning children - |> Writer.map (fun c -> `Heading (num, title, c)) + |> Writer.map (fun c -> `Heading (num, title, trim_start c)) } (* TAGS *) @@ -362,7 +366,7 @@ let style := Parse_error.should_not_be_empty ~what span in Writer.ensure not_empty warning children - |> Writer.map (fun c -> `Styled (style, c)) + |> Writer.map (fun c -> `Styled (style, trim_start c)) } | style = Style; RIGHT_BRACE; { @@ -392,13 +396,13 @@ let reference := { let+ children = children in let ref_body = Loc.nudge_map_start (String.length "{!") ref_body in - `Reference (`Simple, ref_body, children) + `Reference (`Simple, ref_body, trim_start children) } | ref_body = located(Ref_with_replacement); children = sequence_nonempty(locatedM(inline_element)); RIGHT_BRACE; { let+ children = children in let ref_body = Loc.nudge_map_start (String.length "{{!") ref_body in - `Reference (`With_text, ref_body, children) + `Reference (`With_text, ref_body, trim_start children) } | ref_body = located(Ref_with_replacement); RIGHT_BRACE; { @@ -671,7 +675,7 @@ let verbatim := v = Verbatim; let paragraph := | items = sequence_nonempty(locatedM(inline_element)); - { Writer.map (fun i -> `Paragraph i) items } + { Writer.map (fun i -> `Paragraph (trim_start i)) items } | located(error); { return @@ `Paragraph [] } From cbec3b200e3504d1f918d4778ef9144e8feb026d Mon Sep 17 00:00:00 2001 From: faycarsons Date: Wed, 4 Dec 2024 15:05:04 -0500 Subject: [PATCH 078/150] Tests fixed --- src/parser/lexer.mll | 31 +------ src/parser/parser.mly | 137 +++++++++++++++++++++++++------ src/parser/test_driver/tester.ml | 13 +-- src/parser/tokens.ml | 8 +- 4 files changed, 122 insertions(+), 67 deletions(-) diff --git a/src/parser/lexer.mll b/src/parser/lexer.mll index 22c4178ccb..58d366776d 100644 --- a/src/parser/lexer.mll +++ b/src/parser/lexer.mll @@ -441,7 +441,7 @@ and token input = parse | "{!modules:" - { modules input [] lexbuf } + { emit lexbuf input MODULES } | (media_start as start) { @@ -711,35 +711,6 @@ and math kind buffer nesting_level start_offset input = parse { Buffer.add_char buffer c; math kind buffer nesting_level start_offset input lexbuf } -and modules input acc = parse - | delim_char* as c - { - let start = Lexing.lexeme_start lexbuf - and end_ = Lexing.lexeme_end lexbuf in - let location = Loc.{ - file = input.file; - start = input.offset_to_location start; - end_ = input.offset_to_location end_; - } in - let c = Loc.at location c in - modules input (c :: acc) lexbuf - } - | (' ' | '\t' | '\r' | '\n') - { - modules input acc lexbuf - } - | '}' - { - let ms = List.rev acc in - emit lexbuf input (Modules ms) - } - | eof - { - let wg = Parse_error.end_not_allowed ~in_what:(Tokens.describe @@ Modules []) in - warning lexbuf input wg; - emit lexbuf input (Modules (List.rev acc)) - } - and media tok_descr buffer nesting_level start_offset input = parse | '}' { if nesting_level == 0 then diff --git a/src/parser/parser.mly b/src/parser/parser.mly index cdfdb9ed47..f25847f2fd 100644 --- a/src/parser/parser.mly +++ b/src/parser/parser.mly @@ -145,7 +145,28 @@ let trim_start = function | Loc.{value = `Space _; _ } :: xs -> xs - | xs -> xs + | xs -> xs + + let loc_is_some = function + | Loc.{ value = Some x; _ } as loc -> Some { loc with value = x } + | _ -> None +(* + let rec inline_element_inner : Ast.inline_element -> string = function + | `Space s -> s + | `Word s -> s + | `Styled (_, s) -> children s + | `Code_span s -> s + | `Raw_markup (_, s) -> s + | `Reference (_, _, s) -> children s + | `Link (_, s) -> children s + | `Math_span s -> s + and children s = + List.fold_left + (fun acc elt -> acc ^ inline_element_inner (Loc.value elt)) + "" + s + +*) %} %token SPACE NEWLINE @@ -166,7 +187,7 @@ (* or '{C' or '{R', but this syntax has been deprecated and is only kept around so legacy codebases don't break :p *) %token Paragraph_style "{L" -%token Modules "{!modules:" +%token MODULES "{!modules:" %token Math_span "{m" %token Math_block "{math" @@ -580,11 +601,11 @@ let list_element := (* TABLES *) -let cell_heavy := cell_kind = Table_cell; whitespace?; children = sequence(locatedM(nestable_block_element)); whitespace?; RIGHT_BRACE; +let cell_heavy := cell_kind = Table_cell; whitespace?; children = sequence(locatedM(nestable_block_element)); newline?; RIGHT_BRACE; whitespace?; { Writer.map (fun c -> (c, cell_kind)) children } let row_heavy := - | TABLE_ROW; whitespace?; ~ = sequence_nonempty(cell_heavy); RIGHT_BRACE; <> + | TABLE_ROW; whitespace?; ~ = sequence_nonempty(cell_heavy); RIGHT_BRACE; whitespace?; <> let table_heavy := TABLE_HEAVY; whitespace?; grid = sequence_nonempty(row_heavy); RIGHT_BRACE; { Writer.map (fun g -> `Table ((g, None), `Heavy)) grid } @@ -655,12 +676,12 @@ let nestable_block_element := ~ = nestable_block_element_inner; newline?; <> let nestable_block_element_inner := | ~ = verbatim; <> | ~ = code_block; <> - | ~ = modules; <> | ~ = list_element; <> | ~ = table; <> | ~ = media; <> | ~ = math_block; <> | ~ = paragraph; <> + | ~ = modules; <> let verbatim := v = Verbatim; { @@ -676,31 +697,93 @@ let verbatim := v = Verbatim; let paragraph := | items = sequence_nonempty(locatedM(inline_element)); { Writer.map (fun i -> `Paragraph (trim_start i)) items } - | located(error); - { return @@ `Paragraph [] } let code_block := c = Code_block; { return (`Code_block c) } let math_block := m = Math_block; - { - let what = Tokens.describe @@ Math_block m in - let warning = fun ~filename -> - let span = Parser_aux.to_location ~filename $sloc in - Parse_error.should_not_be_empty ~what span - in - Writer.ensure has_content warning (return m) - |> Writer.map (fun m -> `Math_block m) - } + { + let what = Tokens.describe @@ Math_block m in + let warning = fun ~filename -> + let span = Parser_aux.to_location ~filename $sloc in + Parse_error.should_not_be_empty ~what span + in + Writer.ensure has_content warning (return m) + |> Writer.map (fun m -> `Math_block m) + } -let modules := modules = Modules; - { - let what = Tokens.describe @@ Modules [] in - let warning = fun ~filename -> - let span = Parser_aux.to_location ~filename $sloc in - Parse_error.should_not_be_empty ~what span - in - return modules - |> Writer.ensure not_empty warning - |> Writer.map (fun ms -> `Modules ms) - } +let module_list_element := + | ~ = Word; + | horizontal_whitespace; { None } + +let modules := + | MODULES; modules = located(module_list_element)+; RIGHT_BRACE; + { + let modules = List.filter_map loc_is_some modules in + return @@ `Modules modules + } + | MODULES; RIGHT_BRACE; + { + let what = Tokens.describe MODULES in + let warning = fun ~filename -> + let span = Parser_aux.to_location ~filename $sloc in + Parse_error.should_not_be_empty ~what span + in + Writer.with_warning (`Modules []) warning + } + | MODULES; modules = located(module_list_element)+; END; + { + let in_what = Tokens.describe MODULES in + let warning = fun ~filename -> + let span = Parser_aux.to_location ~filename $sloc in + Parse_error.end_not_allowed ~in_what span + in + let modules = List.filter_map loc_is_some modules in + Writer.with_warning (`Modules modules) warning + } +(* + | MODULES; modules = sequence_nonempty(locatedM(inline_element)); RIGHT_BRACE; + { + print_endline "INLINE"; + let in_what = Tokens.describe MODULES in + let* modules = modules in + let warning = fun ~filename:_ -> + let span = Loc.span @@ List.map Loc.location modules in + let first_offending : Ast.inline_element = + List.find + (function + | `Word _ | `Space _ -> false + | _ -> true) + (List.map Loc.value modules : Ast.inline_element list) + in + let what = Tokens.describe_inline first_offending in + Parse_error.not_allowed ~what ~in_what span + in + let inner = `Modules (List.map (Loc.map inline_element_inner) modules) in + Writer.with_warning inner warning + } + + | MODULES; modules = sequence_nonempty(locatedM(inline_element)); END; + { + print_endline "INLINE + EOI"; + let in_what = Tokens.describe MODULES in + let* modules = modules in + let span = Loc.span @@ List.map Loc.location modules in + let not_allowed = fun ~filename:_ -> + let first_offending : Ast.inline_element = + List.find + (function + | `Word _ | `Space _ -> false + | _ -> true) + (List.map Loc.value modules : Ast.inline_element list) + in + let what = Tokens.describe_inline first_offending in + Parse_error.not_allowed ~what ~in_what span + in + let unexpected_end = fun ~filename:_ -> + Parse_error.end_not_allowed ~in_what:(Tokens.describe MODULES) span + in + let inner = `Modules (List.map (Loc.map inline_element_inner) modules) in + Writer.with_warning inner not_allowed |> Writer.warning unexpected_end + } +*) diff --git a/src/parser/test_driver/tester.ml b/src/parser/test_driver/tester.ml index c214762924..cebe0515fb 100644 --- a/src/parser/test_driver/tester.ml +++ b/src/parser/test_driver/tester.ml @@ -38,11 +38,10 @@ let error_recovery = ("Empty list item", "{ol {li} }"); ("'{li' not followed by whitespace", "{ol {lifoo bar baz} }"); ("End not allowed in table", "{t \n| -- | :--: |\n| a | b \n"); - ("Nothing", "{i"); + ("Empty modules", "{!modules: }"); + ("EOI in modules", "{!modules: Foo Bar"); ] -let see = [ ("See", "@see bar baz quux\n") ] - (* Cases (mostly) taken from the 'odoc for library authors' document *) let documentation_cases = [ @@ -50,7 +49,7 @@ let documentation_cases = ("Heavy list", "{ul {li foo} {li bar} {li baz}}"); ("Simple ref", "{!Stdlib.Buffer}"); ("Ref w/ replacement", "{{: https://ocaml.org/ }the OCaml website}"); - ("Modules", "{!modules: Foo Bar Baz}"); + ("Modules", "{!modules: Foo Bar Baz }"); ("Block tag", "@see bar baz quux\n"); ("Inline tag", "@author Fay Carsons"); ("Simple tag", "@open"); @@ -82,6 +81,8 @@ let documentation_cases = ("Code block", "{[\n let foo = 0 \n]}"); ] +let modules = [ ("Modules", "{!modules: Foo Bar Baz }") ] + open Test.Serialize let error err = Atom (Odoc_parser.Warning.to_string err) @@ -129,6 +130,7 @@ let run_test (label, case) = let last_tok = ref None in let get_tok lexbuf = let tok = Parser.Lexer.token input lexbuf in + print_endline @@ "Got tok: " ^ Parser.string_of_token (Loc.value tok); last_tok := Some tok; Loc.value tok in @@ -174,11 +176,10 @@ let () = | "code" -> code_cases | "recovery" | "r" -> error_recovery | "docs" | "d" -> documentation_cases - | "see" | "s" -> see | _ -> print_endline "unrecognized argument - running documentation_cases"; documentation_cases) - else documentation_cases + else modules in let sucesses, failures = List.partition_map run_test cases in let sucesses = format_successes sucesses in diff --git a/src/parser/tokens.ml b/src/parser/tokens.ml index 989a0a906b..63a9365495 100644 --- a/src/parser/tokens.ml +++ b/src/parser/tokens.ml @@ -18,7 +18,7 @@ type token = | Ref_with_replacement of string | Simple_link of string | Link_with_replacement of string - | Modules of string Loc.with_location list + | MODULES | Media of (media * media_target) | Media_with_replacement of (media * media_target * string) | Math_span of string @@ -77,7 +77,7 @@ let print : token -> string = function | Ref_with_replacement _ -> "{{!" | Simple_link _ -> "{:" | Link_with_replacement _ -> "{{:" - | Modules _ -> "{!modules:" + | MODULES -> "{!modules:" | Media (ref_kind, media_kind) -> let ref_kind, media_kind = media_description ref_kind media_kind in Printf.sprintf "{%s%s" media_kind ref_kind @@ -170,7 +170,7 @@ let describe : token -> string = function | RIGHT_CODE_DELIMITER -> "']}'" | Code_block _ -> "'{[...]}' (code block)" | Verbatim _ -> "'{v ... v}' (verbatim text)" - | Modules _ -> "'{!modules ...}'" + | MODULES -> "'{!modules ...}'" | List `Unordered -> "'{ul ...}' (bulleted list)" | List `Ordered -> "'{ol ...}' (numbered list)" | LI -> "'{li ...}' (list item)" @@ -251,7 +251,7 @@ let describe_nestable_block : Ast.nestable_block_element -> string = function | [] -> describe @@ Word "") | `Code_block _ -> describe @@ Code_block empty_code_block | `Verbatim _ -> describe @@ Verbatim "" - | `Modules _ -> describe @@ Modules [] + | `Modules _ -> describe MODULES | `List (_, _, _) -> "List" | `Table (_, kind) -> describe @@ if kind = `Light then TABLE_LIGHT else TABLE_HEAVY From 82e456b6cf0290cc1cdd3589f1cad77d0163a990 Mon Sep 17 00:00:00 2001 From: faycarsons Date: Wed, 4 Dec 2024 16:48:01 -0500 Subject: [PATCH 079/150] Make light tables simpler and more robust --- src/parser/parser.mly | 65 ++++++++++++++++++++++---------- src/parser/test_driver/tester.ml | 22 +++++++---- 2 files changed, 59 insertions(+), 28 deletions(-) diff --git a/src/parser/parser.mly b/src/parser/parser.mly index f25847f2fd..7ef5d6fa8e 100644 --- a/src/parser/parser.mly +++ b/src/parser/parser.mly @@ -90,13 +90,34 @@ let as_data = merged_tagged_row `Data let as_header = merged_tagged_row `Header - let mktable data header align = - match valid_align_row align with - | Ok alignment -> `Table ((header :: data, Some alignment), `Light) - | Error Invalid_align -> `Table ((header :: data, None), `Light) - | Error Not_align -> - let rows = header :: as_data align :: data in - `Table ((rows, None), `Light) + let is_valid_align row = Result.is_ok @@ valid_align_row row + +(* + + - If the first row is the alignment row then the rest should be data + - Otherwise the first should be the headers, the second align, and the rest data + - If there's only one row and it's not the align row, then it's data +*) + + let construct_table + : Ast.inline_element Loc.with_location list list list -> + Ast.nestable_block_element = + function + | [only_row] -> ( + match valid_align_row only_row with + | Ok align -> + `Table (([[]], Some align), `Light) + | _ -> + `Table (([as_data only_row], None), `Light)) + | align :: data when is_valid_align align -> + let align = Result.get_ok @@ valid_align_row align in + `Table ( (List.map as_data data, Some align) , `Light) + | header :: align :: data when is_valid_align align -> + let align = Result.get_ok @@ valid_align_row align in + `Table ((as_header header :: List.map as_data data, Some align), `Light) + | data -> `Table ((List.map as_data data, None), `Light) + +(* let construct_table : ?header:Ast.inline_element Loc.with_location list list Writer.t -> @@ -112,7 +133,7 @@ let* table = Writer.map (mktable data header) align in Writer.return table | None -> Writer.return @@ mktable data header [] - +*) let unclosed_table ?(data : Ast.inline_element Loc.with_location list list list Writer.t option) @@ -268,11 +289,15 @@ let whitespace := | ~ = horizontal_whitespace; <> | ~ = newline; <`Space> +let any_whitespace := + | ~ = whitespace; <> + | ~ = Blank_line; <`Space> + (* ENTRY *) let main := - | nodes = sequence_nonempty(locatedM(toplevel)); whitespace?; END; { nodes } - | whitespace?; END; { Writer.return @@ [] } + | any_whitespace*; nodes = sequence_nonempty(locatedM(toplevel)); whitespace?; END; { nodes } + | any_whitespace*; END; { Writer.return @@ [] } let toplevel := | block = nestable_block_element; { Writer.map (fun b -> (b :> Ast.block_element) ) block } @@ -601,7 +626,7 @@ let list_element := (* TABLES *) -let cell_heavy := cell_kind = Table_cell; whitespace?; children = sequence(locatedM(nestable_block_element)); newline?; RIGHT_BRACE; whitespace?; +let cell_heavy := cell_kind = Table_cell; whitespace?; children = sequence(locatedM(nestable_block_element)); whitespace?; RIGHT_BRACE; whitespace?; { Writer.map (fun c -> (c, cell_kind)) children } let row_heavy := @@ -610,18 +635,18 @@ let row_heavy := let table_heavy := TABLE_HEAVY; whitespace?; grid = sequence_nonempty(row_heavy); RIGHT_BRACE; { Writer.map (fun g -> `Table ((g, None), `Heavy)) grid } + let cell_light := ~ = sequence_nonempty(locatedM(inline_element)); <> -let row_light := BAR?; cells = separated_sequence_nonempty(BAR, cell_light); BAR?; <> -let rows_light := ~ = separated_sequence_nonempty(newline, row_light); <> +(* This is crucial because we cannot have multiple optionals in sequence *) +let row_prefix := + | whitespace; {} + | BAR; {} + | whitespace; BAR; {} +let row_light := row_prefix?; cells = separated_sequence_nonempty(BAR, cell_light); BAR?; <> +let rows_light := ~ = separated_sequence_nonempty(newline,row_light); <> let table_light := - (* If the first row is the alignment row then the rest should be data *) - | TABLE_LIGHT; align = row_light; newline; data = rows_light; whitespace?; RIGHT_BRACE; - { construct_table ~align data } - (* Otherwise the first should be the headers, the second align, and the rest data *) - | TABLE_LIGHT; header = row_light; newline; align = row_light; newline; data = rows_light; whitespace?; RIGHT_BRACE; - { construct_table ~header ~align data } (* If there's only one row and it's not the align row, then it's data *) - | TABLE_LIGHT; data = rows_light; whitespace?; RIGHT_BRACE; { construct_table data } + | TABLE_LIGHT; data = rows_light; whitespace?; RIGHT_BRACE; { Writer.map construct_table data } (* If there's nothing inside, return an empty table *) | TABLE_LIGHT; whitespace?; RIGHT_BRACE; diff --git a/src/parser/test_driver/tester.ml b/src/parser/test_driver/tester.ml index cebe0515fb..f450991bc6 100644 --- a/src/parser/test_driver/tester.ml +++ b/src/parser/test_driver/tester.ml @@ -69,19 +69,25 @@ let documentation_cases = \ }\n\ \ }" ); ( "Light table", - {|{t\n\ - | Header 1 | Header 2 | Header 3 | Header 4|\n - | :------: | --------:|:---------|---------|\n - | centered | right | left | default |\n - omitted | bar at | start and| finish\n - | {e emph} | and | unaligned | bars |\n}|} + {|{t + | Header 1 | Header 2 | Header 3 | Header 4| + | :------: | --------:|:---------|---------| + | centered | right | left | default | + omitted | bar at | start and| finish + | {e emph} | and | unaligned | bars | + }|} ); ("Styled", "{i italicized}"); ("Inline code", "[fmap Fun.id None]"); ("Code block", "{[\n let foo = 0 \n]}"); ] -let modules = [ ("Modules", "{!modules: Foo Bar Baz }") ] +let light_table = + [ ("Light table", {| + {t + |--|--| + } + |}) ] open Test.Serialize @@ -179,7 +185,7 @@ let () = | _ -> print_endline "unrecognized argument - running documentation_cases"; documentation_cases) - else modules + else light_table in let sucesses, failures = List.partition_map run_test cases in let sucesses = format_successes sucesses in From 8f4482a5967d3486d5aeccbe5b2d054f99fab1dc Mon Sep 17 00:00:00 2001 From: faycarsons Date: Fri, 6 Dec 2024 15:36:11 -0500 Subject: [PATCH 080/150] Light tables broken for unclear reasons --- src/parser/lexer.mll | 9 +- src/parser/parser.mly | 183 ++++++++++++------------------- src/parser/test_driver/tester.ml | 116 +++++++++++++++----- src/parser/tokens.ml | 4 +- 4 files changed, 167 insertions(+), 145 deletions(-) diff --git a/src/parser/lexer.mll b/src/parser/lexer.mll index 58d366776d..1d4038a370 100644 --- a/src/parser/lexer.mll +++ b/src/parser/lexer.mll @@ -178,7 +178,6 @@ let with_location_adjustments in k input location value -(* TODO: Fix this so it can take things besides tokens *) let emit : Lexing.lexbuf -> input -> @@ -810,15 +809,15 @@ and code_block_metadata_tail input = parse (* NOTE : (@faycarsons) This is currently broken!! *) and code_block start_offset content_offset metadata prefix delim input = parse | ("]" (delim_char* as delim') "[") as terminator - { if delim = delim' - then emit_code_block ~start_offset ~content_offset ~lexbuf input None (Some delim) terminator prefix None + { if delim = delim' then + emit_code_block ~start_offset ~content_offset ~lexbuf input None (Some delim) terminator prefix None else (Buffer.add_string prefix terminator; code_block start_offset content_offset metadata prefix delim input lexbuf) } | ("]" (delim_char* as delim') "}") as terminator { - if delim = delim' - then emit_code_block ~start_offset ~content_offset ~lexbuf input None (Some delim ) terminator prefix None + if delim = delim' then + emit_code_block ~start_offset ~content_offset ~lexbuf input None (Some delim ) terminator prefix None else ( Buffer.add_string prefix terminator; code_block start_offset content_offset metadata prefix delim input lexbuf diff --git a/src/parser/parser.mly b/src/parser/parser.mly index 7ef5d6fa8e..16b4bcf641 100644 --- a/src/parser/parser.mly +++ b/src/parser/parser.mly @@ -51,14 +51,14 @@ | None -> Error Not_align let sequence : ('elt, 'err) result list -> ('elt list, 'err) result = - fun list -> - let rec go acc : ('elt, 'err) result list -> ('elt list, 'err) result = - function - | Ok x :: xs -> go (x :: acc) xs - | Error err :: _ -> Error err - | [] -> Ok (List.rev acc) - in - go [] list + fun list -> + let rec go acc : ('elt, 'err) result list -> ('elt list, 'err) result = + function + | Ok x :: xs -> go (x :: acc) xs + | Error err :: _ -> Error err + | [] -> Ok (List.rev acc) + in + go [] list (* NOTE: (@FayCarsons) When we get something that doesn't look like an align at all, we check to see if we've gotten @@ -117,23 +117,6 @@ `Table ((as_header header :: List.map as_data data, Some align), `Light) | data -> `Table ((List.map as_data data, None), `Light) -(* - - let construct_table : - ?header:Ast.inline_element Loc.with_location list list Writer.t -> - ?align:Ast.inline_element Loc.with_location list list Writer.t -> - Ast.inline_element Loc.with_location list list list Writer.t -> - Ast.nestable_block_element Writer.t = - fun ?header ?align data -> - let* data = Writer.map (List.map as_data) data in - let header = Option.value ~default:(Writer.return []) header in - let* header = Writer.map as_header header in - match align with - | Some align -> - let* table = Writer.map (mktable data header) align in - Writer.return table - | None -> Writer.return @@ mktable data header [] -*) let unclosed_table ?(data : Ast.inline_element Loc.with_location list list list Writer.t option) @@ -168,10 +151,6 @@ | Loc.{value = `Space _; _ } :: xs -> xs | xs -> xs - let loc_is_some = function - | Loc.{ value = Some x; _ } as loc -> Some { loc with value = x } - | _ -> None -(* let rec inline_element_inner : Ast.inline_element -> string = function | `Space s -> s | `Word s -> s @@ -187,10 +166,18 @@ "" s -*) +let legal_module_list : Ast.inline_element Loc.with_location list -> bool = + fun xs -> + not_empty xs + && List.for_all + (function + | `Word _ | `Space _ -> true + | _ -> false) + @@ List.map Loc.value xs + %} -%token SPACE NEWLINE +%token SPACE %token RIGHT_BRACE "{" %token RIGHT_CODE_DELIMITER "{[" @@ -262,28 +249,28 @@ %% -(* Utility which wraps the return value of a producer in `Loc.with_location` *) +(* UTILITIES *) + +(* Utility which wraps the return value of a rule in `Loc.with_location` *) let locatedM(rule) == inner = rule; { Writer.map (wrap_location $sloc) inner } let located(rule) == inner = rule; { wrap_location $sloc inner } -let sequence(rule) == xs = rule*; { Writer.sequence xs } -let sequence_nonempty(rule) == xs = rule+; { Writer.sequence xs } -let separated_sequence_nonempty_inner(separator, X) := - | x = X; { let* x = x in return [ x ] } - | xs = separated_sequence_nonempty_inner(separator, X); separator; x = X; - { - let* x = x in - let* xs = xs in - return @@ x :: xs - } +let sequence(rule) == xs = list(rule); { Writer.sequence xs } +let sequence_nonempty(rule) == xs = nonempty_list(rule); { Writer.sequence xs } + +let separated_nonempty_sequence(sep, rule) := xs = separated_nonempty_list(sep, rule); { Writer.sequence xs } +let separated_sequence(sep, rule) := + | ~ = separated_nonempty_sequence(sep, rule); <> + | { return [] } -let separated_sequence_nonempty(sep, rule) == xs = separated_sequence_nonempty_inner(sep, rule); { Writer.map List.rev xs } +(* WHITESPACE *) let horizontal_whitespace := | SPACE; { `Space " " } | ~ = Space; <`Space> -let newline := NEWLINE; { "\n" } | ~ = Single_newline; <> +let newline := + | ~ = Single_newline; <> let whitespace := | ~ = horizontal_whitespace; <> @@ -297,7 +284,7 @@ let any_whitespace := let main := | any_whitespace*; nodes = sequence_nonempty(locatedM(toplevel)); whitespace?; END; { nodes } - | any_whitespace*; END; { Writer.return @@ [] } + | END; { return [] } let toplevel := | block = nestable_block_element; { Writer.map (fun b -> (b :> Ast.block_element) ) block } @@ -635,22 +622,31 @@ let row_heavy := let table_heavy := TABLE_HEAVY; whitespace?; grid = sequence_nonempty(row_heavy); RIGHT_BRACE; { Writer.map (fun g -> `Table ((g, None), `Heavy)) grid } +(* LIGHT TABLE *) + +let cell_content_light := ~ = sequence_nonempty(locatedM(inline_element)); <> +let row_light := + | BAR?; cells = separated_list(BAR, cell_content_light); BAR?; + { Writer.sequence cells } +let rows_light := rows = separated_list(Single_newline?, row_light); { Writer.sequence rows } +let table_start_light := TABLE_LIGHT; whitespace?; {} -let cell_light := ~ = sequence_nonempty(locatedM(inline_element)); <> -(* This is crucial because we cannot have multiple optionals in sequence *) -let row_prefix := - | whitespace; {} - | BAR; {} - | whitespace; BAR; {} -let row_light := row_prefix?; cells = separated_sequence_nonempty(BAR, cell_light); BAR?; <> -let rows_light := ~ = separated_sequence_nonempty(newline,row_light); <> +let table_unexpected_EOI_light := newline?; END; {} let table_light := - (* If there's only one row and it's not the align row, then it's data *) - | TABLE_LIGHT; data = rows_light; whitespace?; RIGHT_BRACE; { Writer.map construct_table data } - - (* If there's nothing inside, return an empty table *) - | TABLE_LIGHT; whitespace?; RIGHT_BRACE; - { return @@ `Table (([[]], None), `Light) } + | table_start_light; data = rows_light; RIGHT_BRACE; { Writer.map construct_table data } + | table_start_light; RIGHT_BRACE; + { return @@ `Table (([[]], None), `Light) } + | table_start_light; data = rows_light; table_unexpected_EOI_light; + { + let in_what = Tokens.describe TABLE_LIGHT in + let warning = fun ~filename -> + let span = Parser_aux.to_location ~filename $sloc in + Parse_error.end_not_allowed ~in_what span + in + unclosed_table ~data warning + } +(* + | TABLE_LIGHT; END; { let in_what = Tokens.describe TABLE_LIGHT in @@ -660,16 +656,7 @@ let table_light := in unclosed_table warning } - | TABLE_LIGHT; data = rows_light; whitespace?; END; - { - let in_what = Tokens.describe TABLE_LIGHT in - let warning = fun ~filename -> - let span = Parser_aux.to_location ~filename $sloc in - Parse_error.end_not_allowed ~in_what span - in - unclosed_table ~data warning - } - +*) let table := | ~ = table_heavy; <> | ~ = table_light; <> @@ -736,73 +723,48 @@ let math_block := m = Math_block; |> Writer.map (fun m -> `Math_block m) } -let module_list_element := - | ~ = Word; - | horizontal_whitespace; { None } - let modules := - | MODULES; modules = located(module_list_element)+; RIGHT_BRACE; - { - let modules = List.filter_map loc_is_some modules in - return @@ `Modules modules - } - | MODULES; RIGHT_BRACE; - { - let what = Tokens.describe MODULES in - let warning = fun ~filename -> - let span = Parser_aux.to_location ~filename $sloc in - Parse_error.should_not_be_empty ~what span - in - Writer.with_warning (`Modules []) warning - } - | MODULES; modules = located(module_list_element)+; END; - { - let in_what = Tokens.describe MODULES in - let warning = fun ~filename -> - let span = Parser_aux.to_location ~filename $sloc in - Parse_error.end_not_allowed ~in_what span - in - let modules = List.filter_map loc_is_some modules in - Writer.with_warning (`Modules modules) warning - } -(* - - | MODULES; modules = sequence_nonempty(locatedM(inline_element)); RIGHT_BRACE; + | MODULES; modules = sequence(locatedM(inline_element)); RIGHT_BRACE; { print_endline "INLINE"; let in_what = Tokens.describe MODULES in let* modules = modules in - let warning = fun ~filename:_ -> + let not_allowed = fun ~filename:_ -> let span = Loc.span @@ List.map Loc.location modules in - let first_offending : Ast.inline_element = - List.find + let first_offending = + List.find_opt (function | `Word _ | `Space _ -> false | _ -> true) (List.map Loc.value modules : Ast.inline_element list) in - let what = Tokens.describe_inline first_offending in + let what = Option.map Tokens.describe_inline first_offending |> Option.value ~default:String.empty in Parse_error.not_allowed ~what ~in_what span in + let is_empty = fun ~filename:_ -> + let span = Loc.span @@ List.map Loc.location modules in + let what = Tokens.describe MODULES in + Parse_error.should_not_be_empty ~what span + in let inner = `Modules (List.map (Loc.map inline_element_inner) modules) in - Writer.with_warning inner warning + List.fold_left (fun writer (f, w) -> Writer.ensure f w writer) (return modules) [(not_empty, is_empty); (legal_module_list, not_allowed)] + |> Writer.map (Fun.const inner) } - - | MODULES; modules = sequence_nonempty(locatedM(inline_element)); END; + | MODULES; modules = sequence(locatedM(inline_element)); END; { print_endline "INLINE + EOI"; let in_what = Tokens.describe MODULES in let* modules = modules in let span = Loc.span @@ List.map Loc.location modules in let not_allowed = fun ~filename:_ -> - let first_offending : Ast.inline_element = - List.find + let first_offending = + List.find_opt (function | `Word _ | `Space _ -> false | _ -> true) (List.map Loc.value modules : Ast.inline_element list) in - let what = Tokens.describe_inline first_offending in + let what = Option.map Tokens.describe_inline first_offending |> Option.value ~default:String.empty in Parse_error.not_allowed ~what ~in_what span in let unexpected_end = fun ~filename:_ -> @@ -811,4 +773,3 @@ let modules := let inner = `Modules (List.map (Loc.map inline_element_inner) modules) in Writer.with_warning inner not_allowed |> Writer.warning unexpected_end } -*) diff --git a/src/parser/test_driver/tester.ml b/src/parser/test_driver/tester.ml index f450991bc6..aa9508e528 100644 --- a/src/parser/test_driver/tester.ml +++ b/src/parser/test_driver/tester.ml @@ -6,7 +6,7 @@ module Parser = Odoc_parser.Tester (* NOTE (@FayCarsons): for anyone working on this parser - this is probably the easiest way to check if something is working. The tests are numerous - and difficult to parse. A test suite will fail in its entirety if one case + and difficult to read. A test suite will fail in its entirety if one case throws an exception. If you need to test a specific parser rule or test case, add it here and @@ -37,11 +37,25 @@ let error_recovery = ("List item not at beginning of line", "- foo\n - bar\n- baz"); ("Empty list item", "{ol {li} }"); ("'{li' not followed by whitespace", "{ol {lifoo bar baz} }"); - ("End not allowed in table", "{t \n| -- | :--: |\n| a | b \n"); + ("End not allowed in table", "{t \n| -- | :--: |\n| a | b"); ("Empty modules", "{!modules: }"); ("EOI in modules", "{!modules: Foo Bar"); ] +let light_table_early_EOI = + [ + (* +( "End not allowed WITHOUT newline", + {|{t + | -- | :--: | + | a | b |} ); + *) + ("End not allowed WITH newline", {|{t +| A | B | + C | D +}|}); + ] + (* Cases (mostly) taken from the 'odoc for library authors' document *) let documentation_cases = [ @@ -49,7 +63,7 @@ let documentation_cases = ("Heavy list", "{ul {li foo} {li bar} {li baz}}"); ("Simple ref", "{!Stdlib.Buffer}"); ("Ref w/ replacement", "{{: https://ocaml.org/ }the OCaml website}"); - ("Modules", "{!modules: Foo Bar Baz }"); + ("Modules", "{!modules: Foo Bar Baz}"); ("Block tag", "@see bar baz quux\n"); ("Inline tag", "@author Fay Carsons"); ("Simple tag", "@open"); @@ -82,13 +96,6 @@ let documentation_cases = ("Code block", "{[\n let foo = 0 \n]}"); ] -let light_table = - [ ("Light table", {| - {t - |--|--| - } - |}) ] - open Test.Serialize let error err = Atom (Odoc_parser.Warning.to_string err) @@ -105,6 +112,8 @@ type failure = { exn : string; label : string; offending_token : Parser.token; + failure_index : int; + tokens : Parser.token list; area : string; } @@ -113,9 +122,53 @@ let get_area Loc.{ start; end_; _ } input = |> List.filteri (fun idx _ -> idx >= pred start.line && idx <= pred end_.line) |> List.fold_left ( ^ ) "" -let mkfailure label input exn last = +let mkfailure label input exn last failure_index tokens = let Loc.{ value; location } = last in - { offending_token = value; exn; label; area = get_area location input } + { + offending_token = value; + failure_index; + tokens; + exn; + label; + area = get_area location input; + } + +module TokBuf = struct + type t = { + tok_stream : Parser.token Loc.with_location Stream.t; + mutable cache : Parser.token Loc.with_location list; + } + type tok_state = { mutable seen_EOI : bool } + + let create ~dispenser = + let tok_state = { seen_EOI = false } in + let tok_stream = + Stream.from (fun _ -> + if tok_state.seen_EOI then None + else + match dispenser () with + | Loc.{ value; _ } as loc when Parser.is_EOI value -> + tok_state.seen_EOI <- true; + Some loc + | t -> Some t) + in + let cache = [] in + { tok_stream; cache } + + let next self = + let tok = Stream.next self.tok_stream in + self.cache <- tok :: self.cache; + tok + + let cache_rest self = + let rec go () = + try + self.cache <- Stream.next self.tok_stream :: self.cache; + go () + with Stream.Failure -> List.rev self.cache + in + go () +end let run_test (label, case) = let open Either in @@ -133,12 +186,15 @@ let run_test (label, case) = file; } in - let last_tok = ref None in - let get_tok lexbuf = - let tok = Parser.Lexer.token input lexbuf in - print_endline @@ "Got tok: " ^ Parser.string_of_token (Loc.value tok); - last_tok := Some tok; - Loc.value tok + let failure_index = ref (-1) in + let tokbuf = + TokBuf.create ~dispenser:(fun () -> Parser.Lexer.token input lexbuf) + in + let get_tok _ = + incr failure_index; + let tok = Loc.value @@ TokBuf.next tokbuf in + print_endline @@ Parser.string_of_token tok; + tok in try let ast, warnings = @@ -149,7 +205,9 @@ let run_test (label, case) = Left (label, output) with e -> let exns = Printexc.to_string e in - Right (mkfailure label case exns @@ Option.get !last_tok) + let offending_token = List.hd @@ tokbuf.cache in + let tokens = List.map Loc.value @@ TokBuf.cache_rest tokbuf in + Right (mkfailure label case exns offending_token !failure_index tokens) let sep = String.init 80 @@ Fun.const '-' @@ -159,17 +217,23 @@ let format_successes = in List.fold_left go "" -let failure_string { exn; label; offending_token; area } = +let failure_string { exn; label; offending_token; area; tokens; failure_index } + = Printf.sprintf {|>>> Case '%s' failed with exn: <<< %s -offending token: -'%s' -input: -"%s"|} - label exn +Offending token: +%d: '%s' +Input: +"%s" +Tokens: +%s|} + label exn failure_index (Parser.string_of_token offending_token) area + (List.fold_left + (fun acc t -> acc ^ "\n" ^ Parser.string_of_token t) + "" tokens) let format_failures = let go acc x = acc ^ "\n" ^ sep ^ "\n" ^ x in @@ -185,7 +249,7 @@ let () = | _ -> print_endline "unrecognized argument - running documentation_cases"; documentation_cases) - else light_table + else light_table_early_EOI in let sucesses, failures = List.partition_map run_test cases in let sucesses = format_successes sucesses in diff --git a/src/parser/tokens.ml b/src/parser/tokens.ml index 63a9365495..0acc95c5fe 100644 --- a/src/parser/tokens.ml +++ b/src/parser/tokens.ml @@ -11,7 +11,6 @@ type table_cell_kind = [ `Header | `Data ] type token = | SPACE | Space of string - | NEWLINE | Single_newline of string | Blank_line of string | Simple_ref of string @@ -71,7 +70,7 @@ let media_description ref_kind media_kind = let print : token -> string = function | SPACE | Space _ -> "\t" - | NEWLINE | Single_newline _ -> "\n" + | Single_newline _ -> "\n" | Blank_line _ -> "\n\n" | Simple_ref _ -> "{!" | Ref_with_replacement _ -> "{{!" @@ -164,7 +163,6 @@ let describe : token -> string = function | END -> "end of text" | SPACE -> "whitespace" | Single_newline _ -> "newline" - | NEWLINE -> "line break" | Blank_line _ -> "blank line" | RIGHT_BRACE -> "'}'" | RIGHT_CODE_DELIMITER -> "']}'" From b86ffb770eeb050d4e54d40e02148feb3a68c4be Mon Sep 17 00:00:00 2001 From: faycarsons Date: Mon, 9 Dec 2024 13:56:10 -0500 Subject: [PATCH 081/150] Light tables fixed(?), removed unused `SPACE` token --- src/parser/parser.mly | 51 ++++++++++++++++---------------- src/parser/test_driver/tester.ml | 13 ++------ src/parser/tokens.ml | 6 ++-- 3 files changed, 29 insertions(+), 41 deletions(-) diff --git a/src/parser/parser.mly b/src/parser/parser.mly index 16b4bcf641..e76d721291 100644 --- a/src/parser/parser.mly +++ b/src/parser/parser.mly @@ -177,7 +177,6 @@ let legal_module_list : Ast.inline_element Loc.with_location list -> bool = %} -%token SPACE %token RIGHT_BRACE "{" %token RIGHT_CODE_DELIMITER "{[" @@ -266,24 +265,18 @@ let separated_sequence(sep, rule) := (* WHITESPACE *) let horizontal_whitespace := - | SPACE; { `Space " " } | ~ = Space; <`Space> -let newline := - | ~ = Single_newline; <> +let newline := ~ = Single_newline; <> let whitespace := | ~ = horizontal_whitespace; <> | ~ = newline; <`Space> -let any_whitespace := - | ~ = whitespace; <> - | ~ = Blank_line; <`Space> - (* ENTRY *) let main := - | any_whitespace*; nodes = sequence_nonempty(locatedM(toplevel)); whitespace?; END; { nodes } + | nodes = sequence_nonempty(locatedM(toplevel)); whitespace?; END; { nodes } | END; { return [] } let toplevel := @@ -376,20 +369,18 @@ let tag_bare := (* INLINE ELEMENTS *) -let inline_element := +let inline_element := + (* Single token inline elements which are mostly handled in the lexer *) | s = horizontal_whitespace; { return s } - | ~ = code_span; <> - | ~ = raw_markup; <> - | ~ = word; <> + | c = Code_span; { return @@ `Code_span c } + | m = Raw_markup; { return @@ `Raw_markup m } + | w = Word; { return @@ `Word w } + | m = Math_span; { return @@ `Math_span m } + (* More complex/recursive inline elements should have their own rule *) | ~ = style; <> - | ~ = math_span; <> | ~ = reference; <> | ~ = link; <> -let code_span := c = Code_span; { return @@ `Code_span c } -let raw_markup := m = Raw_markup; { return @@ `Raw_markup m } -let word := w = Word; { return @@ `Word w } - let style := | style = Style; children = sequence(locatedM(inline_element)); RIGHT_BRACE; { @@ -625,25 +616,33 @@ let table_heavy := TABLE_HEAVY; whitespace?; grid = sequence_nonempty(row_heavy) (* LIGHT TABLE *) let cell_content_light := ~ = sequence_nonempty(locatedM(inline_element)); <> -let row_light := - | BAR?; cells = separated_list(BAR, cell_content_light); BAR?; - { Writer.sequence cells } -let rows_light := rows = separated_list(Single_newline?, row_light); { Writer.sequence rows } -let table_start_light := TABLE_LIGHT; whitespace?; {} +let cell := + | ~ = cell_content_light; <> + | ~ = cell_content_light; BAR; <> + +let cells := BAR?; ~ = sequence_nonempty(cell); <> -let table_unexpected_EOI_light := newline?; END; {} +let row_light := + | ~ = cells; <> + | ~ = cells; Single_newline; <> + +let rows_light := Single_newline?; ~ = sequence_nonempty(row_light); <> + +let table_start_light := TABLE_LIGHT; whitespace?; {} let table_light := | table_start_light; data = rows_light; RIGHT_BRACE; { Writer.map construct_table data } | table_start_light; RIGHT_BRACE; { return @@ `Table (([[]], None), `Light) } - | table_start_light; data = rows_light; table_unexpected_EOI_light; + | table_start_light; data = rows_light?; END; { let in_what = Tokens.describe TABLE_LIGHT in let warning = fun ~filename -> let span = Parser_aux.to_location ~filename $sloc in Parse_error.end_not_allowed ~in_what span in - unclosed_table ~data warning + match data with + | Some data -> unclosed_table ~data warning + | None -> unclosed_table warning } (* diff --git a/src/parser/test_driver/tester.ml b/src/parser/test_driver/tester.ml index aa9508e528..357242f7a4 100644 --- a/src/parser/test_driver/tester.ml +++ b/src/parser/test_driver/tester.ml @@ -44,16 +44,7 @@ let error_recovery = let light_table_early_EOI = [ - (* -( "End not allowed WITHOUT newline", - {|{t - | -- | :--: | - | a | b |} ); - *) - ("End not allowed WITH newline", {|{t -| A | B | - C | D -}|}); + ("End not allowed WITH newline", "{t\n| A | B |\n| -- | -- |\n| C | D |\n}"); ] (* Cases (mostly) taken from the 'odoc for library authors' document *) @@ -193,7 +184,7 @@ let run_test (label, case) = let get_tok _ = incr failure_index; let tok = Loc.value @@ TokBuf.next tokbuf in - print_endline @@ Parser.string_of_token tok; + print_endline @@ "Got token: " ^ Parser.string_of_token tok; tok in try diff --git a/src/parser/tokens.ml b/src/parser/tokens.ml index 0acc95c5fe..d9aec99a5e 100644 --- a/src/parser/tokens.ml +++ b/src/parser/tokens.ml @@ -9,7 +9,6 @@ type style = [ `Bold | `Italic | `Emphasis | `Superscript | `Subscript ] type table_cell_kind = [ `Header | `Data ] type token = - | SPACE | Space of string | Single_newline of string | Blank_line of string @@ -69,7 +68,7 @@ let media_description ref_kind media_kind = (ref_kind, media_kind) let print : token -> string = function - | SPACE | Space _ -> "\t" + | Space _ -> "\t" | Single_newline _ -> "\n" | Blank_line _ -> "\n\n" | Simple_ref _ -> "{!" @@ -161,7 +160,6 @@ let describe : token -> string = function | Simple_link _ -> "'{:...} (external link)'" | Link_with_replacement _ -> "'{{:...} ...}' (external link)" | END -> "end of text" - | SPACE -> "whitespace" | Single_newline _ -> "newline" | Blank_line _ -> "blank line" | RIGHT_BRACE -> "'}'" @@ -219,7 +217,7 @@ let empty_code_block = let describe_inline : Ast.inline_element -> string = function | `Word w -> describe @@ Word w - | `Space _ -> describe SPACE + | `Space _ -> describe @@ Space "" | `Styled (style, _) -> describe @@ Style style | `Code_span _ -> describe @@ Code_span "" | `Math_span _ -> describe @@ Math_span "" From 8f69af991b3f44e018455cbc841582a52a18a94d Mon Sep 17 00:00:00 2001 From: faycarsons Date: Mon, 9 Dec 2024 14:27:34 -0500 Subject: [PATCH 082/150] Handle more unclosed table cases --- src/parser/parser.mly | 32 ++++++++++++++++---------------- src/parser/test_driver/tester.ml | 24 +++++++++++++----------- 2 files changed, 29 insertions(+), 27 deletions(-) diff --git a/src/parser/parser.mly b/src/parser/parser.mly index e76d721291..bcdd31c5e3 100644 --- a/src/parser/parser.mly +++ b/src/parser/parser.mly @@ -273,6 +273,10 @@ let whitespace := | ~ = horizontal_whitespace; <> | ~ = newline; <`Space> +let any_whitespace := + | ~ = whitespace; <> + | ~ = Blank_line; <`Space> + (* ENTRY *) let main := @@ -626,36 +630,32 @@ let row_light := | ~ = cells; <> | ~ = cells; Single_newline; <> -let rows_light := Single_newline?; ~ = sequence_nonempty(row_light); <> +let rows_light := ~ = sequence_nonempty(row_light); <> let table_start_light := TABLE_LIGHT; whitespace?; {} let table_light := | table_start_light; data = rows_light; RIGHT_BRACE; { Writer.map construct_table data } | table_start_light; RIGHT_BRACE; { return @@ `Table (([[]], None), `Light) } - | table_start_light; data = rows_light?; END; + | table_start_light; data = rows_light; END; { let in_what = Tokens.describe TABLE_LIGHT in let warning = fun ~filename -> let span = Parser_aux.to_location ~filename $sloc in Parse_error.end_not_allowed ~in_what span in - match data with - | Some data -> unclosed_table ~data warning - | None -> unclosed_table warning + unclosed_table ~data warning + } + | TABLE_LIGHT; any_whitespace?; END; + { + let in_what = Tokens.describe TABLE_LIGHT in + let warning = fun ~filename -> + let span = Parser_aux.to_location ~filename $sloc in + Parse_error.end_not_allowed ~in_what span + in + unclosed_table warning } -(* - | TABLE_LIGHT; END; - { - let in_what = Tokens.describe TABLE_LIGHT in - let warning = fun ~filename -> - let span = Parser_aux.to_location ~filename $sloc in - Parse_error.end_not_allowed ~in_what span - in - unclosed_table warning - } -*) let table := | ~ = table_heavy; <> | ~ = table_light; <> diff --git a/src/parser/test_driver/tester.ml b/src/parser/test_driver/tester.ml index 357242f7a4..77e344667c 100644 --- a/src/parser/test_driver/tester.ml +++ b/src/parser/test_driver/tester.ml @@ -42,10 +42,7 @@ let error_recovery = ("EOI in modules", "{!modules: Foo Bar"); ] -let light_table_early_EOI = - [ - ("End not allowed WITH newline", "{t\n| A | B |\n| -- | -- |\n| C | D |\n}"); - ] +let light_table_early_EOI = [ ("End not allowed WITH newline", "{t\n") ] (* Cases (mostly) taken from the 'odoc for library authors' document *) let documentation_cases = @@ -123,6 +120,7 @@ let mkfailure label input exn last failure_index tokens = label; area = get_area location input; } +(* module TokBuf = struct type t = { @@ -160,6 +158,7 @@ module TokBuf = struct in go () end +*) let run_test (label, case) = let open Either in @@ -178,14 +177,12 @@ let run_test (label, case) = } in let failure_index = ref (-1) in - let tokbuf = - TokBuf.create ~dispenser:(fun () -> Parser.Lexer.token input lexbuf) - in + let offending_token = ref None in let get_tok _ = incr failure_index; - let tok = Loc.value @@ TokBuf.next tokbuf in - print_endline @@ "Got token: " ^ Parser.string_of_token tok; - tok + let locd = Parser.Lexer.token input lexbuf in + offending_token := Some locd; + Loc.value locd in try let ast, warnings = @@ -196,9 +193,14 @@ let run_test (label, case) = Left (label, output) with e -> let exns = Printexc.to_string e in + (* let offending_token = List.hd @@ tokbuf.cache in let tokens = List.map Loc.value @@ TokBuf.cache_rest tokbuf in - Right (mkfailure label case exns offending_token !failure_index tokens) + *) + Right + (mkfailure label case exns + (Option.get !offending_token) + !failure_index []) let sep = String.init 80 @@ Fun.const '-' From 01eb3df6369baf0de72a8a017a79be5d25656619 Mon Sep 17 00:00:00 2001 From: faycarsons Date: Mon, 9 Dec 2024 14:44:20 -0500 Subject: [PATCH 083/150] Add error handling for paragraph alignment --- src/parser/parser.mly | 22 ++++++++++++++++------ src/parser/test_driver/tester.ml | 15 ++++++++++++--- 2 files changed, 28 insertions(+), 9 deletions(-) diff --git a/src/parser/parser.mly b/src/parser/parser.mly index bcdd31c5e3..3c4a6cca19 100644 --- a/src/parser/parser.mly +++ b/src/parser/parser.mly @@ -267,11 +267,9 @@ let separated_sequence(sep, rule) := let horizontal_whitespace := | ~ = Space; <`Space> -let newline := ~ = Single_newline; <> - let whitespace := | ~ = horizontal_whitespace; <> - | ~ = newline; <`Space> + | ~ = Single_newline; <`Space> let any_whitespace := | ~ = whitespace; <> @@ -514,9 +512,9 @@ let list_light_item_ordered == } let list_light := - | children = separated_nonempty_list(newline, list_light_item_ordered); + | children = separated_nonempty_list(Single_newline, list_light_item_ordered); { let* children = Writer.sequence children in return @@ `List (`Ordered, `Light, [ children ]) } - | children = separated_nonempty_list(newline, list_light_item_unordered); + | children = separated_nonempty_list(Single_newline, list_light_item_unordered); { let* children = Writer.sequence children in return @@ `List (`Unordered, `Light, [ children ]) } let item_heavy == @@ -682,7 +680,7 @@ let media := (* TOP-LEVEL ELEMENTS *) -let nestable_block_element := ~ = nestable_block_element_inner; newline?; <> +let nestable_block_element := ~ = nestable_block_element_inner; Single_newline?; <> let nestable_block_element_inner := | ~ = verbatim; <> @@ -693,6 +691,18 @@ let nestable_block_element_inner := | ~ = math_block; <> | ~ = paragraph; <> | ~ = modules; <> + | ~ = paragraph_style; <> + +let paragraph_style := + | style = Paragraph_style; ws = paragraph; RIGHT_BRACE; + { + let warning = fun ~filename -> + let span = Parser_aux.to_location ~filename $sloc in + let what = Tokens.describe @@ Paragraph_style style in + Parse_error.markup_should_not_be_used span ~what + in + Writer.warning warning ws + } let verbatim := v = Verbatim; { diff --git a/src/parser/test_driver/tester.ml b/src/parser/test_driver/tester.ml index 77e344667c..bf2a2c33ca 100644 --- a/src/parser/test_driver/tester.ml +++ b/src/parser/test_driver/tester.ml @@ -42,8 +42,6 @@ let error_recovery = ("EOI in modules", "{!modules: Foo Bar"); ] -let light_table_early_EOI = [ ("End not allowed WITH newline", "{t\n") ] - (* Cases (mostly) taken from the 'odoc for library authors' document *) let documentation_cases = [ @@ -84,6 +82,17 @@ let documentation_cases = ("Code block", "{[\n let foo = 0 \n]}"); ] +let unsupported = + [ + ("left_alignment", "{L foo}"); + ("center alignment", "{C foo}"); + ("Right", "{R foo}"); + ("Custom style", "{c foo}"); + ("custom tag", "@custom"); + ("custom reference kind", "{!custom:foo}"); + ("html", "foo"); + ] + open Test.Serialize let error err = Atom (Odoc_parser.Warning.to_string err) @@ -242,7 +251,7 @@ let () = | _ -> print_endline "unrecognized argument - running documentation_cases"; documentation_cases) - else light_table_early_EOI + else unsupported in let sucesses, failures = List.partition_map run_test cases in let sucesses = format_successes sucesses in From 8acacbf0a93b92bc76ddbfbb1fd2138c9c784dae Mon Sep 17 00:00:00 2001 From: faycarsons Date: Tue, 10 Dec 2024 11:26:12 -0500 Subject: [PATCH 084/150] Add location printing fn --- src/parser/loc.ml | 6 ++++ src/parser/loc.mli | 2 ++ src/parser/test_driver/tester.ml | 55 +++----------------------------- 3 files changed, 12 insertions(+), 51 deletions(-) diff --git a/src/parser/loc.ml b/src/parser/loc.ml index 0c6d0679e7..f9d2a3fe91 100644 --- a/src/parser/loc.ml +++ b/src/parser/loc.ml @@ -9,6 +9,12 @@ let map f annotated = { annotated with value = f annotated.value } let is predicate { value; _ } = predicate value let same annotated value = { annotated with value } +let fmt { start; end_; _ } = + let { line = sline; column = scol } = start + and { line = eline; column = ecol } = end_ in + Printf.sprintf "start: { line : %d, col : %d }\nend: { line: %d; col: %d }" + sline scol eline ecol + let span spans = match spans with | [] -> diff --git a/src/parser/loc.mli b/src/parser/loc.mli index 59bbfcfa5e..2e5ce49730 100644 --- a/src/parser/loc.mli +++ b/src/parser/loc.mli @@ -28,6 +28,8 @@ val nudge_end : int -> span -> span type +'a with_location = { location : span; value : 'a } (** Describes values located at a particular span *) +val fmt : span -> string + val nudge_map_start : int -> 'a with_location -> 'a with_location val nudge_map_end : int -> 'a with_location -> 'a with_location diff --git a/src/parser/test_driver/tester.ml b/src/parser/test_driver/tester.ml index bf2a2c33ca..6f86134289 100644 --- a/src/parser/test_driver/tester.ml +++ b/src/parser/test_driver/tester.ml @@ -42,6 +42,8 @@ let error_recovery = ("EOI in modules", "{!modules: Foo Bar"); ] +let location = [ ("Inline false nesting", "{m \\{ \\mathbb{only_left}}") ] + (* Cases (mostly) taken from the 'odoc for library authors' document *) let documentation_cases = [ @@ -82,17 +84,6 @@ let documentation_cases = ("Code block", "{[\n let foo = 0 \n]}"); ] -let unsupported = - [ - ("left_alignment", "{L foo}"); - ("center alignment", "{C foo}"); - ("Right", "{R foo}"); - ("Custom style", "{c foo}"); - ("custom tag", "@custom"); - ("custom reference kind", "{!custom:foo}"); - ("html", "foo"); - ] - open Test.Serialize let error err = Atom (Odoc_parser.Warning.to_string err) @@ -129,45 +120,6 @@ let mkfailure label input exn last failure_index tokens = label; area = get_area location input; } -(* - -module TokBuf = struct - type t = { - tok_stream : Parser.token Loc.with_location Stream.t; - mutable cache : Parser.token Loc.with_location list; - } - type tok_state = { mutable seen_EOI : bool } - - let create ~dispenser = - let tok_state = { seen_EOI = false } in - let tok_stream = - Stream.from (fun _ -> - if tok_state.seen_EOI then None - else - match dispenser () with - | Loc.{ value; _ } as loc when Parser.is_EOI value -> - tok_state.seen_EOI <- true; - Some loc - | t -> Some t) - in - let cache = [] in - { tok_stream; cache } - - let next self = - let tok = Stream.next self.tok_stream in - self.cache <- tok :: self.cache; - tok - - let cache_rest self = - let rec go () = - try - self.cache <- Stream.next self.tok_stream :: self.cache; - go () - with Stream.Failure -> List.rev self.cache - in - go () -end -*) let run_test (label, case) = let open Either in @@ -191,6 +143,7 @@ let run_test (label, case) = incr failure_index; let locd = Parser.Lexer.token input lexbuf in offending_token := Some locd; + print_endline @@ "Token location: " ^ Loc.fmt (Loc.location locd); Loc.value locd in try @@ -251,7 +204,7 @@ let () = | _ -> print_endline "unrecognized argument - running documentation_cases"; documentation_cases) - else unsupported + else location in let sucesses, failures = List.partition_map run_test cases in let sucesses = format_successes sucesses in From 5298af2d7780de06762c7a5a42c45440bb2414de Mon Sep 17 00:00:00 2001 From: faycarsons Date: Tue, 10 Dec 2024 14:21:19 -0500 Subject: [PATCH 085/150] Refactor lexer to return unwrapped tokens --- src/parser/lexer.mli | 2 +- src/parser/lexer.mll | 86 ++++++++++++++++++++++---------- src/parser/odoc_parser.ml | 4 +- src/parser/odoc_parser.mli | 2 +- src/parser/test_driver/tester.ml | 5 +- 5 files changed, 64 insertions(+), 35 deletions(-) diff --git a/src/parser/lexer.mli b/src/parser/lexer.mli index a6f3b69b7f..ff39d6c13b 100644 --- a/src/parser/lexer.mli +++ b/src/parser/lexer.mli @@ -6,4 +6,4 @@ type input = { mutable warnings : Warning.t list; } -val token : input -> Lexing.lexbuf -> Parser.token Loc.with_location +val token : input -> Lexing.lexbuf -> Parser.token diff --git a/src/parser/lexer.mll b/src/parser/lexer.mll index 1d4038a370..5890799a9b 100644 --- a/src/parser/lexer.mll +++ b/src/parser/lexer.mll @@ -144,7 +144,27 @@ type input = { mutable warnings : Warning.t list; } -let with_location_adjustments +let mkloc input lexbuf = + let open Lexing in + let file = input.file + and start = + Loc.{ line = lexbuf.lex_start_p.pos_lnum; + column = lexbuf.lex_start_p.pos_cnum } + and end_ = + Loc.{ line = lexbuf.lex_curr_p.pos_lnum; + column = lexbuf.lex_curr_p.pos_cnum } + in + { Loc.file; start; end_ } + +let with_loc : Lexing.lexbuf -> input -> 'a -> 'a Loc.with_location = + fun lexbuf input -> + let location = mkloc input lexbuf in + Loc.at location + +let with_location_adjustments : (Lexing.lexbuf -> input -> 'a) -> Lexing.lexbuf + -> input + -> ?start_offset:int -> ?adjust_start_by:string -> ?end_offset:int + -> ?adjust_end_by:string -> 'a = fun k lexbuf input ?start_offset ?adjust_start_by ?end_offset ?adjust_end_by @@ -170,28 +190,18 @@ let with_location_adjustments | None -> end_ | Some s -> end_ - String.length s in - let location = { - Loc.file = input.file; - start = input.offset_to_location start; - end_ = input.offset_to_location end_; - } - in - k input location value - -let emit : - Lexing.lexbuf -> - input -> - ?start_offset:int -> - ?adjust_start_by:string -> - ?end_offset:int -> - ?adjust_end_by:string -> - 'a -> - 'a Loc.with_location = - with_location_adjustments (fun _ -> Loc.at) + lexbuf.lex_start_p <- { lexbuf.lex_start_p with pos_cnum = start }; + lexbuf.lex_curr_p <- { lexbuf.lex_curr_p with pos_cnum = end_ }; + k lexbuf input value + +let emit = + with_location_adjustments (fun _ _ token -> token) + let warning = - with_location_adjustments (fun input location error -> - input.warnings <- (error location) :: input.warnings) + with_location_adjustments (fun lexbuf input error -> + let location = mkloc input lexbuf in + input.warnings <- error location :: input.warnings) let reference_token media start ( target : string ) input lexbuf = match start with @@ -269,7 +279,13 @@ let emit_code_block ~start_offset ~content_offset ~lexbuf input meta delimiter t lexbuf input |> trim_leading_blank_lines - |> with_location_adjustments ~adjust_end_by:terminator ~start_offset:content_offset (fun _ -> Loc.at) lexbuf input in + |> with_location_adjustments + ~adjust_end_by:terminator + ~start_offset:content_offset + with_loc + lexbuf + input + in emit ~start_offset lexbuf input (Code_block { meta; delimiter; content; output }) let heading_level lexbuf input level = @@ -380,10 +396,15 @@ and token input = parse | ((horizontal_space* newline as prefix) horizontal_space* ((newline horizontal_space*)+ as suffix) as ws) - { emit lexbuf input (Blank_line ws) ~adjust_start_by:prefix ~adjust_end_by:suffix } + { + Lexing.new_line lexbuf; + Lexing.new_line lexbuf; + emit lexbuf input (Blank_line ws) ~adjust_start_by:prefix ~adjust_end_by:suffix } | (horizontal_space* newline horizontal_space* as ws) - { emit lexbuf input (Single_newline ws) } + { + Lexing.new_line lexbuf; + emit lexbuf input (Single_newline ws) } | (horizontal_space+ as ws) { emit lexbuf input (Space ws) } @@ -458,10 +479,15 @@ and token input = parse { let start_offset = Lexing.lexeme_start lexbuf in let lang_tag = - with_location_adjustments ~adjust_start_by:prefix (fun _ -> Loc.at) lexbuf input lang_tag_ + with_location_adjustments + ~adjust_start_by:prefix + with_loc + lexbuf + input + lang_tag_ in let emit_truncated_code_block () = - let empty_content = with_location_adjustments (fun _ -> Loc.at) lexbuf input "" in + let empty_content = with_location_adjustments with_loc lexbuf input "" in emit ~start_offset lexbuf input (Code_block { meta = Some { language = lang_tag; tags = None }; delimiter = Some delimiter; content = empty_content; output = None}) in match code_block_metadata_tail input lexbuf with @@ -795,7 +821,13 @@ and code_block_metadata_tail input = parse ((space_char* '[') as suffix) { let meta = - with_location_adjustments ~adjust_start_by:prefix ~adjust_end_by:suffix (fun _ -> Loc.at) lexbuf input meta + with_location_adjustments + ~adjust_start_by:prefix + ~adjust_end_by:suffix + with_loc + lexbuf + input + meta in Ok (Some meta) } diff --git a/src/parser/odoc_parser.ml b/src/parser/odoc_parser.ml index 06e545b827..0d649a421b 100644 --- a/src/parser/odoc_parser.ml +++ b/src/parser/odoc_parser.ml @@ -130,9 +130,9 @@ let parse_comment : location:Lexing.position -> text:string -> t = } in (* Remove the `Loc.with_location` wrapping our token because Menhir cannot handle that *) - let unwrapped_token lexbuf = Lexer.token lexer_state lexbuf |> Loc.value in let ast, warnings = - Writer.run ~filename:lexer_state.file @@ Parser.main unwrapped_token lexbuf + Writer.run ~filename:lexer_state.file + @@ Parser.main (Lexer.token lexer_state) lexbuf in { ast; diff --git a/src/parser/odoc_parser.mli b/src/parser/odoc_parser.mli index 5310fb2247..41b49c816c 100644 --- a/src/parser/odoc_parser.mli +++ b/src/parser/odoc_parser.mli @@ -28,7 +28,7 @@ module Tester : sig mutable warnings : Warning.t list; } - val token : input -> Lexing.lexbuf -> Parser.token Loc.with_location + val token : input -> Lexing.lexbuf -> Parser.token end val is_EOI : token -> bool val pp_warning : Warning.t -> string diff --git a/src/parser/test_driver/tester.ml b/src/parser/test_driver/tester.ml index 6f86134289..eedbb89b45 100644 --- a/src/parser/test_driver/tester.ml +++ b/src/parser/test_driver/tester.ml @@ -141,10 +141,7 @@ let run_test (label, case) = let offending_token = ref None in let get_tok _ = incr failure_index; - let locd = Parser.Lexer.token input lexbuf in - offending_token := Some locd; - print_endline @@ "Token location: " ^ Loc.fmt (Loc.location locd); - Loc.value locd + Parser.Lexer.token input lexbuf in try let ast, warnings = From 89d7366fda53fdaaa3688b0edaa53ab223c5fde5 Mon Sep 17 00:00:00 2001 From: faycarsons Date: Tue, 10 Dec 2024 14:42:23 -0500 Subject: [PATCH 086/150] improve location handling --- src/parser/lexer.mll | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/src/parser/lexer.mll b/src/parser/lexer.mll index 5890799a9b..a448f3e6ac 100644 --- a/src/parser/lexer.mll +++ b/src/parser/lexer.mll @@ -146,14 +146,13 @@ type input = { let mkloc input lexbuf = let open Lexing in - let file = input.file - and start = - Loc.{ line = lexbuf.lex_start_p.pos_lnum; - column = lexbuf.lex_start_p.pos_cnum } - and end_ = - Loc.{ line = lexbuf.lex_curr_p.pos_lnum; - column = lexbuf.lex_curr_p.pos_cnum } + let pos_to_span pos = + Loc.{ line = pos.pos_lnum; + column = pos.pos_cnum - pos.pos_bol } in + let file = input.file + and start = pos_to_span lexbuf.lex_start_p + and end_ = pos_to_span lexbuf.lex_curr_p in { Loc.file; start; end_ } let with_loc : Lexing.lexbuf -> input -> 'a -> 'a Loc.with_location = @@ -699,17 +698,18 @@ and math kind buffer nesting_level start_offset input = parse emit lexbuf input (math_constr kind (Buffer.contents buffer)) ~start_offset else begin Buffer.add_char buffer '}'; - math kind buffer (nesting_level - 1) start_offset input lexbuf + math kind buffer (pred nesting_level) start_offset input lexbuf end } | '{' { Buffer.add_char buffer '{'; - math kind buffer (nesting_level + 1) start_offset input lexbuf } + math kind buffer (succ nesting_level) start_offset input lexbuf } | ("\\{" | "\\}") as s { Buffer.add_string buffer s; math kind buffer nesting_level start_offset input lexbuf } | (newline) as s { + Lexing.new_line lexbuf; match kind with | Inline -> warning From 418446e18b54e57de762d1a22237aace2ff9a14f Mon Sep 17 00:00:00 2001 From: faycarsons Date: Tue, 10 Dec 2024 14:47:21 -0500 Subject: [PATCH 087/150] handle newlines in inline elements --- src/parser/parser.mly | 20 ++++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) diff --git a/src/parser/parser.mly b/src/parser/parser.mly index 3c4a6cca19..6fb6f94886 100644 --- a/src/parser/parser.mly +++ b/src/parser/parser.mly @@ -373,7 +373,7 @@ let tag_bare := let inline_element := (* Single token inline elements which are mostly handled in the lexer *) - | s = horizontal_whitespace; { return s } + | s = inline_elt_legal_whitespace; { return s } | c = Code_span; { return @@ `Code_span c } | m = Raw_markup; { return @@ `Raw_markup m } | w = Word; { return @@ `Word w } @@ -383,6 +383,10 @@ let inline_element := | ~ = reference; <> | ~ = link; <> +let inline_elt_legal_whitespace := + | ~ = Space; <`Space> + | ~ = Single_newline; <`Space> + let style := | style = Style; children = sequence(locatedM(inline_element)); RIGHT_BRACE; { @@ -617,7 +621,19 @@ let table_heavy := TABLE_HEAVY; whitespace?; grid = sequence_nonempty(row_heavy) (* LIGHT TABLE *) -let cell_content_light := ~ = sequence_nonempty(locatedM(inline_element)); <> +let table_light_legal_elt := + (* Single token inline elements which are mostly handled in the lexer *) + | s = horizontal_whitespace; { return s } + | c = Code_span; { return @@ `Code_span c } + | m = Raw_markup; { return @@ `Raw_markup m } + | w = Word; { return @@ `Word w } + | m = Math_span; { return @@ `Math_span m } + (* More complex/recursive inline elements should have their own rule *) + | ~ = style; <> + | ~ = reference; <> + | ~ = link; <> + +let cell_content_light := ~ = sequence_nonempty(locatedM(table_light_legal_elt)); <> let cell := | ~ = cell_content_light; <> | ~ = cell_content_light; BAR; <> From 75a090353cf8d0cc5fba66bc14b772b68db15e9a Mon Sep 17 00:00:00 2001 From: faycarsons Date: Tue, 10 Dec 2024 16:14:49 -0500 Subject: [PATCH 088/150] Allow tags w/o children --- src/parser/parser.mly | 32 +++++++++++++---- src/parser/test_driver/tester.ml | 62 ++++++++++++++++++-------------- 2 files changed, 62 insertions(+), 32 deletions(-) diff --git a/src/parser/parser.mly b/src/parser/parser.mly index 6fb6f94886..3eb5ca40e8 100644 --- a/src/parser/parser.mly +++ b/src/parser/parser.mly @@ -307,21 +307,41 @@ let tag == | t = tag_bare; { Writer.map tag t } let tag_with_content := - | version = Before; children = sequence_nonempty(locatedM(nestable_block_element)); - { Writer.map (fun c -> `Before (version, c)) children } | DEPRECATED; children = sequence_nonempty(locatedM(nestable_block_element)); { Writer.map (fun c -> `Deprecated c) children } | RETURN; children = sequence_nonempty(locatedM(nestable_block_element)); { Writer.map (fun c -> `Return c) children } - | ident = Param; children = sequence_nonempty(locatedM(nestable_block_element)); - { Writer.map (fun c -> `Param (ident, c)) children } + | ~ = before; <> + | ~ = raise; <> + | ~ = see; <> + | ~ = param; <> + +let before := + | version = Before; children = sequence_nonempty(locatedM(nestable_block_element)); + { Writer.map (fun c -> `Before (version, c)) children } + | version = Before; + { return @@ `Before (version, []) } + +let raise := | exn = Raise; children = sequence_nonempty(locatedM(nestable_block_element)); { Writer.map (fun c -> `Raise (exn, c)) children } - | (kind, href) = See; horizontal_whitespace?; children = sequence_nonempty(locatedM(nestable_block_element)); + | exn = Raise; + { return @@ `Raise (exn, []) } + +let see := + | (kind, href) = See; children = sequence_nonempty(locatedM(nestable_block_element)); { let* children = children in return @@ `See (kind, href, children) } + | (kind, href) = See; + { return @@ `See (kind, href, []) } + +let param := + | ident = Param; children = sequence_nonempty(locatedM(nestable_block_element)); + { Writer.map (fun c -> `Param (ident, c)) children } + | ident = Param; + { return @@ `Param (ident, [])} let tag_bare := | version = Version; @@ -696,7 +716,7 @@ let media := (* TOP-LEVEL ELEMENTS *) -let nestable_block_element := ~ = nestable_block_element_inner; Single_newline?; <> +let nestable_block_element := ~ = nestable_block_element_inner; any_whitespace?; <> let nestable_block_element_inner := | ~ = verbatim; <> diff --git a/src/parser/test_driver/tester.ml b/src/parser/test_driver/tester.ml index eedbb89b45..935e301896 100644 --- a/src/parser/test_driver/tester.ml +++ b/src/parser/test_driver/tester.ml @@ -42,7 +42,31 @@ let error_recovery = ("EOI in modules", "{!modules: Foo Bar"); ] -let location = [ ("Inline false nesting", "{m \\{ \\mathbb{only_left}}") ] +let utf = + [ + ("lambda", "\xce\xbb"); + ("words", "\xce\xbb \xce\xbb"); + ("no_validation", "Î"); + ("escapes", "\xce\xbb\\"); + ("newline", "\xce\xbb \n \xce\xbb"); + ("paragraphs", "\xce\xbb \n\n \xce\xbb"); + ("code_span", "[\xce\xbb]"); + ("minuys", "\xce\xbb-\xce\xbb"); + ("shorthand list", "- \xce\xbb"); + ("styled", "{b \xce\xbb}"); + ("reference_target", "{!\xce\xbb}"); + ("code block ", "{[\xce\xbb]}"); + ("verbatim", "{v \xce\xbb v}"); + ("label", "{2:\xce\xbb Bar}"); + ("author", "@author \xce\xbb"); + ("param", "@param \xce\xbb"); + ("raise", "@raise \xce\xbb"); + ("see", "@see <\xce\xbb>"); + ("since", "@since \xce\xbb"); + ("before", "@before \xce\xbb"); + ("version", "@version \xce\xbb"); + ("right brace", "\xce\xbb}"); + ] (* Cases (mostly) taken from the 'odoc for library authors' document *) let documentation_cases = @@ -102,24 +126,10 @@ type failure = { offending_token : Parser.token; failure_index : int; tokens : Parser.token list; - area : string; } -let get_area Loc.{ start; end_; _ } input = - String.split_on_char '\n' input - |> List.filteri (fun idx _ -> idx >= pred start.line && idx <= pred end_.line) - |> List.fold_left ( ^ ) "" - -let mkfailure label input exn last failure_index tokens = - let Loc.{ value; location } = last in - { - offending_token = value; - failure_index; - tokens; - exn; - label; - area = get_area location input; - } +let mkfailure label exn last failure_index tokens = + { offending_token = last; failure_index; tokens; exn; label } let run_test (label, case) = let open Either in @@ -127,6 +137,7 @@ let run_test (label, case) = let lexbuf = Lexing.from_string case in let file = "Tester" in Lexing.set_filename lexbuf file; + let tokens = ref [] in let input = Parser.Lexer. { @@ -141,7 +152,10 @@ let run_test (label, case) = let offending_token = ref None in let get_tok _ = incr failure_index; - Parser.Lexer.token input lexbuf + let tok = Parser.Lexer.token input lexbuf in + tokens := tok :: !tokens; + offending_token := Some tok; + tok in try let ast, warnings = @@ -157,9 +171,9 @@ let run_test (label, case) = let tokens = List.map Loc.value @@ TokBuf.cache_rest tokbuf in *) Right - (mkfailure label case exns + (mkfailure label exns (Option.get !offending_token) - !failure_index []) + !failure_index (List.rev !tokens)) let sep = String.init 80 @@ Fun.const '-' @@ -169,20 +183,16 @@ let format_successes = in List.fold_left go "" -let failure_string { exn; label; offending_token; area; tokens; failure_index } - = +let failure_string { exn; label; offending_token; tokens; failure_index } = Printf.sprintf {|>>> Case '%s' failed with exn: <<< %s Offending token: %d: '%s' -Input: -"%s" Tokens: %s|} label exn failure_index (Parser.string_of_token offending_token) - area (List.fold_left (fun acc t -> acc ^ "\n" ^ Parser.string_of_token t) "" tokens) @@ -201,7 +211,7 @@ let () = | _ -> print_endline "unrecognized argument - running documentation_cases"; documentation_cases) - else location + else utf in let sucesses, failures = List.partition_map run_test cases in let sucesses = format_successes sucesses in From f0d50202c6ca067441c4d77a360ec1f590d666f1 Mon Sep 17 00:00:00 2001 From: faycarsons Date: Wed, 11 Dec 2024 12:07:22 -0500 Subject: [PATCH 089/150] Handle stray closing delimiters at toplevel --- src/parser/parser.mly | 23 +++++++++++++++++++++++ src/parser/writer.ml | 4 ++++ 2 files changed, 27 insertions(+) diff --git a/src/parser/parser.mly b/src/parser/parser.mly index 3eb5ca40e8..d6497c6ce8 100644 --- a/src/parser/parser.mly +++ b/src/parser/parser.mly @@ -285,6 +285,29 @@ let toplevel := | block = nestable_block_element; { Writer.map (fun b -> (b :> Ast.block_element) ) block } | ~ = tag; <> | ~ = heading; <> + | ~ = toplevel_error; <> + +let toplevel_error := + | elt = locatedM(inline_element); RIGHT_BRACE; + { let warning = fun ~filename -> + let span = Parser_aux.to_location ~filename $sloc in + let what = Tokens.describe RIGHT_BRACE in + let in_what = Tokens.describe_inline @@ Writer.unwrap_located elt in + Parse_error.not_allowed ~what ~in_what span + in + let* elt = Writer.warning warning elt in + return (`Paragraph [elt] :> Ast.block_element) + } + | elt = locatedM(inline_element); RIGHT_CODE_DELIMITER; + { let warning = fun ~filename -> + let span = Parser_aux.to_location ~filename $sloc in + let what = Tokens.describe RIGHT_CODE_DELIMITER in + let in_what = Tokens.describe_inline @@ Writer.unwrap_located elt in + Parse_error.not_allowed ~what ~in_what span + in + let* elt = Writer.warning warning elt in + return (`Paragraph [elt] :> Ast.block_element) + } (* SECTION HEADING *) diff --git a/src/parser/writer.ml b/src/parser/writer.ml index ea4bd519b4..aee840ac5f 100644 --- a/src/parser/writer.ml +++ b/src/parser/writer.ml @@ -50,3 +50,7 @@ let ensure : ('a -> bool) -> partial_warning -> 'a t -> 'a t = let run : filename:string -> Ast.t t -> Ast.t * Warning.t list = fun ~filename (Writer (tree, warnings)) -> (tree, List.map (fun f -> f ~filename) warnings) + +let unwrap : 'a t -> 'a = fun (Writer (x, _)) -> x +let unwrap_located : 'a Loc.with_location t -> 'a = + fun (Writer (Loc.{ value; _ }, _)) -> value From f942ad0bf6bfb6b0a74fe1d6424d4515168d4e82 Mon Sep 17 00:00:00 2001 From: faycarsons Date: Thu, 12 Dec 2024 13:49:41 -0500 Subject: [PATCH 090/150] Move Parser_aux contents to Loc --- src/parser/TODO.md | 29 +++++++ src/parser/loc.ml | 19 ++++- src/parser/loc.mli | 3 + src/parser/odoc_parser.ml | 3 +- src/parser/parser.mly | 169 ++++++++++++++++++++++++++------------ src/parser/parser_aux.ml | 31 ------- src/parser/tokens.ml | 16 ++++ 7 files changed, 184 insertions(+), 86 deletions(-) create mode 100644 src/parser/TODO.md delete mode 100644 src/parser/parser_aux.ml diff --git a/src/parser/TODO.md b/src/parser/TODO.md new file mode 100644 index 0000000000..40a5d044c5 --- /dev/null +++ b/src/parser/TODO.md @@ -0,0 +1,29 @@ +- How to solve combinatorial explosion in elements like lists and tables? + - Ex: in a "heavy" list, an illegal token can exist in every nesting level + - {ul Something_illegal {li foo} {li bar} } + - {ul {li foo} Something_illegal {li bar} } + - {ul {li foo} {li bar Something_illegal} } + - Matching on each possible combination not only requires a great deal of + effort and repeated code, it weakens Menhir's ability to reduce the correct + rule + - My thought is that we do something like this: + ```ocaml + let list_error_cases := + | List; items = list_items+; loc = located(error); + { Warning.unexpected_at loc.location } + | List; loc = located(error); + { (* Same as above *)} + ``` + - With this strategy, we lose some information, but if we modify our + warning type (currently a function from filename to warning) to be a + function which takes the input text, we can at least show the user the specific + span that is illegal by mapping over our warnings and evaluating them. + + - So this warning looks something like + ```ocaml + fun ~filename input -> + String.split_on_char '\n' input + |> get_error_span location + |> Warning.unexpected_at location + + ``` diff --git a/src/parser/loc.ml b/src/parser/loc.ml index f9d2a3fe91..0ad9ba9722 100644 --- a/src/parser/loc.ml +++ b/src/parser/loc.ml @@ -9,10 +9,25 @@ let map f annotated = { annotated with value = f annotated.value } let is predicate { value; _ } = predicate value let same annotated value = { annotated with value } -let fmt { start; end_; _ } = +let of_position : ?filename:string -> Lexing.position * Lexing.position -> span + = + fun ?filename (start, end_) -> + print_endline @@ "FILENAME: " ^ start.pos_fname; + let to_point Lexing.{ pos_lnum; pos_cnum; _ } = + { line = pos_lnum; column = pos_cnum } + in + let start_point = to_point start and end_point = to_point end_ in + { + file = Option.value ~default:start.pos_fname filename; + start = start_point; + end_ = end_point; + } + +let fmt { file; start; end_ } = let { line = sline; column = scol } = start and { line = eline; column = ecol } = end_ in - Printf.sprintf "start: { line : %d, col : %d }\nend: { line: %d; col: %d }" + Printf.sprintf + "file: %s\nstart: { line : %d, col : %d }\nend: { line: %d; col: %d }" file sline scol eline ecol let span spans = diff --git a/src/parser/loc.mli b/src/parser/loc.mli index 2e5ce49730..f04b95267a 100644 --- a/src/parser/loc.mli +++ b/src/parser/loc.mli @@ -36,6 +36,9 @@ val nudge_map_end : int -> 'a with_location -> 'a with_location val at : span -> 'a -> 'a with_location (** Constructor for {!with_location} *) +val of_position : ?filename:string -> Lexing.position * Lexing.position -> span +(** Convert Menhir's `$loc` or `$sloc` into a span *) + val location : 'a with_location -> span (** Returns the location of a located value *) diff --git a/src/parser/odoc_parser.ml b/src/parser/odoc_parser.ml index 0d649a421b..f9d68237c4 100644 --- a/src/parser/odoc_parser.ml +++ b/src/parser/odoc_parser.ml @@ -118,8 +118,7 @@ let parse_comment : location:Lexing.position -> text:string -> t = let lexbuf = Lexing.from_string text in (* We cannot directly pass parameters to Menhir without converting our parser to a module functor. So we pass our current filename to the lexbuf here *) - lexbuf.lex_curr_p <- - { lexbuf.lex_curr_p with pos_fname = location.Lexing.pos_fname }; + Lexing.(set_filename lexbuf location.pos_fname); let lexer_state = Lexer. { diff --git a/src/parser/parser.mly b/src/parser/parser.mly index d6497c6ce8..1820661ad0 100644 --- a/src/parser/parser.mly +++ b/src/parser/parser.mly @@ -1,11 +1,9 @@ %{ - open Parser_aux - open Writer.Prelude let ( let+ ) = Fun.flip Writer.map - let wrap_location : lexspan -> 'a -> 'a Loc.with_location = - fun loc value -> - let location = Parser_aux.to_location loc in + let wrap_location : Lexing.position * Lexing.position -> 'a -> 'a Loc.with_location = + fun pos value -> + let location = Loc.of_position pos in { location; value } let not_empty : 'a list -> bool = function _ :: _ -> true | _ -> false @@ -145,8 +143,6 @@ let split_replacement_media Loc.{ location; value = media, target, content } = (Loc.at location media, target, content) - let tag t = `Tag t - let trim_start = function | Loc.{value = `Space _; _ } :: xs -> xs | xs -> xs @@ -283,39 +279,63 @@ let main := let toplevel := | block = nestable_block_element; { Writer.map (fun b -> (b :> Ast.block_element) ) block } - | ~ = tag; <> - | ~ = heading; <> + | t = tag; { Writer.map (fun t -> `Tag t) t } + | ~ = section_heading; <> | ~ = toplevel_error; <> -let toplevel_error := +let toplevel_error := + | brace = located(RIGHT_BRACE); + { + let warning = fun ~filename -> + let span = Loc.of_position ~filename $sloc in + let what = Tokens.describe RIGHT_BRACE in + Parse_error.bad_markup what span + in + let as_text = Loc.same brace @@ `Word "{" in + let node = (`Paragraph [ as_text ]) in + Writer.with_warning node warning + } + | t = tag; RIGHT_BRACE; + { + let* tag_descr = Writer.map Tokens.describe_tag (t : Ast.tag Writer.t) in + let warning = fun ~filename -> + let span = Loc.of_position ~filename $sloc in + let what = Tokens.describe RIGHT_BRACE in + Parse_error.not_allowed ~what ~in_what:tag_descr span + in + let ret = Writer.map (fun t -> ( `Tag t : Ast.block_element )) t + |> Writer.warning warning + in + (ret : Ast.block_element Writer.t) + } | elt = locatedM(inline_element); RIGHT_BRACE; { let warning = fun ~filename -> - let span = Parser_aux.to_location ~filename $sloc in + let span = Loc.of_position ~filename $sloc in let what = Tokens.describe RIGHT_BRACE in let in_what = Tokens.describe_inline @@ Writer.unwrap_located elt in Parse_error.not_allowed ~what ~in_what span in let* elt = Writer.warning warning elt in - return (`Paragraph [elt] :> Ast.block_element) + return @@ `Paragraph [elt] } | elt = locatedM(inline_element); RIGHT_CODE_DELIMITER; { let warning = fun ~filename -> - let span = Parser_aux.to_location ~filename $sloc in + let span = Loc.of_position ~filename $sloc in let what = Tokens.describe RIGHT_CODE_DELIMITER in let in_what = Tokens.describe_inline @@ Writer.unwrap_located elt in Parse_error.not_allowed ~what ~in_what span in let* elt = Writer.warning warning elt in - return (`Paragraph [elt] :> Ast.block_element) + return @@ `Paragraph [elt] } (* SECTION HEADING *) -let heading := +let section_heading := | (num, title) = Section_heading; children = sequence(locatedM(inline_element)); RIGHT_BRACE; { let warning = fun ~filename -> - let span = Parser_aux.to_location ~filename $sloc in + let span = Loc.of_position ~filename $sloc in let what = Tokens.describe @@ Section_heading (num, title) in Parse_error.should_not_be_empty ~what span in @@ -323,15 +343,40 @@ let heading := |> Writer.map (fun c -> `Heading (num, title, trim_start c)) } + | (num, title) = Section_heading; whitespace?; RIGHT_CODE_DELIMITER; + { + let should_not_be_empty = fun ~filename -> + let span = Loc.of_position ~filename $sloc in + let what = Tokens.describe @@ Section_heading (num, title) in + Parse_error.should_not_be_empty ~what span + in + let not_allowed = fun ~filename -> + let span = Loc.of_position ~filename $sloc in + let what = Tokens.describe RIGHT_CODE_DELIMITER in + let in_what = Tokens.describe @@ Section_heading (num, title) in + Parse_error.not_allowed ~what ~in_what span + in + return @@ `Heading (num, title, []) + |> Writer.warning should_not_be_empty + |> Writer.warning not_allowed + } + (* TAGS *) let tag == - | t = tag_with_content; { Writer.map tag t } - | t = tag_bare; { Writer.map tag t } + | with_content = tag_with_content; { (with_content : Ast.tag Writer.t) } + | bare = tag_bare; { (bare : Ast.tag Writer.t) } let tag_with_content := | DEPRECATED; children = sequence_nonempty(locatedM(nestable_block_element)); { Writer.map (fun c -> `Deprecated c) children } + | DEPRECATED; RIGHT_BRACE; + { + let warning = fun ~filename -> + Parse_error.unpaired_right_brace @@ Loc.of_position ~filename $sloc + in + Writer.with_warning (`Deprecated []) warning + } | RETURN; children = sequence_nonempty(locatedM(nestable_block_element)); { Writer.map (fun c -> `Return c) children } | ~ = before; <> @@ -371,7 +416,7 @@ let tag_bare := { let what = Tokens.describe (Version version) in let warning = fun ~filename -> - let span = Parser_aux.to_location ~filename $sloc in + let span = Loc.of_position ~filename $sloc in Parse_error.should_not_be_empty ~what span in Writer.ensure has_content warning (return version) @@ -381,7 +426,7 @@ let tag_bare := { let what = Tokens.describe (Since version) in let warning = fun ~filename -> - let span = Parser_aux.to_location ~filename $sloc in + let span = Loc.of_position ~filename $sloc in Parse_error.should_not_be_empty ~what span in Writer.ensure has_content warning (return version) @@ -391,7 +436,7 @@ let tag_bare := { let what = Tokens.describe @@ Canonical "" in let warning = fun ~filename -> - let span = Parser_aux.to_location ~filename $sloc in + let span = Loc.of_position ~filename $sloc in Parse_error.should_not_be_empty ~what span in Writer.ensure (Loc.is has_content) warning @@ return impl @@ -401,7 +446,7 @@ let tag_bare := { let what = Tokens.describe @@ Author author in let warning = fun ~filename -> - let span = Parser_aux.to_location ~filename $sloc in + let span = Loc.of_position ~filename $sloc in Parse_error.should_not_be_empty ~what span in Writer.ensure has_content warning @@ return author @@ -434,7 +479,7 @@ let style := | style = Style; children = sequence(locatedM(inline_element)); RIGHT_BRACE; { let warning = fun ~filename -> - let span = Parser_aux.to_location ~filename $sloc in + let span = Loc.of_position ~filename $sloc in let what = Tokens.describe @@ Style style in Parse_error.should_not_be_empty ~what span in @@ -444,24 +489,39 @@ let style := | style = Style; RIGHT_BRACE; { let warning = fun ~filename -> - let span = Parser_aux.to_location ~filename $sloc in + let span = Loc.of_position ~filename $sloc in let what = Tokens.describe @@ Style style in Parse_error.should_not_be_empty ~what span in Writer.with_warning (`Styled (style, [])) warning } + | style = Style; RIGHT_CODE_DELIMITER; + { + + let style_desc = Tokens.describe @@ Style style in + let not_allowed = fun ~filename -> + let span = Loc.of_position ~filename $sloc in + let what = Tokens.describe RIGHT_CODE_DELIMITER in + Parse_error.not_allowed ~what ~in_what:style_desc span + in + let should_not_be_empty = fun ~filename -> + let span = Loc.of_position ~filename $sloc in + Parse_error.should_not_be_empty ~what:style_desc span + in + return (`Styled (style, [])) + |> Writer.warning not_allowed + |> Writer.warning should_not_be_empty + } | style = Style; END; { let warning = fun ~filename -> - let span = Parser_aux.to_location ~filename $sloc in + let span = Loc.of_position ~filename $sloc in let in_what = Tokens.describe @@ Style style in Parse_error.end_not_allowed ~in_what span in Writer.with_warning (`Styled (style, [])) warning } -let math_span := m = Math_span; { return @@ `Math_span m } - (* LINKS + REFS *) let reference := @@ -483,7 +543,7 @@ let reference := let what = Tokens.describe @@ Ref_with_replacement (Loc.value ref_body) in let warning = fun ~filename -> let span = - Parser_aux.to_location ~filename $sloc + Loc.of_position ~filename $sloc |> Loc.nudge_start (String.length "{{!") in Parse_error.should_not_be_empty ~what span @@ -499,7 +559,7 @@ let link := if "" = url then let what = Tokens.describe @@ Simple_link link_body in let warning = fun ~filename -> - let span = Parser_aux.to_location ~filename $sloc in + let span = Loc.of_position ~filename $sloc in Parse_error.should_not_be_empty ~what span in Writer.with_warning node warning @@ -512,7 +572,7 @@ let link := let node = `Link (link_body, c) in if "" = link_body then let what = Tokens.describe @@ Link_with_replacement link_body in - let span = Parser_aux.to_location $sloc in + let span = Loc.of_position $sloc in let warning = fun ~filename:_f -> Parse_error.should_not_be_empty ~what span in @@ -523,7 +583,7 @@ let link := | link_body = Link_with_replacement; whitespace?; RIGHT_BRACE; { let span = - Parser_aux.to_location $sloc + Loc.of_position $sloc |> Loc.nudge_start (String.length "{{!") in let node = `Link (link_body, []) in @@ -541,7 +601,7 @@ let list_light_item_unordered == | horizontal_whitespace; MINUS; item = locatedM(nestable_block_element); { let warning = fun ~filename -> - let span = Parser_aux.to_location ~filename $sloc in + let span = Loc.of_position ~filename $sloc in Parse_error.should_begin_on_its_own_line ~what:(Tokens.describe MINUS) span in Writer.warning warning item @@ -552,7 +612,7 @@ let list_light_item_ordered == | horizontal_whitespace; PLUS; item = locatedM(nestable_block_element); { let warning = fun ~filename -> - let span = Parser_aux.to_location ~filename $sloc in + let span = Loc.of_position ~filename $sloc in Parse_error.should_begin_on_its_own_line ~what:(Tokens.describe MINUS) span in Writer.warning warning item @@ -568,7 +628,7 @@ let item_heavy == | LI; whitespace; items = sequence(locatedM(nestable_block_element)); RIGHT_BRACE; whitespace?; { let warning = fun ~filename -> - let span = Parser_aux.to_location ~filename $sloc in + let span = Loc.of_position ~filename $sloc in Parse_error.should_not_be_empty ~what:(Tokens.describe LI) span in Writer.ensure not_empty warning items @@ -576,12 +636,12 @@ let item_heavy == | LI; items = sequence(locatedM(nestable_block_element)); RIGHT_BRACE; whitespace?; { let warning = fun ~filename -> - let span = Parser_aux.to_location ~filename $sloc in + let span = Loc.of_position ~filename $sloc in Parse_error.should_be_followed_by_whitespace ~what:(Tokens.describe LI) span in let writer = let warning = fun ~filename -> - let span = Parser_aux.to_location ~filename $sloc in + let span = Loc.of_position ~filename $sloc in Parse_error.should_not_be_empty ~what:(Tokens.describe LI) span in Writer.ensure not_empty warning items @@ -591,7 +651,7 @@ let item_heavy == | DASH; whitespace?; items = sequence(locatedM(nestable_block_element)); RIGHT_BRACE; whitespace?; { let warning = fun ~filename -> - let span = Parser_aux.to_location ~filename $sloc in + let span = Loc.of_position ~filename $sloc in Parse_error.should_not_be_empty ~what:(Tokens.describe LI) span in Writer.ensure not_empty warning items @@ -599,14 +659,14 @@ let item_heavy == | LI; whitespace?; items = sequence(locatedM(nestable_block_element))?; END; { let warning = fun ~filename -> - let span = Parser_aux.to_location ~filename $sloc in + let span = Loc.of_position ~filename $sloc in Parse_error.end_not_allowed ~in_what:(Tokens.describe LI) span in match items with | Some items -> let writer = let warning = fun ~filename -> - let span = Parser_aux.to_location ~filename $sloc in + let span = Loc.of_position ~filename $sloc in Parse_error.should_not_be_empty ~what:(Tokens.describe LI) span in Writer.ensure not_empty warning items @@ -618,14 +678,14 @@ let item_heavy == | DASH; whitespace?; items = sequence(locatedM(nestable_block_element))?; END; { let warning = fun ~filename -> - let span = Parser_aux.to_location ~filename $sloc in + let span = Loc.of_position ~filename $sloc in Parse_error.end_not_allowed ~in_what:(Tokens.describe DASH) span in match items with | Some items -> let writer = let warning = fun ~filename -> - let span = Parser_aux.to_location ~filename $sloc in + let span = Loc.of_position ~filename $sloc in Parse_error.should_not_be_empty ~what:(Tokens.describe DASH) span in Writer.ensure not_empty warning items @@ -636,15 +696,19 @@ let item_heavy == } let list_heavy := - | list_kind = List; whitespace?; items = sequence(item_heavy); whitespace?; RIGHT_BRACE; + | list_kind = List; whitespace?; items = sequence_nonempty(item_heavy); whitespace?; RIGHT_BRACE; + { + Writer.map (fun items -> `List (list_kind, `Heavy, items)) items + } + | list_kind = List; whitespace?; RIGHT_BRACE; { let warning = fun ~filename -> let what = Tokens.describe @@ List list_kind in - let span = Parser_aux.to_location ~filename $sloc in + let span = Loc.of_position ~filename $sloc in Parse_error.should_not_be_empty ~what span in - Writer.ensure not_empty warning items - |> Writer.map (fun items -> `List (list_kind, `Heavy, items)) + let node = `List (list_kind, `Heavy, []) in + Writer.with_warning node warning } let list_element := @@ -698,7 +762,7 @@ let table_light := { let in_what = Tokens.describe TABLE_LIGHT in let warning = fun ~filename -> - let span = Parser_aux.to_location ~filename $sloc in + let span = Loc.of_position ~filename $sloc in Parse_error.end_not_allowed ~in_what span in unclosed_table ~data warning @@ -707,7 +771,7 @@ let table_light := { let in_what = Tokens.describe TABLE_LIGHT in let warning = fun ~filename -> - let span = Parser_aux.to_location ~filename $sloc in + let span = Loc.of_position ~filename $sloc in Parse_error.end_not_allowed ~in_what span in unclosed_table warning @@ -756,7 +820,7 @@ let paragraph_style := | style = Paragraph_style; ws = paragraph; RIGHT_BRACE; { let warning = fun ~filename -> - let span = Parser_aux.to_location ~filename $sloc in + let span = Loc.of_position ~filename $sloc in let what = Tokens.describe @@ Paragraph_style style in Parse_error.markup_should_not_be_used span ~what in @@ -767,7 +831,7 @@ let verbatim := v = Verbatim; { let what = Tokens.describe @@ Verbatim v in let warning = fun ~filename -> - let span = Parser_aux.to_location ~filename $sloc in + let span = Loc.of_position ~filename $sloc in Parse_error.should_not_be_empty ~what span in Writer.ensure has_content warning (return v) @@ -778,13 +842,16 @@ let paragraph := | items = sequence_nonempty(locatedM(inline_element)); { Writer.map (fun i -> `Paragraph (trim_start i)) items } -let code_block := c = Code_block; { return (`Code_block c) } +let code_block := c = Code_block; + { + + return (`Code_block c) } let math_block := m = Math_block; { let what = Tokens.describe @@ Math_block m in let warning = fun ~filename -> - let span = Parser_aux.to_location ~filename $sloc in + let span = Loc.of_position ~filename $sloc in Parse_error.should_not_be_empty ~what span in Writer.ensure has_content warning (return m) diff --git a/src/parser/parser_aux.ml b/src/parser/parser_aux.ml deleted file mode 100644 index 1d3a3f721d..0000000000 --- a/src/parser/parser_aux.ml +++ /dev/null @@ -1,31 +0,0 @@ -type tag = - | Author of string - | Deprecated - | Param of string - | Raise of string - | Return - | See of [ `Url | `File | `Document ] * string - | Since of string - | Before of string - | Version of string - | Canonical of string - | Inline - | Open - | Closed - | Hidden - -let point_of_position Lexing.{ pos_lnum; pos_cnum; _ } = - Loc.{ line = pos_lnum; column = pos_cnum } - -type lexspan = Lexing.position * Lexing.position - -let to_location : ?filename:string -> lexspan -> Loc.span = - fun ?filename (start, end_) -> - let open Loc in - let start_point = point_of_position start - and end_point = point_of_position end_ in - { - file = Option.value ~default:start.pos_fname filename; - start = start_point; - end_ = end_point; - } diff --git a/src/parser/tokens.ml b/src/parser/tokens.ml index d9aec99a5e..61c5ffcc33 100644 --- a/src/parser/tokens.ml +++ b/src/parser/tokens.ml @@ -253,3 +253,19 @@ let describe_nestable_block : Ast.nestable_block_element -> string = function describe @@ if kind = `Light then TABLE_LIGHT else TABLE_HEAVY | `Math_block _ -> describe @@ Math_block "" | `Media _ as media -> describe @@ of_media media + +let describe_tag : Ast.tag -> string = function + | `See (kind, _, _) -> describe @@ See (kind, "") + | `Author s -> describe @@ Author s + | `Deprecated _ -> describe DEPRECATED + | `Param (s, _) -> describe @@ Param s + | `Raise (s, _) -> describe @@ Raise s + | `Return _ -> describe RETURN + | `Since v -> describe @@ Since v + | `Before (v, _) -> describe @@ Before v + | `Version v -> describe @@ Version v + | `Closed -> describe CLOSED + | `Open -> describe OPEN + | `Canonical Loc.{ value; _ } -> describe @@ Canonical value + | `Hidden -> describe HIDDEN + | `Inline -> describe INLINE From f6b6b8036586876a82155ee6935370b1a2dd5392 Mon Sep 17 00:00:00 2001 From: faycarsons Date: Thu, 12 Dec 2024 14:26:22 -0500 Subject: [PATCH 091/150] Remove filename from Writer.warning, function takes input string instead --- src/parser/odoc_parser.ml | 3 +- src/parser/odoc_parser.mli | 2 +- src/parser/parser.mly | 236 ++++++++++++++++--------------- src/parser/test_driver/tester.ml | 4 +- src/parser/writer.ml | 22 +-- 5 files changed, 136 insertions(+), 131 deletions(-) diff --git a/src/parser/odoc_parser.ml b/src/parser/odoc_parser.ml index f9d68237c4..6be1643bfd 100644 --- a/src/parser/odoc_parser.ml +++ b/src/parser/odoc_parser.ml @@ -130,8 +130,7 @@ let parse_comment : location:Lexing.position -> text:string -> t = in (* Remove the `Loc.with_location` wrapping our token because Menhir cannot handle that *) let ast, warnings = - Writer.run ~filename:lexer_state.file - @@ Parser.main (Lexer.token lexer_state) lexbuf + Writer.run ~input:text @@ Parser.main (Lexer.token lexer_state) lexbuf in { ast; diff --git a/src/parser/odoc_parser.mli b/src/parser/odoc_parser.mli index 41b49c816c..ecee24d90e 100644 --- a/src/parser/odoc_parser.mli +++ b/src/parser/odoc_parser.mli @@ -38,7 +38,7 @@ module Tester : sig comment_location:Lexing.position -> int -> Loc.point - val run : filename:string -> Ast.t Writer.t -> Ast.t * Warning.t list + val run : input:string -> Ast.t Writer.t -> Ast.t * Warning.t list val main : (Lexing.lexbuf -> token) -> Lexing.lexbuf -> Ast.t Writer.t val string_of_token : token -> string end diff --git a/src/parser/parser.mly b/src/parser/parser.mly index 1820661ad0..534b287129 100644 --- a/src/parser/parser.mly +++ b/src/parser/parser.mly @@ -286,10 +286,10 @@ let toplevel := let toplevel_error := | brace = located(RIGHT_BRACE); { - let warning = fun ~filename -> - let span = Loc.of_position ~filename $sloc in + let warning = + let span = Loc.of_position $sloc in let what = Tokens.describe RIGHT_BRACE in - Parse_error.bad_markup what span + Writer.Warning (Parse_error.bad_markup what span) in let as_text = Loc.same brace @@ `Word "{" in let node = (`Paragraph [ as_text ]) in @@ -298,10 +298,10 @@ let toplevel_error := | t = tag; RIGHT_BRACE; { let* tag_descr = Writer.map Tokens.describe_tag (t : Ast.tag Writer.t) in - let warning = fun ~filename -> - let span = Loc.of_position ~filename $sloc in + let warning = + let span = Loc.of_position $sloc in let what = Tokens.describe RIGHT_BRACE in - Parse_error.not_allowed ~what ~in_what:tag_descr span + Writer.Warning (Parse_error.not_allowed ~what ~in_what:tag_descr span) in let ret = Writer.map (fun t -> ( `Tag t : Ast.block_element )) t |> Writer.warning warning @@ -309,21 +309,21 @@ let toplevel_error := (ret : Ast.block_element Writer.t) } | elt = locatedM(inline_element); RIGHT_BRACE; - { let warning = fun ~filename -> - let span = Loc.of_position ~filename $sloc in + { let warning = + let span = Loc.of_position $sloc in let what = Tokens.describe RIGHT_BRACE in let in_what = Tokens.describe_inline @@ Writer.unwrap_located elt in - Parse_error.not_allowed ~what ~in_what span + Writer.Warning (Parse_error.not_allowed ~what ~in_what span) in let* elt = Writer.warning warning elt in return @@ `Paragraph [elt] } | elt = locatedM(inline_element); RIGHT_CODE_DELIMITER; - { let warning = fun ~filename -> - let span = Loc.of_position ~filename $sloc in + { let warning = + let span = Loc.of_position $sloc in let what = Tokens.describe RIGHT_CODE_DELIMITER in let in_what = Tokens.describe_inline @@ Writer.unwrap_located elt in - Parse_error.not_allowed ~what ~in_what span + Writer.Warning (Parse_error.not_allowed ~what ~in_what span) in let* elt = Writer.warning warning elt in return @@ `Paragraph [elt] @@ -334,10 +334,10 @@ let toplevel_error := let section_heading := | (num, title) = Section_heading; children = sequence(locatedM(inline_element)); RIGHT_BRACE; { - let warning = fun ~filename -> - let span = Loc.of_position ~filename $sloc in + let warning = + let span = Loc.of_position $sloc in let what = Tokens.describe @@ Section_heading (num, title) in - Parse_error.should_not_be_empty ~what span + Writer.Warning (Parse_error.should_not_be_empty ~what span) in Writer.ensure not_empty warning children |> Writer.map (fun c -> `Heading (num, title, trim_start c)) @@ -345,16 +345,16 @@ let section_heading := | (num, title) = Section_heading; whitespace?; RIGHT_CODE_DELIMITER; { - let should_not_be_empty = fun ~filename -> - let span = Loc.of_position ~filename $sloc in + let should_not_be_empty = + let span = Loc.of_position $sloc in let what = Tokens.describe @@ Section_heading (num, title) in - Parse_error.should_not_be_empty ~what span + Writer.Warning (Parse_error.should_not_be_empty ~what span) in - let not_allowed = fun ~filename -> - let span = Loc.of_position ~filename $sloc in + let not_allowed = + let span = Loc.of_position $sloc in let what = Tokens.describe RIGHT_CODE_DELIMITER in let in_what = Tokens.describe @@ Section_heading (num, title) in - Parse_error.not_allowed ~what ~in_what span + Writer.Warning (Parse_error.not_allowed ~what ~in_what span) in return @@ `Heading (num, title, []) |> Writer.warning should_not_be_empty @@ -372,8 +372,8 @@ let tag_with_content := { Writer.map (fun c -> `Deprecated c) children } | DEPRECATED; RIGHT_BRACE; { - let warning = fun ~filename -> - Parse_error.unpaired_right_brace @@ Loc.of_position ~filename $sloc + let warning = + Writer.Warning (Parse_error.unpaired_right_brace @@ Loc.of_position $sloc) in Writer.with_warning (`Deprecated []) warning } @@ -415,9 +415,9 @@ let tag_bare := | version = Version; { let what = Tokens.describe (Version version) in - let warning = fun ~filename -> - let span = Loc.of_position ~filename $sloc in - Parse_error.should_not_be_empty ~what span + let warning = + let span = Loc.of_position $sloc in + Writer.Warning (Parse_error.should_not_be_empty ~what span) in Writer.ensure has_content warning (return version) |> Writer.map (fun v -> `Version v) @@ -425,9 +425,9 @@ let tag_bare := | version = Since; { let what = Tokens.describe (Since version) in - let warning = fun ~filename -> - let span = Loc.of_position ~filename $sloc in - Parse_error.should_not_be_empty ~what span + let warning = + let span = Loc.of_position $sloc in + Writer.Warning (Parse_error.should_not_be_empty ~what span) in Writer.ensure has_content warning (return version) |> Writer.map (fun v -> `Since v) @@ -435,9 +435,9 @@ let tag_bare := | impl = located(Canonical); { let what = Tokens.describe @@ Canonical "" in - let warning = fun ~filename -> - let span = Loc.of_position ~filename $sloc in - Parse_error.should_not_be_empty ~what span + let warning = + let span = Loc.of_position $sloc in + Writer.Warning (Parse_error.should_not_be_empty ~what span) in Writer.ensure (Loc.is has_content) warning @@ return impl |> Writer.map (fun v -> `Canonical v) @@ -445,9 +445,9 @@ let tag_bare := | author = Author; { let what = Tokens.describe @@ Author author in - let warning = fun ~filename -> - let span = Loc.of_position ~filename $sloc in - Parse_error.should_not_be_empty ~what span + let warning = + let span = Loc.of_position $sloc in + Writer.Warning (Parse_error.should_not_be_empty ~what span) in Writer.ensure has_content warning @@ return author |> Writer.map (fun a -> `Author a) @@ -478,20 +478,20 @@ let inline_elt_legal_whitespace := let style := | style = Style; children = sequence(locatedM(inline_element)); RIGHT_BRACE; { - let warning = fun ~filename -> - let span = Loc.of_position ~filename $sloc in + let warning = + let span = Loc.of_position $sloc in let what = Tokens.describe @@ Style style in - Parse_error.should_not_be_empty ~what span + Writer.Warning (Parse_error.should_not_be_empty ~what span) in Writer.ensure not_empty warning children |> Writer.map (fun c -> `Styled (style, trim_start c)) } | style = Style; RIGHT_BRACE; { - let warning = fun ~filename -> - let span = Loc.of_position ~filename $sloc in + let warning = + let span = Loc.of_position $sloc in let what = Tokens.describe @@ Style style in - Parse_error.should_not_be_empty ~what span + Writer.Warning (Parse_error.should_not_be_empty ~what span) in Writer.with_warning (`Styled (style, [])) warning } @@ -499,14 +499,14 @@ let style := { let style_desc = Tokens.describe @@ Style style in - let not_allowed = fun ~filename -> - let span = Loc.of_position ~filename $sloc in + let not_allowed = + let span = Loc.of_position $sloc in let what = Tokens.describe RIGHT_CODE_DELIMITER in - Parse_error.not_allowed ~what ~in_what:style_desc span + Writer.Warning (Parse_error.not_allowed ~what ~in_what:style_desc span) in - let should_not_be_empty = fun ~filename -> - let span = Loc.of_position ~filename $sloc in - Parse_error.should_not_be_empty ~what:style_desc span + let should_not_be_empty = + let span = Loc.of_position $sloc in + Writer.Warning (Parse_error.should_not_be_empty ~what:style_desc span) in return (`Styled (style, [])) |> Writer.warning not_allowed @@ -514,10 +514,10 @@ let style := } | style = Style; END; { - let warning = fun ~filename -> - let span = Loc.of_position ~filename $sloc in + let warning = + let span = Loc.of_position $sloc in let in_what = Tokens.describe @@ Style style in - Parse_error.end_not_allowed ~in_what span + Writer.Warning (Parse_error.end_not_allowed ~in_what span) in Writer.with_warning (`Styled (style, [])) warning } @@ -541,12 +541,12 @@ let reference := { let node = `Reference (`With_text, ref_body, []) in let what = Tokens.describe @@ Ref_with_replacement (Loc.value ref_body) in - let warning = fun ~filename -> + let warning = let span = - Loc.of_position ~filename $sloc + Loc.of_position $sloc |> Loc.nudge_start (String.length "{{!") in - Parse_error.should_not_be_empty ~what span + Writer.Warning (Parse_error.should_not_be_empty ~what span) in Writer.with_warning node warning } @@ -558,9 +558,9 @@ let link := let url = String.trim link_body in if "" = url then let what = Tokens.describe @@ Simple_link link_body in - let warning = fun ~filename -> - let span = Loc.of_position ~filename $sloc in - Parse_error.should_not_be_empty ~what span + let warning = + let span = Loc.of_position $sloc in + Writer.Warning (Parse_error.should_not_be_empty ~what span) in Writer.with_warning node warning else @@ -573,8 +573,8 @@ let link := if "" = link_body then let what = Tokens.describe @@ Link_with_replacement link_body in let span = Loc.of_position $sloc in - let warning = fun ~filename:_f -> - Parse_error.should_not_be_empty ~what span + let warning = + Writer.Warning (Parse_error.should_not_be_empty ~what span) in Writer.with_warning node warning else @@ -588,8 +588,8 @@ let link := in let node = `Link (link_body, []) in let what = Tokens.describe @@ Link_with_replacement link_body in - let warning = fun ~filename:_f -> - Parse_error.should_not_be_empty ~what span + let warning = + Writer.Warning (Parse_error.should_not_be_empty ~what span) in Writer.with_warning node warning } @@ -600,9 +600,9 @@ let list_light_item_unordered == | MINUS; ~ = locatedM(nestable_block_element); <> | horizontal_whitespace; MINUS; item = locatedM(nestable_block_element); { - let warning = fun ~filename -> - let span = Loc.of_position ~filename $sloc in - Parse_error.should_begin_on_its_own_line ~what:(Tokens.describe MINUS) span + let warning = + let span = Loc.of_position $sloc in + Writer.Warning (Parse_error.should_begin_on_its_own_line ~what:(Tokens.describe MINUS) span) in Writer.warning warning item } @@ -611,9 +611,9 @@ let list_light_item_ordered == | PLUS; ~ = locatedM(nestable_block_element); <> | horizontal_whitespace; PLUS; item = locatedM(nestable_block_element); { - let warning = fun ~filename -> - let span = Loc.of_position ~filename $sloc in - Parse_error.should_begin_on_its_own_line ~what:(Tokens.describe MINUS) span + let warning = + let span = Loc.of_position $sloc in + Writer.Warning (Parse_error.should_begin_on_its_own_line ~what:(Tokens.describe MINUS) span) in Writer.warning warning item } @@ -627,22 +627,22 @@ let list_light := let item_heavy == | LI; whitespace; items = sequence(locatedM(nestable_block_element)); RIGHT_BRACE; whitespace?; { - let warning = fun ~filename -> - let span = Loc.of_position ~filename $sloc in - Parse_error.should_not_be_empty ~what:(Tokens.describe LI) span + let warning = + let span = Loc.of_position $sloc in + Writer.Warning (Parse_error.should_not_be_empty ~what:(Tokens.describe LI) span) in Writer.ensure not_empty warning items } | LI; items = sequence(locatedM(nestable_block_element)); RIGHT_BRACE; whitespace?; { - let warning = fun ~filename -> - let span = Loc.of_position ~filename $sloc in - Parse_error.should_be_followed_by_whitespace ~what:(Tokens.describe LI) span + let warning = + let span = Loc.of_position $sloc in + Writer.Warning (Parse_error.should_be_followed_by_whitespace ~what:(Tokens.describe LI) span) in let writer = - let warning = fun ~filename -> - let span = Loc.of_position ~filename $sloc in - Parse_error.should_not_be_empty ~what:(Tokens.describe LI) span + let warning = + let span = Loc.of_position $sloc in + Writer.Warning (Parse_error.should_not_be_empty ~what:(Tokens.describe LI) span) in Writer.ensure not_empty warning items in @@ -650,24 +650,24 @@ let item_heavy == } | DASH; whitespace?; items = sequence(locatedM(nestable_block_element)); RIGHT_BRACE; whitespace?; { - let warning = fun ~filename -> - let span = Loc.of_position ~filename $sloc in - Parse_error.should_not_be_empty ~what:(Tokens.describe LI) span + let warning = + let span = Loc.of_position $sloc in + Writer.Warning (Parse_error.should_not_be_empty ~what:(Tokens.describe LI) span) in Writer.ensure not_empty warning items } | LI; whitespace?; items = sequence(locatedM(nestable_block_element))?; END; { - let warning = fun ~filename -> - let span = Loc.of_position ~filename $sloc in - Parse_error.end_not_allowed ~in_what:(Tokens.describe LI) span + let warning = + let span = Loc.of_position $sloc in + Writer.Warning (Parse_error.end_not_allowed ~in_what:(Tokens.describe LI) span) in match items with | Some items -> let writer = - let warning = fun ~filename -> - let span = Loc.of_position ~filename $sloc in - Parse_error.should_not_be_empty ~what:(Tokens.describe LI) span + let warning = + let span = Loc.of_position $sloc in + Writer.Warning (Parse_error.should_not_be_empty ~what:(Tokens.describe LI) span) in Writer.ensure not_empty warning items in @@ -677,16 +677,16 @@ let item_heavy == } | DASH; whitespace?; items = sequence(locatedM(nestable_block_element))?; END; { - let warning = fun ~filename -> - let span = Loc.of_position ~filename $sloc in - Parse_error.end_not_allowed ~in_what:(Tokens.describe DASH) span + let warning = + let span = Loc.of_position $sloc in + Writer.Warning (Parse_error.end_not_allowed ~in_what:(Tokens.describe DASH) span) in match items with | Some items -> let writer = - let warning = fun ~filename -> - let span = Loc.of_position ~filename $sloc in - Parse_error.should_not_be_empty ~what:(Tokens.describe DASH) span + let warning = + let span = Loc.of_position $sloc in + Writer.Warning (Parse_error.should_not_be_empty ~what:(Tokens.describe DASH) span) in Writer.ensure not_empty warning items in @@ -702,10 +702,10 @@ let list_heavy := } | list_kind = List; whitespace?; RIGHT_BRACE; { - let warning = fun ~filename -> + let warning = let what = Tokens.describe @@ List list_kind in - let span = Loc.of_position ~filename $sloc in - Parse_error.should_not_be_empty ~what span + let span = Loc.of_position $sloc in + Writer.Warning (Parse_error.should_not_be_empty ~what span) in let node = `List (list_kind, `Heavy, []) in Writer.with_warning node warning @@ -761,18 +761,18 @@ let table_light := | table_start_light; data = rows_light; END; { let in_what = Tokens.describe TABLE_LIGHT in - let warning = fun ~filename -> - let span = Loc.of_position ~filename $sloc in - Parse_error.end_not_allowed ~in_what span + let warning = + let span = Loc.of_position $sloc in + Writer.Warning (Parse_error.end_not_allowed ~in_what span) in unclosed_table ~data warning } | TABLE_LIGHT; any_whitespace?; END; { let in_what = Tokens.describe TABLE_LIGHT in - let warning = fun ~filename -> - let span = Loc.of_position ~filename $sloc in - Parse_error.end_not_allowed ~in_what span + let warning = + let span = Loc.of_position $sloc in + Writer.Warning (Parse_error.end_not_allowed ~in_what span) in unclosed_table warning } @@ -819,10 +819,10 @@ let nestable_block_element_inner := let paragraph_style := | style = Paragraph_style; ws = paragraph; RIGHT_BRACE; { - let warning = fun ~filename -> - let span = Loc.of_position ~filename $sloc in + let warning = + let span = Loc.of_position $sloc in let what = Tokens.describe @@ Paragraph_style style in - Parse_error.markup_should_not_be_used span ~what + Writer.Warning (Parse_error.markup_should_not_be_used span ~what) in Writer.warning warning ws } @@ -830,9 +830,9 @@ let paragraph_style := let verbatim := v = Verbatim; { let what = Tokens.describe @@ Verbatim v in - let warning = fun ~filename -> - let span = Loc.of_position ~filename $sloc in - Parse_error.should_not_be_empty ~what span + let warning = + let span = Loc.of_position $sloc in + Writer.Warning (Parse_error.should_not_be_empty ~what span) in Writer.ensure has_content warning (return v) |> Writer.map (fun v -> `Verbatim v) @@ -850,9 +850,9 @@ let code_block := c = Code_block; let math_block := m = Math_block; { let what = Tokens.describe @@ Math_block m in - let warning = fun ~filename -> - let span = Loc.of_position ~filename $sloc in - Parse_error.should_not_be_empty ~what span + let warning = + let span = Loc.of_position $sloc in + Writer.Warning (Parse_error.should_not_be_empty ~what span) in Writer.ensure has_content warning (return m) |> Writer.map (fun m -> `Math_block m) @@ -864,7 +864,7 @@ let modules := print_endline "INLINE"; let in_what = Tokens.describe MODULES in let* modules = modules in - let not_allowed = fun ~filename:_ -> + let not_allowed = let span = Loc.span @@ List.map Loc.location modules in let first_offending = List.find_opt @@ -874,12 +874,12 @@ let modules := (List.map Loc.value modules : Ast.inline_element list) in let what = Option.map Tokens.describe_inline first_offending |> Option.value ~default:String.empty in - Parse_error.not_allowed ~what ~in_what span + Writer.Warning (Parse_error.not_allowed ~what ~in_what span) in - let is_empty = fun ~filename:_ -> + let is_empty = let span = Loc.span @@ List.map Loc.location modules in let what = Tokens.describe MODULES in - Parse_error.should_not_be_empty ~what span + Writer.Warning (Parse_error.should_not_be_empty ~what span) in let inner = `Modules (List.map (Loc.map inline_element_inner) modules) in List.fold_left (fun writer (f, w) -> Writer.ensure f w writer) (return modules) [(not_empty, is_empty); (legal_module_list, not_allowed)] @@ -891,7 +891,7 @@ let modules := let in_what = Tokens.describe MODULES in let* modules = modules in let span = Loc.span @@ List.map Loc.location modules in - let not_allowed = fun ~filename:_ -> + let not_allowed = let first_offending = List.find_opt (function @@ -900,11 +900,13 @@ let modules := (List.map Loc.value modules : Ast.inline_element list) in let what = Option.map Tokens.describe_inline first_offending |> Option.value ~default:String.empty in - Parse_error.not_allowed ~what ~in_what span + Writer.Warning (Parse_error.not_allowed ~what ~in_what span) in - let unexpected_end = fun ~filename:_ -> - Parse_error.end_not_allowed ~in_what:(Tokens.describe MODULES) span + let unexpected_end = + Writer.Warning (Parse_error.end_not_allowed ~in_what:(Tokens.describe MODULES) span) in let inner = `Modules (List.map (Loc.map inline_element_inner) modules) in - Writer.with_warning inner not_allowed |> Writer.warning unexpected_end + return inner + |> Writer.warning not_allowed + |> Writer.warning unexpected_end } diff --git a/src/parser/test_driver/tester.ml b/src/parser/test_driver/tester.ml index 935e301896..633dec1c09 100644 --- a/src/parser/test_driver/tester.ml +++ b/src/parser/test_driver/tester.ml @@ -158,9 +158,7 @@ let run_test (label, case) = tok in try - let ast, warnings = - Parser.run ~filename:"Tester" @@ Parser.main get_tok lexbuf - in + let ast, warnings = Parser.run ~input:case @@ Parser.main get_tok lexbuf in let warnings = warnings @ input.warnings in let output = Format.asprintf "%a" parser_output (ast, warnings) in Left (label, output) diff --git a/src/parser/writer.ml b/src/parser/writer.ml index aee840ac5f..728f4bb186 100644 --- a/src/parser/writer.ml +++ b/src/parser/writer.ml @@ -1,5 +1,11 @@ -type partial_warning = filename:string -> Warning.t -type +'a t = Writer of ('a * partial_warning list) +(** An implementation of the Writer monad for parser error reporting *) + +type +'a t = Writer of ('a * warning list) +and warning = InputNeeded of (string -> Warning.t) | Warning of Warning.t + +let run_warning : input:string -> warning -> Warning.t = + fun ~input warning -> + match warning with InputNeeded f -> f input | Warning w -> w let return : 'a -> 'a t = fun x -> Writer (x, []) @@ -21,8 +27,8 @@ module Prelude = struct let ( >>= ) = bind let ( let* ) = bind let ( and* ) = bind - let ( let+ ) = map - let ( >|= ) = map + let ( let+ ) w f = map f w + let ( <$> ) = map let ( *> ) = seq_right let ( <* ) = seq_left end @@ -43,13 +49,13 @@ let traverse : ('a -> 'b t) -> 'a list -> 'b list t = let with_warning node warning = Writer (node, [ warning ]) -let ensure : ('a -> bool) -> partial_warning -> 'a t -> 'a t = +let ensure : ('a -> bool) -> warning -> 'a t -> 'a t = fun pred warning (Writer (x, ws) as self) -> if pred x then self else Writer (x, warning :: ws) -let run : filename:string -> Ast.t t -> Ast.t * Warning.t list = - fun ~filename (Writer (tree, warnings)) -> - (tree, List.map (fun f -> f ~filename) warnings) +let run : input:string -> Ast.t t -> Ast.t * Warning.t list = + fun ~input (Writer (tree, warnings)) -> + (tree, List.map (run_warning ~input) warnings) let unwrap : 'a t -> 'a = fun (Writer (x, _)) -> x let unwrap_located : 'a Loc.with_location t -> 'a = From 1525efdff87c48b919e216e408f6af31618583c2 Mon Sep 17 00:00:00 2001 From: faycarsons Date: Fri, 13 Dec 2024 11:46:59 -0500 Subject: [PATCH 092/150] Add general illegal syntax error for undecidable cases --- src/parser/loc.ml | 12 ++++++++++-- src/parser/loc.mli | 6 ++++++ src/parser/parse_error.ml | 3 +++ src/parser/parser.mly | 29 ++++++++++++++++++++++++----- 4 files changed, 43 insertions(+), 7 deletions(-) diff --git a/src/parser/loc.ml b/src/parser/loc.ml index 0ad9ba9722..4728cf4877 100644 --- a/src/parser/loc.ml +++ b/src/parser/loc.ml @@ -13,8 +13,8 @@ let of_position : ?filename:string -> Lexing.position * Lexing.position -> span = fun ?filename (start, end_) -> print_endline @@ "FILENAME: " ^ start.pos_fname; - let to_point Lexing.{ pos_lnum; pos_cnum; _ } = - { line = pos_lnum; column = pos_cnum } + let to_point Lexing.{ pos_lnum; pos_cnum; pos_bol; _ } = + { line = pos_lnum; column = pos_cnum - pos_bol } in let start_point = to_point start and end_point = to_point end_ in { @@ -23,6 +23,14 @@ let of_position : ?filename:string -> Lexing.position * Lexing.position -> span end_ = end_point; } +let extract : + input:string -> + start_pos:Lexing.position -> + end_pos:Lexing.position -> + string = + fun ~input ~start_pos ~end_pos -> + String.sub input start_pos.pos_cnum (end_pos.pos_cnum - start_pos.pos_cnum) + let fmt { file; start; end_ } = let { line = sline; column = scol } = start and { line = eline; column = ecol } = end_ in diff --git a/src/parser/loc.mli b/src/parser/loc.mli index f04b95267a..16fac93439 100644 --- a/src/parser/loc.mli +++ b/src/parser/loc.mli @@ -29,6 +29,7 @@ type +'a with_location = { location : span; value : 'a } (** Describes values located at a particular span *) val fmt : span -> string +(** Format a `span` and return the resulting string *) val nudge_map_start : int -> 'a with_location -> 'a with_location val nudge_map_end : int -> 'a with_location -> 'a with_location @@ -39,6 +40,11 @@ val at : span -> 'a -> 'a with_location val of_position : ?filename:string -> Lexing.position * Lexing.position -> span (** Convert Menhir's `$loc` or `$sloc` into a span *) +val extract : + input:string -> start_pos:Lexing.position -> end_pos:Lexing.position -> string +(** Given a starting positon and an ending position extract the portion of the + input text between those positions *) + val location : 'a with_location -> span (** Returns the location of a located value *) diff --git a/src/parser/parse_error.ml b/src/parser/parse_error.ml index a07a8a24bb..e13ad380e6 100644 --- a/src/parser/parse_error.ml +++ b/src/parser/parse_error.ml @@ -1,5 +1,8 @@ let capitalize_ascii = Astring.String.Ascii.capitalize +let illegal : string -> Loc.span -> Warning.t = + Warning.make "Illegal character or syntax in '%s'" + let bad_markup : ?suggestion:string -> string -> Loc.span -> Warning.t = fun ?suggestion -> Warning.make ?suggestion "'%s': bad markup." diff --git a/src/parser/parser.mly b/src/parser/parser.mly index 534b287129..4dad9cad98 100644 --- a/src/parser/parser.mly +++ b/src/parser/parser.mly @@ -249,7 +249,7 @@ let legal_module_list : Ast.inline_element Loc.with_location list -> bool = (* Utility which wraps the return value of a rule in `Loc.with_location` *) let locatedM(rule) == inner = rule; { Writer.map (wrap_location $sloc) inner } let located(rule) == inner = rule; { wrap_location $sloc inner } - +let position(rule) == _ = rule; { $sloc } let sequence(rule) == xs = list(rule); { Writer.sequence xs } let sequence_nonempty(rule) == xs = nonempty_list(rule); { Writer.sequence xs } @@ -343,7 +343,7 @@ let section_heading := |> Writer.map (fun c -> `Heading (num, title, trim_start c)) } - | (num, title) = Section_heading; whitespace?; RIGHT_CODE_DELIMITER; + | (num, title) = Section_heading; RIGHT_CODE_DELIMITER; { let should_not_be_empty = let span = Loc.of_position $sloc in @@ -710,7 +710,28 @@ let list_heavy := let node = `List (list_kind, `Heavy, []) in Writer.with_warning node warning } - + | list_kind = List; whitespace?; items = sequence_nonempty(item_heavy); errloc = position(error); + { + let warning = fun input -> + let (start_pos, end_pos) as loc = errloc in + let illegal_input = Loc.extract ~input ~start_pos ~end_pos in + let span = Loc.of_position loc in + Parse_error.illegal illegal_input span + in + let* items = Writer.warning (Writer.InputNeeded warning) items in + return @@ `List (list_kind, `Heavy, items) + } + | list_kind = List; whitespace?; errloc = position(error); + { + let warning = fun input -> + let (start_pos, end_pos) as loc = errloc in + let illegal_input = Loc.extract ~input ~start_pos ~end_pos in + let span = Loc.of_position loc in + Parse_error.illegal illegal_input span + in + return @@ `List (list_kind, `Heavy, []) + |> Writer.warning (Writer.InputNeeded warning) + } let list_element := | ~ = list_light; <> | ~ = list_heavy; <> @@ -861,7 +882,6 @@ let math_block := m = Math_block; let modules := | MODULES; modules = sequence(locatedM(inline_element)); RIGHT_BRACE; { - print_endline "INLINE"; let in_what = Tokens.describe MODULES in let* modules = modules in let not_allowed = @@ -887,7 +907,6 @@ let modules := } | MODULES; modules = sequence(locatedM(inline_element)); END; { - print_endline "INLINE + EOI"; let in_what = Tokens.describe MODULES in let* modules = modules in let span = Loc.span @@ List.map Loc.location modules in From f286e81dc1ecde0b8c5dde6fb6543751e99da44c Mon Sep 17 00:00:00 2001 From: faycarsons Date: Tue, 17 Dec 2024 16:56:33 -0500 Subject: [PATCH 093/150] fix heavy tables, remove unnecessary whitespace preceding RIGHT_BRACE --- src/parser/dune | 2 +- src/parser/loc.ml | 1 - src/parser/parser.mly | 64 +++++++++++++++++++++++++++---- src/parser/test_driver/tester.ml | 66 ++++++++++++++++++++------------ 4 files changed, 100 insertions(+), 33 deletions(-) diff --git a/src/parser/dune b/src/parser/dune index 06471b9775..a95d2ba29c 100644 --- a/src/parser/dune +++ b/src/parser/dune @@ -2,7 +2,7 @@ (menhir (modules parser) - (flags --table --external-tokens Tokens)) + (flags --table --external-tokens Tokens --explain --trace)) (library (name odoc_parser) diff --git a/src/parser/loc.ml b/src/parser/loc.ml index 4728cf4877..587e968526 100644 --- a/src/parser/loc.ml +++ b/src/parser/loc.ml @@ -12,7 +12,6 @@ let same annotated value = { annotated with value } let of_position : ?filename:string -> Lexing.position * Lexing.position -> span = fun ?filename (start, end_) -> - print_endline @@ "FILENAME: " ^ start.pos_fname; let to_point Lexing.{ pos_lnum; pos_cnum; pos_bol; _ } = { line = pos_lnum; column = pos_cnum - pos_bol } in diff --git a/src/parser/parser.mly b/src/parser/parser.mly index 4dad9cad98..9879cfb9fc 100644 --- a/src/parser/parser.mly +++ b/src/parser/parser.mly @@ -240,6 +240,9 @@ let legal_module_list : Ast.inline_element Loc.with_location list -> bool = %token Verbatim "{v" %token END + +%on_error_reduce inline_element + %start main %% @@ -271,6 +274,10 @@ let any_whitespace := | ~ = whitespace; <> | ~ = Blank_line; <`Space> +let line_breaks := + | Blank_line; {} + | Single_newline; {} + (* ENTRY *) let main := @@ -580,7 +587,7 @@ let link := else return node } - | link_body = Link_with_replacement; whitespace?; RIGHT_BRACE; + | link_body = Link_with_replacement; RIGHT_BRACE; { let span = Loc.of_position $sloc @@ -696,11 +703,11 @@ let item_heavy == } let list_heavy := - | list_kind = List; whitespace?; items = sequence_nonempty(item_heavy); whitespace?; RIGHT_BRACE; + | list_kind = List; whitespace?; items = sequence_nonempty(item_heavy); RIGHT_BRACE; { Writer.map (fun items -> `List (list_kind, `Heavy, items)) items } - | list_kind = List; whitespace?; RIGHT_BRACE; + | list_kind = List; RIGHT_BRACE; { let warning = let what = Tokens.describe @@ List list_kind in @@ -732,20 +739,63 @@ let list_heavy := return @@ `List (list_kind, `Heavy, []) |> Writer.warning (Writer.InputNeeded warning) } + let list_element := | ~ = list_light; <> | ~ = list_heavy; <> (* TABLES *) -let cell_heavy := cell_kind = Table_cell; whitespace?; children = sequence(locatedM(nestable_block_element)); whitespace?; RIGHT_BRACE; whitespace?; - { Writer.map (fun c -> (c, cell_kind)) children } +let cell_heavy := + | cell_kind = Table_cell; whitespace?; children = sequence_nonempty(locatedM(nestable_block_element)); RIGHT_BRACE; whitespace?; + { Writer.map (fun c -> (c, cell_kind)) children } + | cell_kind = Table_cell; RIGHT_BRACE; whitespace?; + { return ([], cell_kind) } + | cell_kind = Table_cell; children = sequence_nonempty(locatedM(nestable_block_element))?; errloc = position(error); + { + let warning = fun input -> + let (start_pos, end_pos) as loc = errloc in + let illegal_input = Loc.extract ~input ~start_pos ~end_pos in + let span = Loc.of_position loc in + Parse_error.illegal illegal_input span + in + Option.value ~default:(return []) children + |> Writer.map (fun children -> (children, cell_kind)) + |> Writer.warning (Writer.InputNeeded warning) + } let row_heavy := | TABLE_ROW; whitespace?; ~ = sequence_nonempty(cell_heavy); RIGHT_BRACE; whitespace?; <> + | TABLE_ROW; whitespace?; RIGHT_BRACE; whitespace?; { return [] } + | TABLE_ROW; children = sequence_nonempty(cell_heavy)?; errloc = position(error); + { + let warning = fun input -> + let (start_pos, end_pos) as loc = errloc in + let illegal_input = Loc.extract ~input ~start_pos ~end_pos in + let span = Loc.of_position loc in + Parse_error.illegal illegal_input span + in + Option.value ~default:(return []) children + |> Writer.warning (Writer.InputNeeded warning) + } -let table_heavy := TABLE_HEAVY; whitespace?; grid = sequence_nonempty(row_heavy); RIGHT_BRACE; - { Writer.map (fun g -> `Table ((g, None), `Heavy)) grid } +let table_heavy := + | TABLE_HEAVY; whitespace?; grid = sequence_nonempty(row_heavy); RIGHT_BRACE; + { Writer.map (fun g -> `Table ((g, None), `Heavy)) grid } + | TABLE_HEAVY; RIGHT_BRACE; + { return (`Table (([], None), `Heavy)) } + | TABLE_HEAVY; whitespace?; grid = sequence_nonempty(row_heavy)?; errloc = position(error); + { + let warning = fun input -> + let (start_pos, end_pos) as loc = errloc in + let illegal_input = Loc.extract ~input ~start_pos ~end_pos in + let span = Loc.of_position loc in + Parse_error.illegal illegal_input span + in + Option.value ~default:(return []) grid + |> Writer.map (fun grid -> `Table ((grid, None), `Heavy)) + |> Writer.warning (Writer.InputNeeded warning) + } (* LIGHT TABLE *) diff --git a/src/parser/test_driver/tester.ml b/src/parser/test_driver/tester.ml index 633dec1c09..3b06379a15 100644 --- a/src/parser/test_driver/tester.ml +++ b/src/parser/test_driver/tester.ml @@ -42,30 +42,48 @@ let error_recovery = ("EOI in modules", "{!modules: Foo Bar"); ] -let utf = +let table_heavy = [ - ("lambda", "\xce\xbb"); - ("words", "\xce\xbb \xce\xbb"); - ("no_validation", "Î"); - ("escapes", "\xce\xbb\\"); - ("newline", "\xce\xbb \n \xce\xbb"); - ("paragraphs", "\xce\xbb \n\n \xce\xbb"); - ("code_span", "[\xce\xbb]"); - ("minuys", "\xce\xbb-\xce\xbb"); - ("shorthand list", "- \xce\xbb"); - ("styled", "{b \xce\xbb}"); - ("reference_target", "{!\xce\xbb}"); - ("code block ", "{[\xce\xbb]}"); - ("verbatim", "{v \xce\xbb v}"); - ("label", "{2:\xce\xbb Bar}"); - ("author", "@author \xce\xbb"); - ("param", "@param \xce\xbb"); - ("raise", "@raise \xce\xbb"); - ("see", "@see <\xce\xbb>"); - ("since", "@since \xce\xbb"); - ("before", "@before \xce\xbb"); - ("version", "@version \xce\xbb"); - ("right brace", "\xce\xbb}"); + (* name, case *) + (* + + ("empty_table_heavy", "{table }"); + ("empty_row", "{table {tr } }"); + ("no_header", "{table {tr {td}}}"); + ("no_data", "{table {tr {th}}}"); + ("bad_data", "{table absurd content}"); + ("bad_row", "{table {tr absurd content}}"); + ("multiple_headers", "{table {tr {th}} {tr {th}} {tr {td}}}"); + ("unclosed_table", "{table {tr {td}}"); + *) + ( "complex_table", + "{table\n\ + \ {tr\n\ + \ {th xxx}\n\ + \ {th yyy}\n\ + \ }\n\ + \ {tr\n\ + \ {td aaaa bbb ccc {i ddd}\n\ + \ }\n\ + \ {td\n\ + \ {table {tr {td}}}\n\ + \ }\n\ + \ }\n\ + \ {tr\n\ + \ {td\n\ + \ - aaa\n\ + \ - bbb\n\ + \ - ccc\n\ + \ }\n\ + \ {td\n\ + \ {t\n\ + \ x | y | z\n\ + \ --|---|--\n\ + \ 1 | 2 | 3\n\ + \ }\n\ + \ }\n\ + \ }\n\ + \ }" ); ] (* Cases (mostly) taken from the 'odoc for library authors' document *) @@ -209,7 +227,7 @@ let () = | _ -> print_endline "unrecognized argument - running documentation_cases"; documentation_cases) - else utf + else table_heavy in let sucesses, failures = List.partition_map run_test cases in let sucesses = format_successes sucesses in From 182a85fab84098a8ff045cc1457d3c5840e95364 Mon Sep 17 00:00:00 2001 From: faycarsons Date: Wed, 18 Dec 2024 11:55:35 -0500 Subject: [PATCH 094/150] Location tracking improved --- src/parser/lexer.mll | 92 ++++++++++++++++++++------------ src/parser/test/serialize.ml | 3 +- src/parser/test_driver/tester.ml | 55 +++++-------------- 3 files changed, 72 insertions(+), 78 deletions(-) diff --git a/src/parser/lexer.mll b/src/parser/lexer.mll index a448f3e6ac..3cd52aa98c 100644 --- a/src/parser/lexer.mll +++ b/src/parser/lexer.mll @@ -1,7 +1,14 @@ { - open Tokens +(* NOTE: + I think that, ideally, this would be rewritten from scratch, removing the + currying in `emit` and `with_location_adjustments` that makes this code + difficult to understand + + Additionally, I there think there are probably a few things here that could + be better handled by the parser. *) + let unescape_word : string -> string = fun s -> (* The common case is that there are no escape sequences. *) match String.index s '\\' with @@ -37,6 +44,10 @@ let math_constr kind x = | Inline -> Math_span x | Block -> Math_block x +let copy : 'a -> 'a = fun t -> + Obj.magic t |> Obj.dup |> Obj.magic + + (* This is used for code and verbatim blocks. It can be done with a regular expression, but the regexp gets quite ugly, so a function is easier to understand. *) @@ -160,10 +171,15 @@ let with_loc : Lexing.lexbuf -> input -> 'a -> 'a Loc.with_location = let location = mkloc input lexbuf in Loc.at location -let with_location_adjustments : (Lexing.lexbuf -> input -> 'a) -> Lexing.lexbuf +let with_location_adjustments : + (Lexing.lexbuf -> input -> 'a) + -> Lexing.lexbuf -> input - -> ?start_offset:int -> ?adjust_start_by:string -> ?end_offset:int - -> ?adjust_end_by:string -> 'a + -> ?start_offset:int + -> ?adjust_start_by:string + -> ?end_offset:int + -> ?adjust_end_by:string + -> 'a = fun k lexbuf input ?start_offset ?adjust_start_by ?end_offset ?adjust_end_by @@ -189,7 +205,7 @@ let with_location_adjustments : (Lexing.lexbuf -> input -> 'a) -> Lexing.lexbuf | None -> end_ | Some s -> end_ - String.length s in - lexbuf.lex_start_p <- { lexbuf.lex_start_p with pos_cnum = start }; + lexbuf.lex_start_p <- { lexbuf.lex_start_p with pos_cnum = start + lexbuf.lex_start_p.pos_bol }; lexbuf.lex_curr_p <- { lexbuf.lex_curr_p with pos_cnum = end_ }; k lexbuf input value @@ -453,10 +469,10 @@ and token input = parse { emit lexbuf input (Style `Subscript) } | "{math" space_char - { math Block (Buffer.create 1024) 0 (Lexing.lexeme_start lexbuf) input lexbuf } + { math Block (Buffer.create 1024) 0 (Lexing.lexeme_start lexbuf) (copy lexbuf.lex_curr_p) input lexbuf } | "{m" horizontal_space - { math Inline (Buffer.create 1024) 0 (Lexing.lexeme_start lexbuf) input lexbuf } + { math Inline (Buffer.create 1024) 0 (Lexing.lexeme_start lexbuf) (copy lexbuf.lex_curr_p) input lexbuf } | "{!modules:" @@ -668,11 +684,12 @@ and code_span buffer nesting_level start_offset input = parse code_span buffer nesting_level start_offset input lexbuf } | newline horizontal_space* (newline horizontal_space*)+ - { let w = Parse_error.not_allowed + { let not_allowed = + Parse_error.not_allowed ~what:(Tokens.describe (Blank_line "\n\n")) ~in_what:(Tokens.describe (Code_span "")) in - warning lexbuf input w; + warning lexbuf input not_allowed; Buffer.add_char buffer ' '; code_span buffer nesting_level start_offset input lexbuf } | newline horizontal_space* @@ -692,49 +709,56 @@ and code_span buffer nesting_level start_offset input = parse { Buffer.add_char buffer c; code_span buffer nesting_level start_offset input lexbuf } -and math kind buffer nesting_level start_offset input = parse +and math kind buffer nesting_level start_offset start_pos input = parse | '}' - { if nesting_level == 0 then - emit lexbuf input (math_constr kind (Buffer.contents buffer)) ~start_offset - else begin + { + if nesting_level == 0 then ( + lexbuf.lex_start_p <- start_pos; + emit lexbuf input (math_constr kind (Buffer.contents buffer)) ~start_offset) + else ( Buffer.add_char buffer '}'; - math kind buffer (pred nesting_level) start_offset input lexbuf - end - } + math kind buffer (pred nesting_level) start_offset start_pos input lexbuf) + } | '{' - { Buffer.add_char buffer '{'; - math kind buffer (succ nesting_level) start_offset input lexbuf } + { + Buffer.add_char buffer '{'; + math kind buffer (succ nesting_level) start_offset start_pos input lexbuf + } | ("\\{" | "\\}") as s { Buffer.add_string buffer s; - math kind buffer nesting_level start_offset input lexbuf } + math kind buffer nesting_level start_offset start_pos input lexbuf } | (newline) as s { + (* + Printf.printf "Adding newline!\nPosition:\n %s\n" (Loc.fmt @@ Loc.of_position ( lexbuf.lex_start_p, lexbuf.lex_curr_p )); + *) Lexing.new_line lexbuf; match kind with | Inline -> - warning - lexbuf - input - (Parse_error.not_allowed + let not_allowed = + Parse_error.not_allowed ~what:(Tokens.describe (Blank_line "\n")) - ~in_what:(Tokens.describe (math_constr kind ""))); + ~in_what:(Tokens.describe (math_constr kind "")) + in + warning lexbuf input not_allowed; Buffer.add_char buffer '\n'; - math kind buffer nesting_level start_offset input lexbuf + math kind buffer nesting_level start_offset start_pos input lexbuf | Block -> Buffer.add_string buffer s; - math kind buffer nesting_level start_offset input lexbuf + math kind buffer nesting_level start_offset start_pos input lexbuf } | eof - { warning - lexbuf - input - (Parse_error.not_allowed - ~what:(Tokens.describe END) - ~in_what:(Tokens.describe (math_constr kind ""))); - emit lexbuf input (math_constr kind (Buffer.contents buffer)) ~start_offset } + { + let unexpected_eof = + Parse_error.end_not_allowed + ~in_what:(Tokens.describe (math_constr kind "")) + in + warning lexbuf input unexpected_eof; + emit lexbuf input (math_constr kind (Buffer.contents buffer)) ~start_offset + } | _ as c { Buffer.add_char buffer c; - math kind buffer nesting_level start_offset input lexbuf } + math kind buffer nesting_level start_offset start_pos input lexbuf } and media tok_descr buffer nesting_level start_offset input = parse | '}' diff --git a/src/parser/test/serialize.ml b/src/parser/test/serialize.ml index 8b8da753f4..8dcf0e6f3b 100644 --- a/src/parser/test/serialize.ml +++ b/src/parser/test/serialize.ml @@ -1,4 +1,4 @@ - open Odoc_parser +open Odoc_parser type sexp = Sexplib0.Sexp.t = Atom of string | List of sexp list @@ -177,4 +177,3 @@ module Ast_to_sexp = struct let docs at : Ast.t -> sexp = fun f -> List (List.map (at.at (block_element at)) f) end - diff --git a/src/parser/test_driver/tester.ml b/src/parser/test_driver/tester.ml index 3b06379a15..fc0ad784bb 100644 --- a/src/parser/test_driver/tester.ml +++ b/src/parser/test_driver/tester.ml @@ -42,48 +42,19 @@ let error_recovery = ("EOI in modules", "{!modules: Foo Bar"); ] -let table_heavy = +let math = [ - (* name, case *) - (* - - ("empty_table_heavy", "{table }"); - ("empty_row", "{table {tr } }"); - ("no_header", "{table {tr {td}}}"); - ("no_data", "{table {tr {th}}}"); - ("bad_data", "{table absurd content}"); - ("bad_row", "{table {tr absurd content}}"); - ("multiple_headers", "{table {tr {th}} {tr {th}} {tr {td}}}"); - ("unclosed_table", "{table {tr {td}}"); - *) - ( "complex_table", - "{table\n\ - \ {tr\n\ - \ {th xxx}\n\ - \ {th yyy}\n\ - \ }\n\ - \ {tr\n\ - \ {td aaaa bbb ccc {i ddd}\n\ - \ }\n\ - \ {td\n\ - \ {table {tr {td}}}\n\ - \ }\n\ - \ }\n\ - \ {tr\n\ - \ {td\n\ - \ - aaa\n\ - \ - bbb\n\ - \ - ccc\n\ - \ }\n\ - \ {td\n\ - \ {t\n\ - \ x | y | z\n\ - \ --|---|--\n\ - \ 1 | 2 | 3\n\ - \ }\n\ - \ }\n\ - \ }\n\ - \ }" ); + ( "position", + {|{math + \alpha(x)=\left\{ + \begin{array}{ll} % beginning of the array + x \% 4\\ % some variable modulo 4 + \frac{1}{1+e^{-kx}}\\ % something else + \frac{e^x-e^{-x}}{e^x+e^{-x}} % another action + \end{array} % end of the array + \right. + }|} + ); ] (* Cases (mostly) taken from the 'odoc for library authors' document *) @@ -227,7 +198,7 @@ let () = | _ -> print_endline "unrecognized argument - running documentation_cases"; documentation_cases) - else table_heavy + else math in let sucesses, failures = List.partition_map run_test cases in let sucesses = format_successes sucesses in From c2f28cf5f88925d0bd1efe65c1fe230cfbbc8c71 Mon Sep 17 00:00:00 2001 From: faycarsons Date: Wed, 18 Dec 2024 13:19:02 -0500 Subject: [PATCH 095/150] Fix toplevel stray brace warning --- src/parser/dune | 2 +- src/parser/parser.mly | 13 ++++++------- 2 files changed, 7 insertions(+), 8 deletions(-) diff --git a/src/parser/dune b/src/parser/dune index a95d2ba29c..69d306aed1 100644 --- a/src/parser/dune +++ b/src/parser/dune @@ -2,7 +2,7 @@ (menhir (modules parser) - (flags --table --external-tokens Tokens --explain --trace)) + (flags --table --external-tokens Tokens --explain)) (library (name odoc_parser) diff --git a/src/parser/parser.mly b/src/parser/parser.mly index 9879cfb9fc..a803b9bb2e 100644 --- a/src/parser/parser.mly +++ b/src/parser/parser.mly @@ -315,15 +315,14 @@ let toplevel_error := in (ret : Ast.block_element Writer.t) } - | elt = locatedM(inline_element); RIGHT_BRACE; - { let warning = - let span = Loc.of_position $sloc in - let what = Tokens.describe RIGHT_BRACE in - let in_what = Tokens.describe_inline @@ Writer.unwrap_located elt in - Writer.Warning (Parse_error.not_allowed ~what ~in_what span) + | elt = locatedM(inline_element); errloc = position(RIGHT_BRACE); + { + let span = Loc.of_position errloc in + let warning = + Writer.Warning (Parse_error.unpaired_right_brace span) in let* elt = Writer.warning warning elt in - return @@ `Paragraph [elt] + return @@ `Paragraph [elt; Loc.at span @@ `Word "}"] } | elt = locatedM(inline_element); RIGHT_CODE_DELIMITER; { let warning = From 27716be30a74bd10d2b94298ffb328bf5d0c159b Mon Sep 17 00:00:00 2001 From: faycarsons Date: Wed, 18 Dec 2024 13:34:04 -0500 Subject: [PATCH 096/150] convert polymorphic token types to nominal --- src/parser/lexer.mll | 26 +++++++------- src/parser/parser.mly | 22 ++++++------ src/parser/tokens.ml | 83 ++++++++++++++++++++++++++++--------------- 3 files changed, 79 insertions(+), 52 deletions(-) diff --git a/src/parser/lexer.mll b/src/parser/lexer.mll index 3cd52aa98c..8c0511afdb 100644 --- a/src/parser/lexer.mll +++ b/src/parser/lexer.mll @@ -445,28 +445,28 @@ and token input = parse { emit lexbuf input PLUS } | "{b" - { emit lexbuf input (Style `Bold) } + { emit lexbuf input (Style Bold) } | "{i" - { emit lexbuf input (Style `Italic) } + { emit lexbuf input (Style Italic) } | "{e" - { emit lexbuf input (Style `Emphasis) } + { emit lexbuf input (Style Emphasis) } | "{L" - { emit lexbuf input (Paragraph_style `Left) } + { emit lexbuf input (Paragraph_style Left) } | "{C" - { emit lexbuf input (Paragraph_style `Center) } + { emit lexbuf input (Paragraph_style Center) } | "{R" - { emit lexbuf input (Paragraph_style `Right) } + { emit lexbuf input (Paragraph_style Right) } | "{^" - { emit lexbuf input (Style `Superscript) } + { emit lexbuf input (Style Superscript) } | "{_" - { emit lexbuf input (Style `Subscript) } + { emit lexbuf input (Style Subscript) } | "{math" space_char { math Block (Buffer.create 1024) 0 (Lexing.lexeme_start lexbuf) (copy lexbuf.lex_curr_p) input lexbuf } @@ -542,10 +542,10 @@ and token input = parse emit lexbuf input token } | "{ul" - { emit lexbuf input (List `Unordered) } + { emit lexbuf input (List Unordered) } | "{ol" - { emit lexbuf input (List `Ordered) } + { emit lexbuf input (List Ordered) } | "{li" { emit lexbuf input LI } @@ -590,13 +590,13 @@ and token input = parse { emit lexbuf input RETURN } | "@see" horizontal_space* '<' ([^ '>']* as url) '>' - { emit lexbuf input (See (`Url, url)) } + { emit lexbuf input (See (URL, url)) } | "@see" horizontal_space* '\'' ([^ '\'']* as filename) '\'' - { emit lexbuf input (See (`File, filename)) } + { emit lexbuf input (See (File, filename)) } | "@see" horizontal_space* '"' ([^ '"']* as name) '"' - { emit lexbuf input (See (`Document, name)) } + { emit lexbuf input (See (Document, name)) } | "@since" ((horizontal_space+ [^ '\r' '\n']*)? as version) { emit lexbuf input (Since version) } diff --git a/src/parser/parser.mly b/src/parser/parser.mly index a803b9bb2e..2039b23c1a 100644 --- a/src/parser/parser.mly +++ b/src/parser/parser.mly @@ -185,10 +185,10 @@ let legal_module_list : Ast.inline_element Loc.with_location list -> bool = %token MINUS "-" %token PLUS "+" -%token Style "{i" (* or '{b' etc *) +%token Style "{i" (* or '{b' etc *) (* or '{C' or '{R', but this syntax has been deprecated and is only kept around so legacy codebases don't break :p *) -%token Paragraph_style "{L" +%token Paragraph_style "{L" %token MODULES "{!modules:" @@ -200,7 +200,7 @@ let legal_module_list : Ast.inline_element Loc.with_location list -> bool = %token Code_block "{[]}" %token Code_span "[]" -%token List "{ol" (* or '{ul' *) +%token List "{ol" (* or '{ul' *) %token LI "{li" %token DASH "{-" @@ -490,7 +490,7 @@ let style := Writer.Warning (Parse_error.should_not_be_empty ~what span) in Writer.ensure not_empty warning children - |> Writer.map (fun c -> `Styled (style, trim_start c)) + |> Writer.map (fun c -> `Styled (Tokens.to_ast_style style, trim_start c)) } | style = Style; RIGHT_BRACE; { @@ -499,7 +499,7 @@ let style := let what = Tokens.describe @@ Style style in Writer.Warning (Parse_error.should_not_be_empty ~what span) in - Writer.with_warning (`Styled (style, [])) warning + Writer.with_warning (`Styled (Tokens.to_ast_style style, [])) warning } | style = Style; RIGHT_CODE_DELIMITER; { @@ -514,7 +514,7 @@ let style := let span = Loc.of_position $sloc in Writer.Warning (Parse_error.should_not_be_empty ~what:style_desc span) in - return (`Styled (style, [])) + return (`Styled (Tokens.to_ast_style style, [])) |> Writer.warning not_allowed |> Writer.warning should_not_be_empty } @@ -525,7 +525,7 @@ let style := let in_what = Tokens.describe @@ Style style in Writer.Warning (Parse_error.end_not_allowed ~in_what span) in - Writer.with_warning (`Styled (style, [])) warning + Writer.with_warning (`Styled (Tokens.to_ast_style style, [])) warning } (* LINKS + REFS *) @@ -704,7 +704,7 @@ let item_heavy == let list_heavy := | list_kind = List; whitespace?; items = sequence_nonempty(item_heavy); RIGHT_BRACE; { - Writer.map (fun items -> `List (list_kind, `Heavy, items)) items + Writer.map (fun items -> `List (Tokens.ast_list_kind list_kind, `Heavy, items)) items } | list_kind = List; RIGHT_BRACE; { @@ -713,7 +713,7 @@ let list_heavy := let span = Loc.of_position $sloc in Writer.Warning (Parse_error.should_not_be_empty ~what span) in - let node = `List (list_kind, `Heavy, []) in + let node = `List (Tokens.ast_list_kind list_kind, `Heavy, []) in Writer.with_warning node warning } | list_kind = List; whitespace?; items = sequence_nonempty(item_heavy); errloc = position(error); @@ -725,7 +725,7 @@ let list_heavy := Parse_error.illegal illegal_input span in let* items = Writer.warning (Writer.InputNeeded warning) items in - return @@ `List (list_kind, `Heavy, items) + return @@ `List (Tokens.ast_list_kind list_kind, `Heavy, items) } | list_kind = List; whitespace?; errloc = position(error); { @@ -735,7 +735,7 @@ let list_heavy := let span = Loc.of_position loc in Parse_error.illegal illegal_input span in - return @@ `List (list_kind, `Heavy, []) + return @@ `List (Tokens.ast_list_kind list_kind, `Heavy, []) |> Writer.warning (Writer.InputNeeded warning) } diff --git a/src/parser/tokens.ml b/src/parser/tokens.ml index 61c5ffcc33..c9c3257f19 100644 --- a/src/parser/tokens.ml +++ b/src/parser/tokens.ml @@ -3,10 +3,18 @@ type ref_kind = Simple | With_replacement type media = Reference of string | Link of string type media_target = Audio | Video | Image -type paragraph_style = [ `Left | `Center | `Right ] +type alignment = Left | Center | Right -type style = [ `Bold | `Italic | `Emphasis | `Superscript | `Subscript ] -type table_cell_kind = [ `Header | `Data ] +type style = Bold | Italic | Emphasis | Superscript | Subscript +type table_cell_kind = Header | Data + +type list_kind = Ordered | Unordered + +type internal_reference = URL | File | Document + +let ast_list_kind : list_kind -> Ast.list_kind = function + | Ordered -> `Ordered + | Unordered -> `Unordered type token = | Space of string @@ -27,9 +35,9 @@ type token = | Verbatim of string | RIGHT_CODE_DELIMITER | RIGHT_BRACE - | Paragraph_style of paragraph_style + | Paragraph_style of alignment | Style of style - | List of Ast.list_kind + | List of list_kind | LI | DASH | TABLE_LIGHT @@ -45,7 +53,7 @@ type token = | Param of string | Raise of string | RETURN - | See of ([ `Url | `File | `Document ] * string) + | See of (internal_reference * string) | Since of string | Before of string | Version of string @@ -90,16 +98,16 @@ let print : token -> string = function | Verbatim _ -> "{v" | RIGHT_CODE_DELIMITER -> "]}" | RIGHT_BRACE -> "}" - | Paragraph_style `Left -> "'{L'" - | Paragraph_style `Center -> "'{C'" - | Paragraph_style `Right -> "'{R'" - | Style `Bold -> "'{b'" - | Style `Italic -> "'{i'" - | Style `Emphasis -> "'{e'" - | Style `Superscript -> "'{^'" - | Style `Subscript -> "'{_'" - | List `Ordered -> "{ol" - | List `Unordered -> "{ul" + | Paragraph_style Left -> "'{L'" + | Paragraph_style Center -> "'{C'" + | Paragraph_style Right -> "'{R'" + | Style Bold -> "'{b'" + | Style Italic -> "'{i'" + | Style Emphasis -> "'{e'" + | Style Superscript -> "'{^'" + | Style Subscript -> "'{_'" + | List Ordered -> "{ol" + | List Unordered -> "{ul" | LI -> "'{li ...}'" | DASH -> "'{- ...}'" | TABLE_LIGHT -> "{t" @@ -145,14 +153,14 @@ let describe : token -> string = function | Word w -> Printf.sprintf "'%s'" w | Code_span _ -> "'[...]' (code)" | Raw_markup _ -> "'{%...%}' (raw markup)" - | Paragraph_style `Left -> "'{L ...}' (left alignment)" - | Paragraph_style `Center -> "'{C ...}' (center alignment)" - | Paragraph_style `Right -> "'{R ...}' (right alignment)" - | Style `Bold -> "'{b ...}' (boldface text)" - | Style `Italic -> "'{i ...}' (italic text)" - | Style `Emphasis -> "'{e ...}' (emphasized text)" - | Style `Superscript -> "'{^...}' (superscript)" - | Style `Subscript -> "'{_...}' (subscript)" + | Paragraph_style Left -> "'{L ...}' (left alignment)" + | Paragraph_style Center -> "'{C ...}' (center alignment)" + | Paragraph_style Right -> "'{R ...}' (right alignment)" + | Style Bold -> "'{b ...}' (boldface text)" + | Style Italic -> "'{i ...}' (italic text)" + | Style Emphasis -> "'{e ...}' (emphasized text)" + | Style Superscript -> "'{^...}' (superscript)" + | Style Subscript -> "'{_...}' (subscript)" | Math_span _ -> "'{m ...}' (math span)" | Math_block _ -> "'{math ...}' (math block)" | Simple_ref _ -> "'{!...}' (cross-reference)" @@ -167,8 +175,8 @@ let describe : token -> string = function | Code_block _ -> "'{[...]}' (code block)" | Verbatim _ -> "'{v ... v}' (verbatim text)" | MODULES -> "'{!modules ...}'" - | List `Unordered -> "'{ul ...}' (bulleted list)" - | List `Ordered -> "'{ol ...}' (numbered list)" + | List Unordered -> "'{ul ...}' (bulleted list)" + | List Ordered -> "'{ol ...}' (numbered list)" | LI -> "'{li ...}' (list item)" | DASH -> "'{- ...}' (list item)" | TABLE_LIGHT -> "'{t ...}' (table)" @@ -215,10 +223,24 @@ let empty_code_block = output = None; } +let of_ast_style : Ast.style -> style = function + | `Bold -> Bold + | `Italic -> Italic + | `Emphasis -> Emphasis + | `Superscript -> Superscript + | `Subscript -> Subscript + +let to_ast_style : style -> Ast.style = function + | Bold -> `Bold + | Italic -> `Italic + | Emphasis -> `Emphasis + | Superscript -> `Superscript + | Subscript -> `Subscript + let describe_inline : Ast.inline_element -> string = function | `Word w -> describe @@ Word w | `Space _ -> describe @@ Space "" - | `Styled (style, _) -> describe @@ Style style + | `Styled (style, _) -> describe @@ Style (of_ast_style style) | `Code_span _ -> describe @@ Code_span "" | `Math_span _ -> describe @@ Math_span "" | `Raw_markup x -> describe @@ Raw_markup x @@ -254,8 +276,13 @@ let describe_nestable_block : Ast.nestable_block_element -> string = function | `Math_block _ -> describe @@ Math_block "" | `Media _ as media -> describe @@ of_media media +let of_ast_ref : [ `Document | `File | `Url ] -> internal_reference = function + | `Document -> Document + | `File -> File + | `Url -> URL + let describe_tag : Ast.tag -> string = function - | `See (kind, _, _) -> describe @@ See (kind, "") + | `See (kind, _, _) -> describe @@ See (of_ast_ref kind, "") | `Author s -> describe @@ Author s | `Deprecated _ -> describe DEPRECATED | `Param (s, _) -> describe @@ Param s From 5502b166798bfd78f55a439ae766773ff26d5840 Mon Sep 17 00:00:00 2001 From: faycarsons Date: Wed, 18 Dec 2024 14:10:08 -0500 Subject: [PATCH 097/150] remove TODO.md --- src/parser/TODO.md | 29 ----------------------------- src/parser/ast.ml | 6 ++++++ 2 files changed, 6 insertions(+), 29 deletions(-) delete mode 100644 src/parser/TODO.md diff --git a/src/parser/TODO.md b/src/parser/TODO.md deleted file mode 100644 index 40a5d044c5..0000000000 --- a/src/parser/TODO.md +++ /dev/null @@ -1,29 +0,0 @@ -- How to solve combinatorial explosion in elements like lists and tables? - - Ex: in a "heavy" list, an illegal token can exist in every nesting level - - {ul Something_illegal {li foo} {li bar} } - - {ul {li foo} Something_illegal {li bar} } - - {ul {li foo} {li bar Something_illegal} } - - Matching on each possible combination not only requires a great deal of - effort and repeated code, it weakens Menhir's ability to reduce the correct - rule - - My thought is that we do something like this: - ```ocaml - let list_error_cases := - | List; items = list_items+; loc = located(error); - { Warning.unexpected_at loc.location } - | List; loc = located(error); - { (* Same as above *)} - ``` - - With this strategy, we lose some information, but if we modify our - warning type (currently a function from filename to warning) to be a - function which takes the input text, we can at least show the user the specific - span that is illegal by mapping over our warnings and evaluating them. - - - So this warning looks something like - ```ocaml - fun ~filename input -> - String.split_on_char '\n' input - |> get_error_span location - |> Warning.unexpected_at location - - ``` diff --git a/src/parser/ast.ml b/src/parser/ast.ml index 28a3508168..79dda3c2f9 100644 --- a/src/parser/ast.ml +++ b/src/parser/ast.ml @@ -1,5 +1,11 @@ (** Abstract syntax tree representing ocamldoc comments *) +(* TODO: (@faycarsons) + We no longer need polymorphism in the parser, so for performance and + simplicity's sake the AST should (probably, assuming no issues in other + parts of Odoc) be refactored to use nominal sum types +*) + type list_kind = [ `Ordered | `Unordered ] type list_syntax = [ `Light | `Heavy ] type list_item = [ `Li | `Dash ] From c2c27f9c16b99f5ee10c573933019a2224354692 Mon Sep 17 00:00:00 2001 From: faycarsons Date: Wed, 18 Dec 2024 14:38:09 -0500 Subject: [PATCH 098/150] Fix @see kind --- src/parser/parser.mly | 6 +++--- src/parser/tokens.ml | 5 +++++ 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/src/parser/parser.mly b/src/parser/parser.mly index 2039b23c1a..c2bd58540d 100644 --- a/src/parser/parser.mly +++ b/src/parser/parser.mly @@ -220,7 +220,7 @@ let legal_module_list : Ast.inline_element Loc.with_location list -> bool = %token Param "@param" %token Raise "@raise(s)" %token RETURN "@return" -%token <[ `Url | `File | `Document ] * string> See "@see" +%token See "@see" %token Since "@since" %token Before "@before" %token Version "@version" @@ -406,10 +406,10 @@ let see := | (kind, href) = See; children = sequence_nonempty(locatedM(nestable_block_element)); { let* children = children in - return @@ `See (kind, href, children) + return @@ `See (Tokens.to_ast_ref kind, href, children) } | (kind, href) = See; - { return @@ `See (kind, href, []) } + { return @@ `See (Tokens.to_ast_ref kind, href, []) } let param := | ident = Param; children = sequence_nonempty(locatedM(nestable_block_element)); diff --git a/src/parser/tokens.ml b/src/parser/tokens.ml index c9c3257f19..dab42e504f 100644 --- a/src/parser/tokens.ml +++ b/src/parser/tokens.ml @@ -281,6 +281,11 @@ let of_ast_ref : [ `Document | `File | `Url ] -> internal_reference = function | `File -> File | `Url -> URL +let to_ast_ref : internal_reference -> [ `Url | `File | `Document ] = function + | URL -> `Url + | File -> `File + | Document -> `Document + let describe_tag : Ast.tag -> string = function | `See (kind, _, _) -> describe @@ See (of_ast_ref kind, "") | `Author s -> describe @@ Author s From 9b8e18fd5d8bc8a3b3cfd78d09496f2b9ea4b478 Mon Sep 17 00:00:00 2001 From: faycarsons Date: Wed, 18 Dec 2024 15:53:51 -0500 Subject: [PATCH 099/150] Trim tag bodies, fix section heading --- src/parser/lexer.mll | 23 ++++++--- src/parser/parser.mly | 43 ++++++++-------- src/parser/test_driver/tester.ml | 86 +++++++++++++++++++++++++++----- 3 files changed, 109 insertions(+), 43 deletions(-) diff --git a/src/parser/lexer.mll b/src/parser/lexer.mll index 8c0511afdb..e9c588fae5 100644 --- a/src/parser/lexer.mll +++ b/src/parser/lexer.mll @@ -313,6 +313,15 @@ let heading_level lexbuf input level = let buffer_add_lexeme buffer lexbuf = Buffer.add_string buffer (Lexing.lexeme lexbuf) +let trim_horizontal_start : string -> string = fun s -> + let rec go idx = + let c = s.[idx] in + if Char.equal c ' ' + then go @@ succ idx + else String.sub s idx (String.length s - idx) + in + go 0 + } let markup_char = @@ -575,7 +584,7 @@ and token input = parse { emit lexbuf input (Section_heading (heading_level lexbuf input level, None)) } | "@author" ((horizontal_space+ [^ '\r' '\n']*)? as author) - { emit lexbuf input (Author author) } + { emit lexbuf input (Author (trim_horizontal_start author )) } | "@deprecated" { emit lexbuf input DEPRECATED } @@ -590,22 +599,22 @@ and token input = parse { emit lexbuf input RETURN } | "@see" horizontal_space* '<' ([^ '>']* as url) '>' - { emit lexbuf input (See (URL, url)) } + { emit lexbuf input (See (URL, trim_horizontal_start url)) } | "@see" horizontal_space* '\'' ([^ '\'']* as filename) '\'' - { emit lexbuf input (See (File, filename)) } + { emit lexbuf input (See (File, trim_horizontal_start filename)) } | "@see" horizontal_space* '"' ([^ '"']* as name) '"' - { emit lexbuf input (See (Document, name)) } + { emit lexbuf input (See (Document, trim_horizontal_start name)) } | "@since" ((horizontal_space+ [^ '\r' '\n']*)? as version) - { emit lexbuf input (Since version) } + { emit lexbuf input (Since (trim_horizontal_start version)) } | "@before" horizontal_space+ ((_ # space_char)+ as version) - { emit lexbuf input (Before version) } + { emit lexbuf input (Before (trim_horizontal_start version)) } | "@version" ((horizontal_space+ [^ '\r' '\n']*)? as version) - { emit lexbuf input (Version version) } + { emit lexbuf input (Version (trim_horizontal_start version)) } | "@canonical" ((horizontal_space+ [^ '\r' '\n']*)? as identifier) { emit lexbuf input (Canonical identifier) } diff --git a/src/parser/parser.mly b/src/parser/parser.mly index c2bd58540d..456f45a14d 100644 --- a/src/parser/parser.mly +++ b/src/parser/parser.mly @@ -241,8 +241,6 @@ let legal_module_list : Ast.inline_element Loc.with_location list -> bool = %token END -%on_error_reduce inline_element - %start main %% @@ -338,44 +336,43 @@ let toplevel_error := (* SECTION HEADING *) let section_heading := - | (num, title) = Section_heading; children = sequence(locatedM(inline_element)); RIGHT_BRACE; + | (num, title) = Section_heading; children = sequence_nonempty(locatedM(inline_element)); RIGHT_BRACE; { - let warning = - let span = Loc.of_position $sloc in - let what = Tokens.describe @@ Section_heading (num, title) in - Writer.Warning (Parse_error.should_not_be_empty ~what span) - in - Writer.ensure not_empty warning children - |> Writer.map (fun c -> `Heading (num, title, trim_start c)) + Writer.map (fun c -> `Heading (num, title, trim_start c)) children } - - | (num, title) = Section_heading; RIGHT_CODE_DELIMITER; - { + | (num, title) = Section_heading; RIGHT_BRACE; + { let should_not_be_empty = let span = Loc.of_position $sloc in let what = Tokens.describe @@ Section_heading (num, title) in Writer.Warning (Parse_error.should_not_be_empty ~what span) in - let not_allowed = - let span = Loc.of_position $sloc in - let what = Tokens.describe RIGHT_CODE_DELIMITER in - let in_what = Tokens.describe @@ Section_heading (num, title) in - Writer.Warning (Parse_error.not_allowed ~what ~in_what span) + let node = `Heading (num, title, []) in + Writer.with_warning node should_not_be_empty + } + | (num, title) = Section_heading; errloc = position(error); + { + let illegal = Writer.InputNeeded (fun input -> + let (start_pos, end_pos) = errloc in + let span = Loc.of_position (start_pos, end_pos) in + let err = Loc.extract ~input ~start_pos ~end_pos in + Parse_error.illegal err span) in return @@ `Heading (num, title, []) - |> Writer.warning should_not_be_empty - |> Writer.warning not_allowed + |> Writer.warning illegal } - + (* TAGS *) let tag == - | with_content = tag_with_content; { (with_content : Ast.tag Writer.t) } - | bare = tag_bare; { (bare : Ast.tag Writer.t) } + | with_content = tag_with_content; Single_newline?; { (with_content : Ast.tag Writer.t) } + | bare = tag_bare; Single_newline?; { (bare : Ast.tag Writer.t) } let tag_with_content := | DEPRECATED; children = sequence_nonempty(locatedM(nestable_block_element)); { Writer.map (fun c -> `Deprecated c) children } + | DEPRECATED; + { return @@ `Deprecated [] } | DEPRECATED; RIGHT_BRACE; { let warning = diff --git a/src/parser/test_driver/tester.ml b/src/parser/test_driver/tester.ml index fc0ad784bb..fa59cd3d58 100644 --- a/src/parser/test_driver/tester.ml +++ b/src/parser/test_driver/tester.ml @@ -42,21 +42,78 @@ let error_recovery = ("EOI in modules", "{!modules: Foo Bar"); ] -let math = +let open_t = [ - ( "position", - {|{math - \alpha(x)=\left\{ - \begin{array}{ll} % beginning of the array - x \% 4\\ % some variable modulo 4 - \frac{1}{1+e^{-kx}}\\ % something else - \frac{e^x-e^{-x}}{e^x+e^{-x}} % another action - \end{array} % end of the array - \right. - }|} - ); + ("basic", "@open"); + ("prefix", "@openfoo"); + ("extra_whitespace", "@open"); + ("followed_by_junk", "@open foo"); + ("followed_by_paragraph", "@open\nfoo"); + ("followed_by_tag", "@open\n@deprecated"); + ("with_list", "@open - foo"); + ] + +let utf8 = + [ + ("lambda", "\xce\xbb"); + ("words", "\xce\xbb \xce\xbb"); + ("no_validation", "Î"); + ("escapes", "\xce\xbb\\}"); + ("newline", "\xce\xbb \n \xce\xbb"); + ("paragraphs", "\xce\xbb \n\n \xce\xbb"); + ("code_span", "[\xce\xbb]"); + ("minus", "\xce\xbb-\xce\xbb"); + ("shorthand_list", "- \xce\xbb"); + ("styled", "{b \xce\xbb}"); + ("reference_target", "{!\xce\xbb}"); + ("code_block", "{[\xce\xbb]}"); + ("verbatim", "{v \xce\xbb v}"); + ("label", "{2:\xce\xbb Bar}"); + ("author", "@author \xce\xbb"); + ("param", "@param \xce\xbb"); + ("raise", "@raise \xce\xbb"); + ("see", "@see <\xce\xbb>"); + ("since", "@since \xce\xbb"); + ("before", "@before \xce\xbb"); + ("version", "@version \xce\xbb"); + ("right_brace", "\xce\xbb}"); ] +let bad_markup = + [ + ("left_brace", "{"); + ("left_brace_with_letter", "{g"); + ("left_brace_with_letters", "{gg"); + ("empty_braces", "{}"); + ("left_space", "{ foo}"); + ("left_spaces", "{ foo}"); + ("left_space_eof", "{"); + ("braces_instead_of_brackets", "{foo}"); + ("right_brace", "}"); + ("right_brace_in_paragraph", "foo}"); + ("multiple_right_brace", "foo } bar } baz"); + ("right_brace_in_list_item", "- foo}"); + ("right_brace_in_code_span", "[foo}]"); + ("right_brace_in_code_block", "{[foo}]}"); + ("right_brace_in_verbatim_text", "{v foo} v}"); + ("right_brace_in_author", "@author Foo}"); + ("right_brace_in_deprecated", "@deprecated }"); + ("right_bracket", "]"); + ("right_bracket_in_paragraph", "foo]"); + ("right_bracket_in_shorthand_list", "- foo]"); + ("right_bracket_in_code_span", "[]]"); + ("right_bracket_in_style", "{b]}"); + ("right_bracket_in_verbatim", "{v ] v}"); + ("right_bracket_in_list", "{ul ]}"); + ("right_bracket_in_list_item", "{ul {li ]}}"); + ("right_bracket_in_heading", "{2 ]}"); + ("right_bracket_in_author", "@author Foo]"); + ("at", "@"); + ("cr", ""); + ] + +let isolated = [ ("label", "{2:\xce\xbb Bar}") ] + (* Cases (mostly) taken from the 'odoc for library authors' document *) let documentation_cases = [ @@ -195,10 +252,13 @@ let () = | "code" -> code_cases | "recovery" | "r" -> error_recovery | "docs" | "d" -> documentation_cases + | "open" | "o" -> open_t + | "utf8" | "u" -> utf8 + | "isolated" | "i" -> isolated | _ -> print_endline "unrecognized argument - running documentation_cases"; documentation_cases) - else math + else bad_markup in let sucesses, failures = List.partition_map run_test cases in let sucesses = format_successes sucesses in From d7b5c44218cbd57f66781cda76f6a037cb46b26b Mon Sep 17 00:00:00 2001 From: faycarsons Date: Thu, 19 Dec 2024 11:49:30 -0500 Subject: [PATCH 100/150] update `Parse_error.illegal` to take description of parent element --- src/parser/odoc_parser.ml | 1 - src/parser/parse_error.ml | 9 +++++++-- src/parser/parser.mly | 18 ++++++++++++------ 3 files changed, 19 insertions(+), 9 deletions(-) diff --git a/src/parser/odoc_parser.ml b/src/parser/odoc_parser.ml index 6be1643bfd..62e27fb7f6 100644 --- a/src/parser/odoc_parser.ml +++ b/src/parser/odoc_parser.ml @@ -128,7 +128,6 @@ let parse_comment : location:Lexing.position -> text:string -> t = file = Lexing.(location.pos_fname); } in - (* Remove the `Loc.with_location` wrapping our token because Menhir cannot handle that *) let ast, warnings = Writer.run ~input:text @@ Parser.main (Lexer.token lexer_state) lexbuf in diff --git a/src/parser/parse_error.ml b/src/parser/parse_error.ml index e13ad380e6..674713ca3d 100644 --- a/src/parser/parse_error.ml +++ b/src/parser/parse_error.ml @@ -1,7 +1,12 @@ let capitalize_ascii = Astring.String.Ascii.capitalize -let illegal : string -> Loc.span -> Warning.t = - Warning.make "Illegal character or syntax in '%s'" +let illegal : ?in_what:string -> string -> Loc.span -> Warning.t = + fun ?in_what illegal_section span -> + match in_what with + | Some ctx -> + Warning.make "Illegal character or syntax '%s' in %s" illegal_section ctx + span + | None -> Warning.make "Illegal character or syntax '%s'" illegal_section span let bad_markup : ?suggestion:string -> string -> Loc.span -> Warning.t = fun ?suggestion -> Warning.make ?suggestion "'%s': bad markup." diff --git a/src/parser/parser.mly b/src/parser/parser.mly index 456f45a14d..89d38e17c2 100644 --- a/src/parser/parser.mly +++ b/src/parser/parser.mly @@ -356,7 +356,8 @@ let section_heading := let (start_pos, end_pos) = errloc in let span = Loc.of_position (start_pos, end_pos) in let err = Loc.extract ~input ~start_pos ~end_pos in - Parse_error.illegal err span) + let in_what = Tokens.describe @@ Section_heading (num, title) in + Parse_error.illegal ~in_what err span) in return @@ `Heading (num, title, []) |> Writer.warning illegal @@ -719,7 +720,8 @@ let list_heavy := let (start_pos, end_pos) as loc = errloc in let illegal_input = Loc.extract ~input ~start_pos ~end_pos in let span = Loc.of_position loc in - Parse_error.illegal illegal_input span + let in_what = Tokens.describe @@ List list_kind in + Parse_error.illegal ~in_what illegal_input span in let* items = Writer.warning (Writer.InputNeeded warning) items in return @@ `List (Tokens.ast_list_kind list_kind, `Heavy, items) @@ -730,7 +732,8 @@ let list_heavy := let (start_pos, end_pos) as loc = errloc in let illegal_input = Loc.extract ~input ~start_pos ~end_pos in let span = Loc.of_position loc in - Parse_error.illegal illegal_input span + let in_what = Tokens.describe (List list_kind) in + Parse_error.illegal ~in_what illegal_input span in return @@ `List (Tokens.ast_list_kind list_kind, `Heavy, []) |> Writer.warning (Writer.InputNeeded warning) @@ -753,7 +756,8 @@ let cell_heavy := let (start_pos, end_pos) as loc = errloc in let illegal_input = Loc.extract ~input ~start_pos ~end_pos in let span = Loc.of_position loc in - Parse_error.illegal illegal_input span + let in_what = Tokens.describe @@ Table_cell cell_kind in + Parse_error.illegal ~in_what illegal_input span in Option.value ~default:(return []) children |> Writer.map (fun children -> (children, cell_kind)) @@ -769,7 +773,8 @@ let row_heavy := let (start_pos, end_pos) as loc = errloc in let illegal_input = Loc.extract ~input ~start_pos ~end_pos in let span = Loc.of_position loc in - Parse_error.illegal illegal_input span + let in_what = Tokens.describe TABLE_ROW in + Parse_error.illegal ~in_what illegal_input span in Option.value ~default:(return []) children |> Writer.warning (Writer.InputNeeded warning) @@ -786,7 +791,8 @@ let table_heavy := let (start_pos, end_pos) as loc = errloc in let illegal_input = Loc.extract ~input ~start_pos ~end_pos in let span = Loc.of_position loc in - Parse_error.illegal illegal_input span + let in_what = Tokens.describe TABLE_HEAVY in + Parse_error.illegal ~in_what illegal_input span in Option.value ~default:(return []) grid |> Writer.map (fun grid -> `Table ((grid, None), `Heavy)) From e45ca40e83584c9e29f7b701ef0f2dd8cbda56e9 Mon Sep 17 00:00:00 2001 From: faycarsons Date: Thu, 19 Dec 2024 15:38:42 -0500 Subject: [PATCH 101/150] Add EOF handling to References with replacement text --- src/parser/parser.mly | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/src/parser/parser.mly b/src/parser/parser.mly index 89d38e17c2..60ecda152e 100644 --- a/src/parser/parser.mly +++ b/src/parser/parser.mly @@ -544,16 +544,30 @@ let reference := | ref_body = located(Ref_with_replacement); RIGHT_BRACE; { let node = `Reference (`With_text, ref_body, []) in - let what = Tokens.describe @@ Ref_with_replacement (Loc.value ref_body) in let warning = let span = Loc.of_position $sloc |> Loc.nudge_start (String.length "{{!") in + let what = Tokens.describe @@ Ref_with_replacement (Loc.value ref_body) in Writer.Warning (Parse_error.should_not_be_empty ~what span) in Writer.with_warning node warning } + | ref_body = located(Ref_with_replacement); children = sequence_nonempty(locatedM(inline_element))?; END; + { + let not_allowed = + let span = + Loc.of_position $sloc + |> Loc.nudge_start @@ String.length "{{!" + in + let in_what = Tokens.describe @@ Ref_with_replacement (Loc.value ref_body) in + Writer.Warning (Parse_error.end_not_allowed ~in_what span) + in + let* children = Option.value ~default:(return []) children in + let node = `Reference (`With_text, ref_body, children) in + Writer.with_warning node not_allowed + } let link := | link_body = Simple_link; RIGHT_BRACE; From 6507011f0d6292a6afb261d175d330443aa0e360 Mon Sep 17 00:00:00 2001 From: faycarsons Date: Fri, 20 Dec 2024 10:09:26 -0500 Subject: [PATCH 102/150] TODO --- src/parser/TODO.md | 22 ++++++++++++++++++++++ src/parser/lexer.mll | 5 +++++ 2 files changed, 27 insertions(+) create mode 100644 src/parser/TODO.md diff --git a/src/parser/TODO.md b/src/parser/TODO.md new file mode 100644 index 0000000000..7398343369 --- /dev/null +++ b/src/parser/TODO.md @@ -0,0 +1,22 @@ +- Leading white-space on tags + - Tests expect tag bodies (for i.e. "@version" or "@canonical", Lexer line 610) to have no + leading white-space. Fiddling with the lexer's rules to fix this seems to + break its ability to match on these rules correctly. + - Worst case we trim the start of the tag body +- Locations + - Not sure what the problem is here, not specifically at least + - The lexer has a, maybe unnecessarily, complex location handling system + with lots of curried functions, which was written with the intention of + wrapping tokens in `Loc.with_location` + - Do we rewrite it from scratch? (~week of work) + - Or fix it as it is now +- Code blocks + - Code blocks do not work now. The lexer relies on cooperation from the parser + which is not possible with Menhir. + - Our only option is to break the `Code_block` token up in multiple tokens, + the question is how: + - Do we split it into `Code_block` and `Code_block_w_output`? + - Or into its delimiters, i.e. `RIGHT_CODE_BLOCK`, `RIGHT_BRACKET`, + `CODE_BLOCK_META` etc? + - Seeing as the AST expects a string for its content I can see a benefit + in parsing said content within the lexer diff --git a/src/parser/lexer.mll b/src/parser/lexer.mll index e9c588fae5..3960b55496 100644 --- a/src/parser/lexer.mll +++ b/src/parser/lexer.mll @@ -607,6 +607,11 @@ and token input = parse | "@see" horizontal_space* '"' ([^ '"']* as name) '"' { emit lexbuf input (See (Document, trim_horizontal_start name)) } + (* NOTE: These tags will match the whitespace preceding the content and pass + that to the token. I've tried to match on the whitespace as a separate + thing from the token body but that seems to cause problems. + This is (maybe?) an issue because the tests expect the token body to have + no leading whitespace. What do we do here? *) | "@since" ((horizontal_space+ [^ '\r' '\n']*)? as version) { emit lexbuf input (Since (trim_horizontal_start version)) } From acbbe4ab42df66749ffeef8a7c784a9d992ad83e Mon Sep 17 00:00:00 2001 From: faycarsons Date: Mon, 23 Dec 2024 14:12:03 -0500 Subject: [PATCH 103/150] Fix leading and trailing whitespace in tags @version, @canonical, @since, etc --- src/parser/TODO.md | 2 ++ src/parser/lexer.mll | 20 +++++++++++++------- 2 files changed, 15 insertions(+), 7 deletions(-) diff --git a/src/parser/TODO.md b/src/parser/TODO.md index 7398343369..e01ea8e961 100644 --- a/src/parser/TODO.md +++ b/src/parser/TODO.md @@ -10,9 +10,11 @@ wrapping tokens in `Loc.with_location` - Do we rewrite it from scratch? (~week of work) - Or fix it as it is now + - Or! move location adjustment to parser - Code blocks - Code blocks do not work now. The lexer relies on cooperation from the parser which is not possible with Menhir. + - Our only option is to break the `Code_block` token up in multiple tokens, the question is how: - Do we split it into `Code_block` and `Code_block_w_output`? diff --git a/src/parser/lexer.mll b/src/parser/lexer.mll index 3960b55496..2671e39ce3 100644 --- a/src/parser/lexer.mll +++ b/src/parser/lexer.mll @@ -441,7 +441,7 @@ and token input = parse | word_char (word_char | bullet_char | '@')* | bullet_char (word_char | bullet_char | '@')+ as w - { emit lexbuf input (Word (unescape_word w)) } + { (Word (unescape_word w)) } | '[' { code_span @@ -583,7 +583,7 @@ and token input = parse | '{' (['0'-'9']+ as level) { emit lexbuf input (Section_heading (heading_level lexbuf input level, None)) } - | "@author" ((horizontal_space+ [^ '\r' '\n']*)? as author) + | "@author" horizontal_space+ (([^ '\r' '\n']*)? as author) { emit lexbuf input (Author (trim_horizontal_start author )) } | "@deprecated" @@ -612,17 +612,23 @@ and token input = parse thing from the token body but that seems to cause problems. This is (maybe?) an issue because the tests expect the token body to have no leading whitespace. What do we do here? *) - | "@since" ((horizontal_space+ [^ '\r' '\n']*)? as version) - { emit lexbuf input (Since (trim_horizontal_start version)) } + | "@since" horizontal_space+ (([^ '\r' '\n']+) as version) + { emit lexbuf input (Since version) } + | "@since" + { Since "" } | "@before" horizontal_space+ ((_ # space_char)+ as version) { emit lexbuf input (Before (trim_horizontal_start version)) } - | "@version" ((horizontal_space+ [^ '\r' '\n']*)? as version) - { emit lexbuf input (Version (trim_horizontal_start version)) } + | "@version" horizontal_space+ (([^ '\r' '\n']+) as version) + { emit lexbuf input (Version version) } + | "@version" + { Version "" } - | "@canonical" ((horizontal_space+ [^ '\r' '\n']*)? as identifier) + | "@canonical" horizontal_space+ (([^ '\r' '\n']+) as identifier) { emit lexbuf input (Canonical identifier) } + | "@canonical" + { Canonical "" } | "@inline" { emit lexbuf input INLINE } From aed2720c7ac24ace386fae4de7e43a4d5e08d21f Mon Sep 17 00:00:00 2001 From: faycarsons Date: Mon, 23 Dec 2024 14:18:50 -0500 Subject: [PATCH 104/150] remove unnecessary string trim in `@before` lexing --- src/parser/lexer.mll | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/parser/lexer.mll b/src/parser/lexer.mll index 2671e39ce3..3ce0bc7056 100644 --- a/src/parser/lexer.mll +++ b/src/parser/lexer.mll @@ -618,7 +618,7 @@ and token input = parse { Since "" } | "@before" horizontal_space+ ((_ # space_char)+ as version) - { emit lexbuf input (Before (trim_horizontal_start version)) } + { emit lexbuf input (Before version) } | "@version" horizontal_space+ (([^ '\r' '\n']+) as version) { emit lexbuf input (Version version) } From 3f9884e435732a276e9ea87d51d58b99b5c6401c Mon Sep 17 00:00:00 2001 From: faycarsons Date: Mon, 23 Dec 2024 14:30:26 -0500 Subject: [PATCH 105/150] Clean up syntax in lexer --- src/parser/lexer.mll | 67 ++++++++++++++++++++++---------------------- 1 file changed, 34 insertions(+), 33 deletions(-) diff --git a/src/parser/lexer.mll b/src/parser/lexer.mll index 3ce0bc7056..5c6240715e 100644 --- a/src/parser/lexer.mll +++ b/src/parser/lexer.mll @@ -374,12 +374,13 @@ rule reference_paren_content input start ref_offset start_offset depth_paren reference_paren_content input start ref_offset start_offset (depth_paren - 1) buffer lexbuf } | eof - { warning - lexbuf - input - ~start_offset - (Parse_error.unclosed_bracket ~bracket:"(") ; - Buffer.contents buffer } + { + let unclosed_bracket = + Parse_error.unclosed_bracket ~bracket:"(" + in + warning lexbuf input ~start_offset unclosed_bracket; + Buffer.contents buffer + } | _ { buffer_add_lexeme buffer lexbuf ; @@ -403,12 +404,13 @@ and reference_content input start start_offset buffer = parse reference_content input start start_offset buffer lexbuf } | eof - { warning - lexbuf - input - ~start_offset - (Parse_error.unclosed_bracket ~bracket:start) ; - Buffer.contents buffer } + { + let unclosed_bracket = + Parse_error.unclosed_bracket ~bracket:start + in + warning lexbuf input ~start_offset unclosed_bracket; + Buffer.contents buffer + } | _ { buffer_add_lexeme buffer lexbuf ; @@ -534,20 +536,19 @@ and token input = parse } | "{v" - { verbatim - (Buffer.create 1024) None (Lexing.lexeme_start lexbuf) input lexbuf } + { verbatim (Buffer.create 1024) None (Lexing.lexeme_start lexbuf) input lexbuf } | "{%" ((raw_markup_target as target) ':')? (raw_markup as s) ("%}" | eof as e) { let token = Raw_markup (target, s) in - if e <> "%}" then - warning - lexbuf - input - ~start_offset:(Lexing.lexeme_end lexbuf) - (Parse_error.not_allowed + let not_allowed = + Parse_error.not_allowed ~what:(Tokens.describe END) - ~in_what:(Tokens.describe token)); + ~in_what:(Tokens.describe token) + in + let start_offset = Lexing.lexeme_end lexbuf in + if e <> "%}" then + warning lexbuf input ~start_offset not_allowed; emit lexbuf input token } | "{ul" @@ -717,13 +718,15 @@ and code_span buffer nesting_level start_offset input = parse code_span buffer nesting_level start_offset input lexbuf } | eof - { warning - lexbuf - input - (Parse_error.not_allowed + { + let not_allowed = + Parse_error.not_allowed ~what:(Tokens.describe END) - ~in_what:(Tokens.describe (Code_span ""))); - emit lexbuf input (Code_span (Buffer.contents buffer)) ~start_offset } + ~in_what:(Tokens.describe (Code_span "")) + in + warning lexbuf input not_allowed; + emit lexbuf input (Code_span (Buffer.contents buffer)) ~start_offset + } | _ as c { Buffer.add_char buffer c; @@ -848,12 +851,10 @@ and verbatim buffer last_false_terminator start_offset input = parse and bad_markup_recovery start_offset input = parse | [^ '}']+ as text '}' as rest { let suggestion = - Printf.sprintf "did you mean '{!%s}' or '[%s]'?" text text in - warning - lexbuf - input - ~start_offset - (Parse_error.bad_markup ("{" ^ rest) ~suggestion); + Printf.sprintf "did you mean '{!%s}' or '[%s]'?" text text + in + let bad_markup = Parse_error.bad_markup ("{" ^ rest) ~suggestion in + warning lexbuf input ~start_offset bad_markup; emit lexbuf input (Code_span text) ~start_offset} (* The second field of the metadata. From 166bb19c42ceebafb4aa39346e96afa55792508e Mon Sep 17 00:00:00 2001 From: faycarsons Date: Mon, 23 Dec 2024 16:47:38 -0500 Subject: [PATCH 106/150] Return tokens directly in lexbuf --- src/parser/lexer.mll | 265 +++++++++++++++++++------------------ src/parser/odoc_parser.ml | 1 + src/parser/odoc_parser.mli | 1 + 3 files changed, 138 insertions(+), 129 deletions(-) diff --git a/src/parser/lexer.mll b/src/parser/lexer.mll index 5c6240715e..3b42dce14c 100644 --- a/src/parser/lexer.mll +++ b/src/parser/lexer.mll @@ -47,6 +47,11 @@ let math_constr kind x = let copy : 'a -> 'a = fun t -> Obj.magic t |> Obj.dup |> Obj.magic +let update_content_newlines : content:string -> Lexing.lexbuf -> unit = + fun ~content lexbuf -> + String.iter + (function '\n' -> Lexing.new_line lexbuf | _ -> ()) + content (* This is used for code and verbatim blocks. It can be done with a regular expression, but the regexp gets quite ugly, so a function is easier to @@ -155,24 +160,12 @@ type input = { mutable warnings : Warning.t list; } -let mkloc input lexbuf = - let open Lexing in - let pos_to_span pos = - Loc.{ line = pos.pos_lnum; - column = pos.pos_cnum - pos.pos_bol } - in - let file = input.file - and start = pos_to_span lexbuf.lex_start_p - and end_ = pos_to_span lexbuf.lex_curr_p in - { Loc.file; start; end_ } - -let with_loc : Lexing.lexbuf -> input -> 'a -> 'a Loc.with_location = - fun lexbuf input -> - let location = mkloc input lexbuf in +let with_loc : Lexing.lexbuf -> input -> Loc.span -> ('a -> 'a Loc.with_location) = + fun _lexbuf _input location -> Loc.at location let with_location_adjustments : - (Lexing.lexbuf -> input -> 'a) + (Lexing.lexbuf -> input -> Loc.span -> 'a) -> Lexing.lexbuf -> input -> ?start_offset:int @@ -180,11 +173,8 @@ let with_location_adjustments : -> ?end_offset:int -> ?adjust_end_by:string -> 'a - = - fun - k lexbuf input ?start_offset ?adjust_start_by ?end_offset ?adjust_end_by - value -> - + = + fun k lexbuf input ?start_offset ?adjust_start_by ?end_offset ?adjust_end_by value -> let start = match start_offset with | None -> Lexing.lexeme_start lexbuf @@ -205,18 +195,17 @@ let with_location_adjustments : | None -> end_ | Some s -> end_ - String.length s in - lexbuf.lex_start_p <- { lexbuf.lex_start_p with pos_cnum = start + lexbuf.lex_start_p.pos_bol }; - lexbuf.lex_curr_p <- { lexbuf.lex_curr_p with pos_cnum = end_ }; - k lexbuf input value - -let emit = - with_location_adjustments (fun _ _ token -> token) - + let location = { + Loc.file = input.file; + start = input.offset_to_location start; + end_ = input.offset_to_location end_; + } + in + k lexbuf input location value let warning = - with_location_adjustments (fun lexbuf input error -> - let location = mkloc input lexbuf in - input.warnings <- error location :: input.warnings) + with_location_adjustments @@ fun _lexbuf input location error -> + input.warnings <- error location :: input.warnings let reference_token media start ( target : string ) input lexbuf = match start with @@ -274,7 +263,7 @@ let emit_verbatim lexbuf input start_offset buffer = |> trim_leading_blank_lines |> trim_trailing_blank_lines in - emit lexbuf input (Verbatim t) ~start_offset + Verbatim t (* The locations have to be treated carefully in this function. We need to ensure that the []`Code_block] location matches the entirety of the block including the terminator, @@ -282,15 +271,15 @@ let emit_verbatim lexbuf input start_offset buffer = Note that the location reflects the content _without_ stripping of whitespace, whereas the value of the content in the tree has whitespace stripped from the beginning, and trailing empty lines removed. *) -let emit_code_block ~start_offset ~content_offset ~lexbuf input meta delimiter terminator buffer output = +let emit_code_block ~start_offset:_ ~content_offset ~lexbuf input meta delimiter terminator buffer output = let content_location = input.offset_to_location content_offset in let content = Buffer.contents buffer |> trim_trailing_blank_lines |> with_location_adjustments - (fun _ _location c -> + (fun _ _ _ -> let first_line_offset = content_location.column in - trim_leading_whitespace ~first_line_offset c) + trim_leading_whitespace ~first_line_offset) lexbuf input |> trim_leading_blank_lines @@ -301,12 +290,12 @@ let emit_code_block ~start_offset ~content_offset ~lexbuf input meta delimiter t lexbuf input in - emit ~start_offset lexbuf input (Code_block { meta; delimiter; content; output }) + Code_block { meta; delimiter; content; output } let heading_level lexbuf input level = if String.length level >= 2 && level.[0] = '0' then begin - warning - lexbuf input ~start_offset:1 (Parse_error.leading_zero_in_heading_level level) + let leading_zero = Parse_error.leading_zero_in_heading_level level in + warning lexbuf input ~start_offset:1 leading_zero end; int_of_string level @@ -418,28 +407,32 @@ and reference_content input start start_offset buffer = parse and token input = parse | horizontal_space* eof - { emit lexbuf input END } + { END } | ((horizontal_space* newline as prefix) horizontal_space* ((newline horizontal_space*)+ as suffix) as ws) - { - Lexing.new_line lexbuf; - Lexing.new_line lexbuf; - emit lexbuf input (Blank_line ws) ~adjust_start_by:prefix ~adjust_end_by:suffix } + { + (* Account for the first newline we got *) + update_content_newlines ~content:("\n" ^ prefix ^ suffix) lexbuf; + Blank_line ws + } | (horizontal_space* newline horizontal_space* as ws) { Lexing.new_line lexbuf; - emit lexbuf input (Single_newline ws) } + Single_newline ws + } | (horizontal_space+ as ws) - { emit lexbuf input (Space ws) } + { Space ws } | (horizontal_space* (newline horizontal_space*)? as p) '}' - { emit lexbuf input RIGHT_BRACE ~adjust_start_by:p } + { + update_content_newlines ~content:p lexbuf; + RIGHT_BRACE } | '|' - { emit lexbuf input BAR } + { BAR } | word_char (word_char | bullet_char | '@')* | bullet_char (word_char | bullet_char | '@')+ as w @@ -450,34 +443,34 @@ and token input = parse (Buffer.create 1024) 0 (Lexing.lexeme_start lexbuf) input lexbuf } | '-' - { emit lexbuf input MINUS } + { MINUS } | '+' - { emit lexbuf input PLUS } + { PLUS } | "{b" - { emit lexbuf input (Style Bold) } + { Style Bold } | "{i" - { emit lexbuf input (Style Italic) } + { Style Italic } | "{e" - { emit lexbuf input (Style Emphasis) } + { Style Emphasis } | "{L" - { emit lexbuf input (Paragraph_style Left) } + { Paragraph_style Left } | "{C" - { emit lexbuf input (Paragraph_style Center) } + { Paragraph_style Center } | "{R" - { emit lexbuf input (Paragraph_style Right) } + { Paragraph_style Right } | "{^" - { emit lexbuf input (Style Superscript) } + { Style Superscript } | "{_" - { emit lexbuf input (Style Subscript) } + { Style Subscript } | "{math" space_char { math Block (Buffer.create 1024) 0 (Lexing.lexeme_start lexbuf) (copy lexbuf.lex_curr_p) input lexbuf } @@ -487,7 +480,7 @@ and token input = parse | "{!modules:" - { emit lexbuf input MODULES } + { MODULES } | (media_start as start) { @@ -495,8 +488,8 @@ and token input = parse let target = reference_content input start start_offset (Buffer.create 16) lexbuf in - let token = reference_token media start target input lexbuf in - emit ~start_offset lexbuf input token } + reference_token media start target input lexbuf + } | "{[" { code_block (Lexing.lexeme_start lexbuf) (Lexing.lexeme_end lexbuf) None (Buffer.create 256) "" input lexbuf } @@ -514,7 +507,7 @@ and token input = parse in let emit_truncated_code_block () = let empty_content = with_location_adjustments with_loc lexbuf input "" in - emit ~start_offset lexbuf input (Code_block { meta = Some { language = lang_tag; tags = None }; delimiter = Some delimiter; content = empty_content; output = None}) + Code_block { meta = Some { language = lang_tag; tags = None }; delimiter = Some delimiter; content = empty_content; output = None} in match code_block_metadata_tail input lexbuf with | Ok metadata -> code_block start_offset (Lexing.lexeme_end lexbuf) (Some metadata) (Buffer.create 256) delimiter input lexbuf @@ -522,8 +515,10 @@ and token input = parse warning lexbuf input ~start_offset Parse_error.truncated_code_block_meta; emit_truncated_code_block () | Error (`Invalid_char c) -> - warning lexbuf input ~start_offset - (Parse_error.language_tag_invalid_char lang_tag_ c); + let language_tag_invalid_char = + Parse_error.language_tag_invalid_char lang_tag_ c + in + warning lexbuf input ~start_offset language_tag_invalid_char; (* NOTE : (@faycarsons) Metadata should not be `None` *) code_block start_offset (Lexing.lexeme_end lexbuf) None (Buffer.create 256) delimiter input lexbuf @@ -540,73 +535,76 @@ and token input = parse | "{%" ((raw_markup_target as target) ':')? (raw_markup as s) ("%}" | eof as e) - { let token = Raw_markup (target, s) in - let not_allowed = - Parse_error.not_allowed - ~what:(Tokens.describe END) - ~in_what:(Tokens.describe token) - in - let start_offset = Lexing.lexeme_end lexbuf in - if e <> "%}" then - warning lexbuf input ~start_offset not_allowed; - emit lexbuf input token } + { + let token = Raw_markup (target, s) in + if e <> "%}" then begin + let not_allowed = + Parse_error.not_allowed + ~what:(Tokens.describe END) + ~in_what:(Tokens.describe token) + in + let start_offset = Lexing.lexeme_end lexbuf in + warning lexbuf input ~start_offset not_allowed + end; + token + } | "{ul" - { emit lexbuf input (List Unordered) } + { List Unordered } | "{ol" - { emit lexbuf input (List Ordered) } + { List Ordered } | "{li" - { emit lexbuf input LI } + { LI } | "{-" - { emit lexbuf input DASH } + { DASH } | "{table" - { emit lexbuf input TABLE_HEAVY } + { TABLE_HEAVY } | "{t" - { emit lexbuf input TABLE_LIGHT } + { TABLE_LIGHT } | "{tr" - { emit lexbuf input TABLE_ROW } + { TABLE_ROW } | "{th" - { emit lexbuf input (Table_cell `Header) } + { Table_cell `Header } | "{td" - { emit lexbuf input (Table_cell `Data) } + { Table_cell `Data } | '{' (['0'-'9']+ as level) ':' (([^ '}'] # space_char)* as label) - { emit lexbuf input (Section_heading (heading_level lexbuf input level, Some label)) } + { Section_heading (heading_level lexbuf input level, Some label) } | '{' (['0'-'9']+ as level) - { emit lexbuf input (Section_heading (heading_level lexbuf input level, None)) } + { Section_heading (heading_level lexbuf input level, None) } | "@author" horizontal_space+ (([^ '\r' '\n']*)? as author) - { emit lexbuf input (Author (trim_horizontal_start author )) } + { Author (trim_horizontal_start author ) } | "@deprecated" - { emit lexbuf input DEPRECATED } + { DEPRECATED } | "@param" horizontal_space+ ((_ # space_char)+ as name) - { emit lexbuf input (Param name) } + { Param name } | ("@raise" | "@raises") horizontal_space+ ((_ # space_char)+ as name) - { emit lexbuf input (Raise name) } + { Raise name } | ("@return" | "@returns") - { emit lexbuf input RETURN } + { RETURN } | "@see" horizontal_space* '<' ([^ '>']* as url) '>' - { emit lexbuf input (See (URL, trim_horizontal_start url)) } + { See (URL, trim_horizontal_start url) } | "@see" horizontal_space* '\'' ([^ '\'']* as filename) '\'' - { emit lexbuf input (See (File, trim_horizontal_start filename)) } + { See (File, trim_horizontal_start filename) } | "@see" horizontal_space* '"' ([^ '"']* as name) '"' - { emit lexbuf input (See (Document, trim_horizontal_start name)) } + { See (Document, trim_horizontal_start name) } (* NOTE: These tags will match the whitespace preceding the content and pass that to the token. I've tried to match on the whitespace as a separate @@ -614,74 +612,77 @@ and token input = parse This is (maybe?) an issue because the tests expect the token body to have no leading whitespace. What do we do here? *) | "@since" horizontal_space+ (([^ '\r' '\n']+) as version) - { emit lexbuf input (Since version) } + { Since version } | "@since" { Since "" } | "@before" horizontal_space+ ((_ # space_char)+ as version) - { emit lexbuf input (Before version) } + { Before version } | "@version" horizontal_space+ (([^ '\r' '\n']+) as version) - { emit lexbuf input (Version version) } + { Version version } | "@version" { Version "" } | "@canonical" horizontal_space+ (([^ '\r' '\n']+) as identifier) - { emit lexbuf input (Canonical identifier) } + { Canonical identifier } | "@canonical" { Canonical "" } | "@inline" - { emit lexbuf input INLINE } + { INLINE } | "@open" - { emit lexbuf input OPEN } + { OPEN } | "@closed" - { emit lexbuf input CLOSED } + { CLOSED } | "@hidden" - { emit lexbuf input HIDDEN } + { HIDDEN } | "]}" - { emit lexbuf input RIGHT_CODE_DELIMITER} + { RIGHT_CODE_DELIMITER } | '{' - { try bad_markup_recovery (Lexing.lexeme_start lexbuf) input lexbuf - with Failure _ -> - warning lexbuf - input - (Parse_error.bad_markup - "{" ~suggestion:"escape the brace with '\\{'."); - emit lexbuf input (Word "{") } + { + try bad_markup_recovery (Lexing.lexeme_start lexbuf) input lexbuf + with Failure _ -> begin + let bad_markup = + Parse_error.bad_markup "{" ~suggestion:"escape the brace with '\\{'." + in + warning lexbuf input bad_markup + end; + (Word "{") + } | ']' { warning lexbuf input Parse_error.unpaired_right_bracket; - emit lexbuf input (Word "]") } + Word "]" } | "@param" { warning lexbuf input Parse_error.truncated_param; - emit lexbuf input (Param "") } + Param "" } | ("@raise" | "@raises") as tag { warning lexbuf input (Parse_error.truncated_raise tag); - emit lexbuf input (Raise "") } + Raise "" } | "@before" { warning lexbuf input Parse_error.truncated_before; - emit lexbuf input (Before "") } + Before "" } | "@see" { warning lexbuf input Parse_error.truncated_see; - emit lexbuf input (Word "@see") } + Word "@see" } | '@' ['a'-'z' 'A'-'Z']+ as tag { warning lexbuf input (Parse_error.unknown_tag tag); - emit lexbuf input (Word tag) } + Word tag } | '@' { warning lexbuf input Parse_error.stray_at; - emit lexbuf input (Word "@") } + Word "@" } | '\r' { warning lexbuf input Parse_error.stray_cr; @@ -689,12 +690,14 @@ and token input = parse and code_span buffer nesting_level start_offset input = parse | ']' - { if nesting_level = 0 then - emit lexbuf input (Code_span (Buffer.contents buffer)) ~start_offset + { + if nesting_level = 0 then + Code_span (Buffer.contents buffer) else begin Buffer.add_char buffer ']'; code_span buffer (nesting_level - 1) start_offset input lexbuf - end } + end + } | '[' { Buffer.add_char buffer '['; @@ -704,18 +707,23 @@ and code_span buffer nesting_level start_offset input = parse { Buffer.add_char buffer c; code_span buffer nesting_level start_offset input lexbuf } - | newline horizontal_space* (newline horizontal_space*)+ - { let not_allowed = + | newline horizontal_space* ((newline horizontal_space*)+ as ws) + { + let not_allowed = Parse_error.not_allowed ~what:(Tokens.describe (Blank_line "\n\n")) ~in_what:(Tokens.describe (Code_span "")) in warning lexbuf input not_allowed; + update_content_newlines ~content:("\n" ^ ws) lexbuf; Buffer.add_char buffer ' '; code_span buffer nesting_level start_offset input lexbuf } | newline horizontal_space* - { Buffer.add_char buffer ' '; - code_span buffer nesting_level start_offset input lexbuf } + { + Lexing.new_line lexbuf; + Buffer.add_char buffer ' '; + code_span buffer nesting_level start_offset input lexbuf + } | eof { @@ -725,7 +733,7 @@ and code_span buffer nesting_level start_offset input = parse ~in_what:(Tokens.describe (Code_span "")) in warning lexbuf input not_allowed; - emit lexbuf input (Code_span (Buffer.contents buffer)) ~start_offset + Code_span (Buffer.contents buffer) } | _ as c @@ -737,7 +745,7 @@ and math kind buffer nesting_level start_offset start_pos input = parse { if nesting_level == 0 then ( lexbuf.lex_start_p <- start_pos; - emit lexbuf input (math_constr kind (Buffer.contents buffer)) ~start_offset) + math_constr kind (Buffer.contents buffer)) else ( Buffer.add_char buffer '}'; math kind buffer (pred nesting_level) start_offset start_pos input lexbuf) @@ -752,9 +760,6 @@ and math kind buffer nesting_level start_offset start_pos input = parse math kind buffer nesting_level start_offset start_pos input lexbuf } | (newline) as s { - (* - Printf.printf "Adding newline!\nPosition:\n %s\n" (Loc.fmt @@ Loc.of_position ( lexbuf.lex_start_p, lexbuf.lex_curr_p )); - *) Lexing.new_line lexbuf; match kind with | Inline -> @@ -777,7 +782,7 @@ and math kind buffer nesting_level start_offset start_pos input = parse ~in_what:(Tokens.describe (math_constr kind "")) in warning lexbuf input unexpected_eof; - emit lexbuf input (math_constr kind (Buffer.contents buffer)) ~start_offset + math_constr kind (Buffer.contents buffer) } | _ as c { Buffer.add_char buffer c; @@ -850,12 +855,14 @@ and verbatim buffer last_false_terminator start_offset input = parse and bad_markup_recovery start_offset input = parse | [^ '}']+ as text '}' as rest - { let suggestion = + { + let suggestion = Printf.sprintf "did you mean '{!%s}' or '[%s]'?" text text in let bad_markup = Parse_error.bad_markup ("{" ^ rest) ~suggestion in warning lexbuf input ~start_offset bad_markup; - emit lexbuf input (Code_span text) ~start_offset} + Code_span text + } (* The second field of the metadata. This rule keeps whitespaces and newlines in the 'metadata' field except the diff --git a/src/parser/odoc_parser.ml b/src/parser/odoc_parser.ml index 62e27fb7f6..ed1b0a19e6 100644 --- a/src/parser/odoc_parser.ml +++ b/src/parser/odoc_parser.ml @@ -89,6 +89,7 @@ module Tester = struct let reversed_newlines = reversed_newlines let offset_to_location = offset_to_location let string_of_token = Tokens.describe + let default_token = Tokens.Word "" end (* Given a Loc.point and the result of [parse_comment], this function returns diff --git a/src/parser/odoc_parser.mli b/src/parser/odoc_parser.mli index ecee24d90e..ccf6959e0b 100644 --- a/src/parser/odoc_parser.mli +++ b/src/parser/odoc_parser.mli @@ -41,6 +41,7 @@ module Tester : sig val run : input:string -> Ast.t Writer.t -> Ast.t * Warning.t list val main : (Lexing.lexbuf -> token) -> Lexing.lexbuf -> Ast.t Writer.t val string_of_token : token -> string + val default_token : token end (** Warnings produced during parsing. *) From 59c6ea46a19374e818b4c1b4ef059772807cdf3d Mon Sep 17 00:00:00 2001 From: faycarsons Date: Fri, 27 Dec 2024 16:39:32 -0500 Subject: [PATCH 107/150] inline_element+nestable_block_element return elt w/ location --- src/parser/TODO.md | 7 + src/parser/lexer.mll | 8 +- src/parser/loc.ml | 53 +++ src/parser/loc.mli | 3 + src/parser/parser.mly | 740 +++++++++++++++++++++--------------------- src/parser/writer.ml | 3 + 6 files changed, 448 insertions(+), 366 deletions(-) diff --git a/src/parser/TODO.md b/src/parser/TODO.md index e01ea8e961..82ad4abcd3 100644 --- a/src/parser/TODO.md +++ b/src/parser/TODO.md @@ -11,6 +11,13 @@ - Do we rewrite it from scratch? (~week of work) - Or fix it as it is now - Or! move location adjustment to parser + - This is the approach I'm taking. Everywhere the parser uses the + `Inline_element` and `nestable_block_element` rules, they're wrapped in + a location, so we can instead emit them wrapped in a location initially + instead of afterwards with a higher-order rule. + - This looks like getting the location of opening and closing delimiters + for delimited elements, and setting their location to the span between + those two points - Code blocks - Code blocks do not work now. The lexer relies on cooperation from the parser which is not possible with Menhir. diff --git a/src/parser/lexer.mll b/src/parser/lexer.mll index 3b42dce14c..adf337992f 100644 --- a/src/parser/lexer.mll +++ b/src/parser/lexer.mll @@ -271,7 +271,7 @@ let emit_verbatim lexbuf input start_offset buffer = Note that the location reflects the content _without_ stripping of whitespace, whereas the value of the content in the tree has whitespace stripped from the beginning, and trailing empty lines removed. *) -let emit_code_block ~start_offset:_ ~content_offset ~lexbuf input meta delimiter terminator buffer output = +let emit_code_block ~content_offset ~lexbuf input meta delimiter terminator buffer output = let content_location = input.offset_to_location content_offset in let content = Buffer.contents buffer @@ -894,14 +894,14 @@ and code_block_metadata_tail input = parse and code_block start_offset content_offset metadata prefix delim input = parse | ("]" (delim_char* as delim') "[") as terminator { if delim = delim' then - emit_code_block ~start_offset ~content_offset ~lexbuf input None (Some delim) terminator prefix None + emit_code_block ~content_offset ~lexbuf input None (Some delim) terminator prefix None else (Buffer.add_string prefix terminator; code_block start_offset content_offset metadata prefix delim input lexbuf) } | ("]" (delim_char* as delim') "}") as terminator { if delim = delim' then - emit_code_block ~start_offset ~content_offset ~lexbuf input None (Some delim ) terminator prefix None + emit_code_block ~content_offset ~lexbuf input None (Some delim ) terminator prefix None else ( Buffer.add_string prefix terminator; code_block start_offset content_offset metadata prefix delim input lexbuf @@ -910,7 +910,7 @@ and code_block start_offset content_offset metadata prefix delim input = parse | eof { warning lexbuf input ~start_offset Parse_error.truncated_code_block; - emit_code_block ~start_offset ~content_offset ~lexbuf input None (Some delim ) "" prefix None + emit_code_block ~content_offset ~lexbuf input None (Some delim ) "" prefix None } | (_ as c) { diff --git a/src/parser/loc.ml b/src/parser/loc.ml index 587e968526..880e150e93 100644 --- a/src/parser/loc.ml +++ b/src/parser/loc.ml @@ -49,6 +49,10 @@ let span spans = let last = List.fold_left (fun _ span -> span) first spans in { file = first.file; start = first.start; end_ = last.end_ } +let delimited : 'a with_location -> 'b with_location -> span = + fun { location = startpos; _ } { location = endpos; _ } -> + { file = startpos.file; start = startpos.start; end_ = endpos.end_ } + let nudge_start offset span = { span with start = { span.start with column = span.start.column + offset } } @@ -68,3 +72,52 @@ let spans_multiple_lines = function _; } -> end_line > start_line + +let map_location : (span -> span) -> 'a with_location -> 'a with_location = + fun f located -> { located with location = f located.location } + +(* Utilities for offsetting the locations of various AST nodes to be more accurate *) + +let _offset_inline_elt located = + let f = + match located.value with + | `Word _ | `Space _ -> Fun.id + | `Code_span _ -> nudge_start (-1) + | `Raw_markup (_, _) | `Styled _ | `Math_span _ -> nudge_start (-3) + | `Reference (`Simple, _, _) -> nudge_start (-2) + | `Reference (`With_text, ref, _) -> + let ref = value ref in + nudge_start (-(String.length ref + 4)) + | `Link (_, []) -> nudge_start (-2) + | `Link (link, _ :: _) -> nudge_start (-(String.length link + 3)) + in + map_location f located + +(* +and nestable_block_element = + [ `Paragraph of inline_element with_location list + | `Code_block of code_block + | `Verbatim of string + | `Modules of string with_location list + | `List of + list_kind * list_syntax * nestable_block_element with_location list list + | `Table of table + | `Math_block of string (** @since 2.0.0 *) + | `Media of reference_kind * media_href with_location * string * media + (** @since 3.0.0 *) ] +*) + +let _offset_block_elt located = + let f = + match located.value with + | `Paragraph children -> + let loc = span @@ List.map location children in + Fun.const loc + | `Verbatim _ -> nudge_start (-2) + | `Modules _ -> nudge_start @@ -String.length "{!modules:" + | `List (`Heavy, _, _) -> nudge_start (-4) + | `Table (_, `Heavy) -> nudge_start @@ -String.length "{table" + | `Math_block _ -> nudge_start @@ -String.length "{math" + | _ -> Fun.id + in + map_location f located diff --git a/src/parser/loc.mli b/src/parser/loc.mli index 16fac93439..007aab2c13 100644 --- a/src/parser/loc.mli +++ b/src/parser/loc.mli @@ -31,6 +31,9 @@ type +'a with_location = { location : span; value : 'a } val fmt : span -> string (** Format a `span` and return the resulting string *) +val delimited : 'a with_location -> 'b with_location -> span +(** Returns the span which contains both arguments *) + val nudge_map_start : int -> 'a with_location -> 'a with_location val nudge_map_end : int -> 'a with_location -> 'a with_location diff --git a/src/parser/parser.mly b/src/parser/parser.mly index 60ecda152e..9a056770af 100644 --- a/src/parser/parser.mly +++ b/src/parser/parser.mly @@ -97,35 +97,40 @@ - If there's only one row and it's not the align row, then it's data *) - let construct_table - : Ast.inline_element Loc.with_location list list list -> - Ast.nestable_block_element = - function - | [only_row] -> ( - match valid_align_row only_row with - | Ok align -> - `Table (([[]], Some align), `Light) - | _ -> - `Table (([as_data only_row], None), `Light)) - | align :: data when is_valid_align align -> - let align = Result.get_ok @@ valid_align_row align in - `Table ( (List.map as_data data, Some align) , `Light) - | header :: align :: data when is_valid_align align -> - let align = Result.get_ok @@ valid_align_row align in - `Table ((as_header header :: List.map as_data data, Some align), `Light) - | data -> `Table ((List.map as_data data, None), `Light) + let construct_table : + span:Loc.span -> + Ast.inline_element Loc.with_location list list list -> + Ast.nestable_block_element Loc.with_location = + fun ~span grid -> + match grid with + | [only_row] -> ( + match valid_align_row only_row with + | Ok align -> + Loc.at span @@ `Table (([[]], Some align), `Light) + | _ -> + Loc.at span @@ `Table (([as_data only_row], None), `Light)) + | align :: data when is_valid_align align -> + let align = Result.get_ok @@ valid_align_row align in + Loc.at span @@ `Table ( (List.map as_data data, Some align) , `Light) + | header :: align :: data when is_valid_align align -> + let align = Result.get_ok @@ valid_align_row align in + Loc.at span @@ `Table ((as_header header :: List.map as_data data, Some align), `Light) + | data -> Loc.at span @@ `Table ((List.map as_data data, None), `Light) let unclosed_table ?(data : Ast.inline_element Loc.with_location list list list Writer.t option) - warning : Ast.nestable_block_element Writer.t = + ~span + warning : Ast.nestable_block_element Loc.with_location Writer.t = let node = match data with | Some data -> Writer.map - (fun data -> `Table ((List.map as_data data, None), `Light)) + (fun data -> Loc.at span @@ `Table ((List.map as_data data, None), `Light)) data - | None -> Writer.return @@ `Table (([], None), `Light) + | None -> + let inner = Loc.at span @@ `Table (([], None), `Light) in + return inner in Writer.warning warning node @@ -147,6 +152,11 @@ | Loc.{value = `Space _; _ } :: xs -> xs | xs -> xs + let paragraph : Ast.inline_element Loc.with_location list -> Ast.nestable_block_element Loc.with_location = + fun elts -> + let span = List.map Loc.location elts |> Loc.span in + Loc.at span @@ `Paragraph elts + let rec inline_element_inner : Ast.inline_element -> string = function | `Space s -> s | `Word s -> s @@ -254,10 +264,13 @@ let position(rule) == _ = rule; { $sloc } let sequence(rule) == xs = list(rule); { Writer.sequence xs } let sequence_nonempty(rule) == xs = nonempty_list(rule); { Writer.sequence xs } +let delimited_location(opening, rule, closing) := startpos = located(opening); inner = rule; endpos = located(closing); { + let span = Loc.delimited startpos endpos in + Loc.at span inner +} + let separated_nonempty_sequence(sep, rule) := xs = separated_nonempty_list(sep, rule); { Writer.sequence xs } -let separated_sequence(sep, rule) := - | ~ = separated_nonempty_sequence(sep, rule); <> - | { return [] } +let separated_sequence(sep, rule) := xs = separated_list(sep, rule); { Writer.sequence xs } (* WHITESPACE *) @@ -279,14 +292,14 @@ let line_breaks := (* ENTRY *) let main := - | nodes = sequence_nonempty(locatedM(toplevel)); whitespace?; END; { nodes } + | nodes = sequence_nonempty(toplevel); whitespace?; END; { nodes } | END; { return [] } let toplevel := - | block = nestable_block_element; { Writer.map (fun b -> (b :> Ast.block_element) ) block } - | t = tag; { Writer.map (fun t -> `Tag t) t } - | ~ = section_heading; <> - | ~ = toplevel_error; <> + | block = nestable_block_element; { Writer.map (Loc.map @@ fun b -> (b :> Ast.block_element) ) block } + | t = locatedM(tag); { Writer.map (Loc.map @@ fun t -> `Tag t) t } + | ~ = locatedM(section_heading); <> + | ~ = locatedM(toplevel_error); <> let toplevel_error := | brace = located(RIGHT_BRACE); @@ -313,7 +326,7 @@ let toplevel_error := in (ret : Ast.block_element Writer.t) } - | elt = locatedM(inline_element); errloc = position(RIGHT_BRACE); + | elt = inline_element; errloc = position(RIGHT_BRACE); { let span = Loc.of_position errloc in let warning = @@ -322,7 +335,7 @@ let toplevel_error := let* elt = Writer.warning warning elt in return @@ `Paragraph [elt; Loc.at span @@ `Word "}"] } - | elt = locatedM(inline_element); RIGHT_CODE_DELIMITER; + | elt = inline_element; RIGHT_CODE_DELIMITER; { let warning = let span = Loc.of_position $sloc in let what = Tokens.describe RIGHT_CODE_DELIMITER in @@ -336,7 +349,7 @@ let toplevel_error := (* SECTION HEADING *) let section_heading := - | (num, title) = Section_heading; children = sequence_nonempty(locatedM(inline_element)); RIGHT_BRACE; + | (num, title) = Section_heading; children = sequence_nonempty(inline_element); RIGHT_BRACE; { Writer.map (fun c -> `Heading (num, title, trim_start c)) children } @@ -370,7 +383,7 @@ let tag == | bare = tag_bare; Single_newline?; { (bare : Ast.tag Writer.t) } let tag_with_content := - | DEPRECATED; children = sequence_nonempty(locatedM(nestable_block_element)); + | DEPRECATED; children = sequence_nonempty(nestable_block_element); { Writer.map (fun c -> `Deprecated c) children } | DEPRECATED; { return @@ `Deprecated [] } @@ -381,7 +394,7 @@ let tag_with_content := in Writer.with_warning (`Deprecated []) warning } - | RETURN; children = sequence_nonempty(locatedM(nestable_block_element)); + | RETURN; children = sequence_nonempty(nestable_block_element); { Writer.map (fun c -> `Return c) children } | ~ = before; <> | ~ = raise; <> @@ -389,19 +402,19 @@ let tag_with_content := | ~ = param; <> let before := - | version = Before; children = sequence_nonempty(locatedM(nestable_block_element)); + | version = Before; children = sequence_nonempty(nestable_block_element); { Writer.map (fun c -> `Before (version, c)) children } | version = Before; { return @@ `Before (version, []) } let raise := - | exn = Raise; children = sequence_nonempty(locatedM(nestable_block_element)); + | exn = Raise; children = sequence_nonempty(nestable_block_element); { Writer.map (fun c -> `Raise (exn, c)) children } | exn = Raise; { return @@ `Raise (exn, []) } let see := - | (kind, href) = See; children = sequence_nonempty(locatedM(nestable_block_element)); + | (kind, href) = See; children = sequence_nonempty(nestable_block_element); { let* children = children in return @@ `See (Tokens.to_ast_ref kind, href, children) @@ -410,7 +423,7 @@ let see := { return @@ `See (Tokens.to_ast_ref kind, href, []) } let param := - | ident = Param; children = sequence_nonempty(locatedM(nestable_block_element)); + | ident = Param; children = sequence_nonempty(nestable_block_element); { Writer.map (fun c -> `Param (ident, c)) children } | ident = Param; { return @@ `Param (ident, [])} @@ -465,11 +478,13 @@ let tag_bare := let inline_element := (* Single token inline elements which are mostly handled in the lexer *) - | s = inline_elt_legal_whitespace; { return s } - | c = Code_span; { return @@ `Code_span c } - | m = Raw_markup; { return @@ `Raw_markup m } - | w = Word; { return @@ `Word w } - | m = Math_span; { return @@ `Math_span m } + | s = located(inline_elt_legal_whitespace); { return s } + | c = located(Code_span); { + return @@ Loc.map (fun c -> `Code_span c) c + } + | m = located(Raw_markup); { return @@ Loc.map (fun m -> `Raw_markup m) m } + | w = located(Word); { return @@ Loc.map (fun w -> `Word w) w } + | m = located(Math_span); { return @@ Loc.map (fun m -> `Math_span m) m } (* More complex/recursive inline elements should have their own rule *) | ~ = style; <> | ~ = reference; <> @@ -480,117 +495,119 @@ let inline_elt_legal_whitespace := | ~ = Single_newline; <`Space> let style := - | style = Style; children = sequence(locatedM(inline_element)); RIGHT_BRACE; - { - let warning = - let span = Loc.of_position $sloc in - let what = Tokens.describe @@ Style style in - Writer.Warning (Parse_error.should_not_be_empty ~what span) - in - Writer.ensure not_empty warning children - |> Writer.map (fun c -> `Styled (Tokens.to_ast_style style, trim_start c)) - } - | style = Style; RIGHT_BRACE; - { - let warning = - let span = Loc.of_position $sloc in - let what = Tokens.describe @@ Style style in - Writer.Warning (Parse_error.should_not_be_empty ~what span) - in - Writer.with_warning (`Styled (Tokens.to_ast_style style, [])) warning - } - | style = Style; RIGHT_CODE_DELIMITER; - { - - let style_desc = Tokens.describe @@ Style style in - let not_allowed = - let span = Loc.of_position $sloc in - let what = Tokens.describe RIGHT_CODE_DELIMITER in - Writer.Warning (Parse_error.not_allowed ~what ~in_what:style_desc span) - in - let should_not_be_empty = - let span = Loc.of_position $sloc in - Writer.Warning (Parse_error.should_not_be_empty ~what:style_desc span) - in - return (`Styled (Tokens.to_ast_style style, [])) - |> Writer.warning not_allowed - |> Writer.warning should_not_be_empty - } - | style = Style; END; - { - let warning = - let span = Loc.of_position $sloc in - let in_what = Tokens.describe @@ Style style in - Writer.Warning (Parse_error.end_not_allowed ~in_what span) - in - Writer.with_warning (`Styled (Tokens.to_ast_style style, [])) warning - } + | style = located(Style); children = sequence(inline_element); endpos = located( RIGHT_BRACE ); { + let span = Loc.delimited style endpos in + let style = style.Loc.value in + let warning = + let what = Tokens.describe @@ Style style in + Writer.Warning (Parse_error.should_not_be_empty ~what span) + in + Writer.ensure not_empty warning children + |> Writer.map (fun c -> Loc.at span @@ `Styled (Tokens.to_ast_style style, trim_start c)) + } + | style = located(Style); endpos = located(RIGHT_BRACE); { + let span = Loc.delimited style endpos in + let style = style.Loc.value in + let warning = + let what = Tokens.describe @@ Style style in + Writer.Warning (Parse_error.should_not_be_empty ~what span) + in + let inner = Loc.at span @@ `Styled (Tokens.to_ast_style style, []) in + Writer.with_warning inner warning + } + | style = located(Style); endpos = located(RIGHT_CODE_DELIMITER); { + let span = Loc.delimited style endpos in + let style = style.Loc.value in + let style_desc = Tokens.describe @@ Style style in + let not_allowed = + let what = Tokens.describe RIGHT_CODE_DELIMITER in + Writer.Warning (Parse_error.not_allowed ~what ~in_what:style_desc span) + in + let should_not_be_empty = + Writer.Warning (Parse_error.should_not_be_empty ~what:style_desc span) + in + let inner = Loc.at span @@ `Styled (Tokens.to_ast_style style, []) in + return inner + |> Writer.warning not_allowed + |> Writer.warning should_not_be_empty + } + | style = located(Style); endpos = located(END); { + let span = Loc.delimited style endpos in + let style = style.Loc.value in + let warning = + let span = Loc.of_position $sloc in + let in_what = Tokens.describe @@ Style style in + Writer.Warning (Parse_error.end_not_allowed ~in_what span) + in + let inner = Loc.at span @@ `Styled (Tokens.to_ast_style style, []) in + Writer.with_warning inner warning + } (* LINKS + REFS *) let reference := - | ref_body = located(Simple_ref); children = sequence(locatedM(inline_element)); - { - let+ children = children in - let ref_body = Loc.nudge_map_start (String.length "{!") ref_body in - `Reference (`Simple, ref_body, trim_start children) - } - | ref_body = located(Ref_with_replacement); children = sequence_nonempty(locatedM(inline_element)); RIGHT_BRACE; - { - let+ children = children in - let ref_body = Loc.nudge_map_start (String.length "{{!") ref_body in - `Reference (`With_text, ref_body, trim_start children) - } - | ref_body = located(Ref_with_replacement); RIGHT_BRACE; - { - let node = `Reference (`With_text, ref_body, []) in - let warning = - let span = - Loc.of_position $sloc - |> Loc.nudge_start (String.length "{{!") - in - let what = Tokens.describe @@ Ref_with_replacement (Loc.value ref_body) in - Writer.Warning (Parse_error.should_not_be_empty ~what span) - in - Writer.with_warning node warning - } - | ref_body = located(Ref_with_replacement); children = sequence_nonempty(locatedM(inline_element))?; END; - { - let not_allowed = - let span = - Loc.of_position $sloc - |> Loc.nudge_start @@ String.length "{{!" - in - let in_what = Tokens.describe @@ Ref_with_replacement (Loc.value ref_body) in - Writer.Warning (Parse_error.end_not_allowed ~in_what span) - in - let* children = Option.value ~default:(return []) children in - let node = `Reference (`With_text, ref_body, children) in - Writer.with_warning node not_allowed + | ref_body = located(Simple_ref); children = sequence(inline_element); { + let+ children = children in + let startpos = Loc.nudge_start (-2) ref_body.Loc.location in + let span = Loc.span @@ startpos :: List.map Loc.location children in + let ref_body = Loc.at startpos ref_body.Loc.value in + Loc.at span @@ `Reference (`Simple, ref_body, trim_start children) + } + | ref_body = located(Ref_with_replacement); children = sequence_nonempty(inline_element); endpos = located(RIGHT_BRACE); { + let+ children = children in + let startpos = Loc.nudge_map_start (-3) ref_body in + let ref_body = Loc.same startpos ref_body.Loc.value in + let span = Loc.delimited startpos endpos in + Loc.at span @@ `Reference (`With_text, ref_body, trim_start children) + } + | ref_body = located(Ref_with_replacement); endpos = located(RIGHT_BRACE); { + let startpos = Loc.nudge_map_start (-3) ref_body in + let span = Loc.delimited startpos endpos in + let node = Loc.at span @@ `Reference (`With_text, ref_body, []) in + let warning = + let what = Tokens.describe @@ Ref_with_replacement (Loc.value ref_body) in + Writer.Warning (Parse_error.should_not_be_empty ~what span) + in + Writer.with_warning node warning + } + | ref_body = located(Ref_with_replacement); children = sequence_nonempty(inline_element)?; endpos = located(END); { + let Loc.{value = ref_body_value; location} = ref_body in + let startpos = Loc.nudge_start (-3) location in + let ref_body : string Loc.with_location = Loc.at startpos ref_body_value in + let span = Loc.delimited ref_body endpos in + let not_allowed = + let in_what = Tokens.describe (Ref_with_replacement ref_body_value) in + Writer.Warning (Parse_error.end_not_allowed ~in_what span) + in + let* children = Option.value ~default:(return []) children in + let node = Loc.at span @@ `Reference (`With_text, ref_body, children) in + Writer.with_warning node not_allowed } let link := - | link_body = Simple_link; RIGHT_BRACE; + | link_body = located(Simple_link); endpos = located(RIGHT_BRACE); { - let node = `Link (link_body, []) in + let span = Loc.delimited (Loc.nudge_map_start (-2) link_body) endpos in + let link_body = link_body.Loc.value in + let node = Loc.at span @@ `Link (link_body, []) in let url = String.trim link_body in if "" = url then let what = Tokens.describe @@ Simple_link link_body in let warning = - let span = Loc.of_position $sloc in Writer.Warning (Parse_error.should_not_be_empty ~what span) in Writer.with_warning node warning else return node } - | link_body = Link_with_replacement; children = sequence_nonempty(locatedM(inline_element)); RIGHT_BRACE; + | link_body = located(Link_with_replacement); children = sequence_nonempty(inline_element); endpos = located(RIGHT_BRACE); { let* c = children in - let node = `Link (link_body, c) in + let span = Loc.delimited link_body endpos in + let link_body = link_body.Loc.value in + let node = Loc.at span @@ `Link (link_body, c) in if "" = link_body then let what = Tokens.describe @@ Link_with_replacement link_body in - let span = Loc.of_position $sloc in let warning = Writer.Warning (Parse_error.should_not_be_empty ~what span) in @@ -598,13 +615,13 @@ let link := else return node } - | link_body = Link_with_replacement; RIGHT_BRACE; + | link_body = located(Link_with_replacement); endpos = located(RIGHT_BRACE); { let span = - Loc.of_position $sloc - |> Loc.nudge_start (String.length "{{!") + Loc.delimited link_body endpos in - let node = `Link (link_body, []) in + let link_body = link_body.Loc.value in + let node = Loc.at span @@ `Link (link_body, []) in let what = Tokens.describe @@ Link_with_replacement link_body in let warning = Writer.Warning (Parse_error.should_not_be_empty ~what span) @@ -615,8 +632,8 @@ let link := (* LIST *) let list_light_item_unordered == - | MINUS; ~ = locatedM(nestable_block_element); <> - | horizontal_whitespace; MINUS; item = locatedM(nestable_block_element); + | MINUS; ~ = nestable_block_element; <> + | horizontal_whitespace; MINUS; item = nestable_block_element; { let warning = let span = Loc.of_position $sloc in @@ -626,8 +643,8 @@ let list_light_item_unordered == } let list_light_item_ordered == - | PLUS; ~ = locatedM(nestable_block_element); <> - | horizontal_whitespace; PLUS; item = locatedM(nestable_block_element); + | PLUS; ~ = nestable_block_element; <> + | horizontal_whitespace; PLUS; item = nestable_block_element; { let warning = let span = Loc.of_position $sloc in @@ -637,121 +654,99 @@ let list_light_item_ordered == } let list_light := - | children = separated_nonempty_list(Single_newline, list_light_item_ordered); - { let* children = Writer.sequence children in return @@ `List (`Ordered, `Light, [ children ]) } - | children = separated_nonempty_list(Single_newline, list_light_item_unordered); - { let* children = Writer.sequence children in return @@ `List (`Unordered, `Light, [ children ]) } + | children = separated_nonempty_sequence(Single_newline, list_light_item_ordered); { + let* children = children in + let span = Loc.span @@ List.map Loc.location children in + let inner = Loc.at span @@ `List (`Ordered, `Light, [ children ]) in + return inner + } + | children = separated_nonempty_sequence(Single_newline, list_light_item_unordered); { + let* children = children in + let span = Loc.span @@ List.map Loc.location children in + let inner = Loc.at span @@ `List (`Unordered, `Light, [ children ]) in + return inner + } -let item_heavy == - | LI; whitespace; items = sequence(locatedM(nestable_block_element)); RIGHT_BRACE; whitespace?; - { - let warning = - let span = Loc.of_position $sloc in - Writer.Warning (Parse_error.should_not_be_empty ~what:(Tokens.describe LI) span) - in - Writer.ensure not_empty warning items - } - | LI; items = sequence(locatedM(nestable_block_element)); RIGHT_BRACE; whitespace?; - { - let warning = - let span = Loc.of_position $sloc in - Writer.Warning (Parse_error.should_be_followed_by_whitespace ~what:(Tokens.describe LI) span) - in - let writer = - let warning = - let span = Loc.of_position $sloc in - Writer.Warning (Parse_error.should_not_be_empty ~what:(Tokens.describe LI) span) - in - Writer.ensure not_empty warning items - in - Writer.warning warning writer - } - | DASH; whitespace?; items = sequence(locatedM(nestable_block_element)); RIGHT_BRACE; whitespace?; - { - let warning = - let span = Loc.of_position $sloc in - Writer.Warning (Parse_error.should_not_be_empty ~what:(Tokens.describe LI) span) - in - Writer.ensure not_empty warning items - } - | LI; whitespace?; items = sequence(locatedM(nestable_block_element))?; END; - { - let warning = - let span = Loc.of_position $sloc in - Writer.Warning (Parse_error.end_not_allowed ~in_what:(Tokens.describe LI) span) - in - match items with - | Some items -> - let writer = - let warning = - let span = Loc.of_position $sloc in - Writer.Warning (Parse_error.should_not_be_empty ~what:(Tokens.describe LI) span) - in - Writer.ensure not_empty warning items - in - Writer.warning warning writer - | None -> - Writer.with_warning [] warning - } - | DASH; whitespace?; items = sequence(locatedM(nestable_block_element))?; END; - { - let warning = - let span = Loc.of_position $sloc in - Writer.Warning (Parse_error.end_not_allowed ~in_what:(Tokens.describe DASH) span) +let item_heavy := + | startpos = located(LI); whitespace; items = sequence(nestable_block_element); endpos = located(RIGHT_BRACE); whitespace?; { + let span = Loc.delimited startpos endpos in + let warning = + Writer.Warning (Parse_error.should_not_be_empty ~what:(Tokens.describe LI) span) + in + Writer.ensure not_empty warning items + } + | startpos = located(LI); items = sequence(nestable_block_element); endpos = located(RIGHT_BRACE); whitespace?; { + let span = Loc.delimited startpos endpos in + let should_be_followed_by_whitespace = + Writer.Warning (Parse_error.should_be_followed_by_whitespace ~what:(Tokens.describe LI) span) + in + let should_not_be_empty = + Writer.Warning (Parse_error.should_not_be_empty ~what:(Tokens.describe LI) span) + in + Writer.ensure not_empty should_not_be_empty items + |> Writer.warning should_be_followed_by_whitespace + } + | startpos = located(DASH); whitespace?; items = sequence(nestable_block_element); endpos = located(RIGHT_BRACE); whitespace?; { + let warning = + let span = Loc.delimited startpos endpos in + Writer.Warning (Parse_error.should_not_be_empty ~what:(Tokens.describe LI) span) + in + Writer.ensure not_empty warning items + } + | startpos = located(DASH); whitespace?; items = sequence_nonempty(nestable_block_element)?; endpos = located(END); { + let end_not_allowed = + Writer.Warning (Parse_error.end_not_allowed ~in_what:(Tokens.describe DASH) endpos.Loc.location) + in + match items with + | Some writer -> + Writer.warning end_not_allowed writer + | None -> + let span = Loc.delimited startpos endpos in + let should_not_be_empty = + Writer.Warning (Parse_error.should_not_be_empty ~what:(Tokens.describe DASH) span) in - match items with - | Some items -> - let writer = - let warning = - let span = Loc.of_position $sloc in - Writer.Warning (Parse_error.should_not_be_empty ~what:(Tokens.describe DASH) span) - in - Writer.ensure not_empty warning items - in - Writer.warning warning writer - | None -> - Writer.with_warning [] warning - } + Writer.with_warning [] should_not_be_empty |> Writer.warning end_not_allowed + } let list_heavy := - | list_kind = List; whitespace?; items = sequence_nonempty(item_heavy); RIGHT_BRACE; - { - Writer.map (fun items -> `List (Tokens.ast_list_kind list_kind, `Heavy, items)) items - } - | list_kind = List; RIGHT_BRACE; - { - let warning = - let what = Tokens.describe @@ List list_kind in - let span = Loc.of_position $sloc in - Writer.Warning (Parse_error.should_not_be_empty ~what span) - in - let node = `List (Tokens.ast_list_kind list_kind, `Heavy, []) in - Writer.with_warning node warning - } - | list_kind = List; whitespace?; items = sequence_nonempty(item_heavy); errloc = position(error); - { - let warning = fun input -> - let (start_pos, end_pos) as loc = errloc in - let illegal_input = Loc.extract ~input ~start_pos ~end_pos in - let span = Loc.of_position loc in - let in_what = Tokens.describe @@ List list_kind in - Parse_error.illegal ~in_what illegal_input span - in - let* items = Writer.warning (Writer.InputNeeded warning) items in - return @@ `List (Tokens.ast_list_kind list_kind, `Heavy, items) - } - | list_kind = List; whitespace?; errloc = position(error); - { - let warning = fun input -> - let (start_pos, end_pos) as loc = errloc in - let illegal_input = Loc.extract ~input ~start_pos ~end_pos in - let span = Loc.of_position loc in - let in_what = Tokens.describe (List list_kind) in - Parse_error.illegal ~in_what illegal_input span - in - return @@ `List (Tokens.ast_list_kind list_kind, `Heavy, []) - |> Writer.warning (Writer.InputNeeded warning) - } + | list_kind = located(List); whitespace?; items = sequence_nonempty(item_heavy); endpos = located(RIGHT_BRACE); { + let span = Loc.delimited list_kind endpos in + let* items : Ast.nestable_block_element Loc.with_location list list = items in + let inner = Loc.at span @@ `List (Tokens.ast_list_kind list_kind.Loc.value, `Heavy, items) in + return inner + } + | list_kind = located(List); endpos = located(RIGHT_BRACE); { + let span = Loc.delimited list_kind endpos in + let warning = + let what = Tokens.describe @@ List list_kind.Loc.value in + Writer.Warning (Parse_error.should_not_be_empty ~what span) + in + let node = Loc.at span @@ `List (Tokens.ast_list_kind list_kind.Loc.value, `Heavy, []) in + Writer.with_warning node warning + } + | list_kind = located(List); whitespace?; items = sequence_nonempty(item_heavy); errloc = position(error); { + let span = Loc.(span [list_kind.location; Loc.of_position errloc]) in + let warning = fun input -> + let (start_pos, end_pos) = errloc in + let illegal_input = Loc.extract ~input ~start_pos ~end_pos in + let in_what = Tokens.describe @@ List list_kind.Loc.value in + Parse_error.illegal ~in_what illegal_input span + in + let* items : Ast.nestable_block_element Loc.with_location list list = Writer.warning (Writer.InputNeeded warning) items in + let inner = Loc.at span @@ `List (Tokens.ast_list_kind list_kind.Loc.value, `Heavy, items) in + return inner + } + | list_kind = located(List); whitespace?; errloc = position(error); { + let span = Loc.(span [list_kind.location; Loc.of_position errloc]) in + let warning = fun input -> + let (start_pos, end_pos) = errloc in + let illegal_input = Loc.extract ~input ~start_pos ~end_pos in + let in_what = Tokens.describe (List list_kind.Loc.value) in + Parse_error.illegal ~in_what illegal_input span + in + let inner = Loc.at span @@ `List (Tokens.ast_list_kind list_kind.Loc.value, `Heavy, []) in + Writer.with_warning inner (Writer.InputNeeded warning) + } let list_element := | ~ = list_light; <> @@ -760,11 +755,11 @@ let list_element := (* TABLES *) let cell_heavy := - | cell_kind = Table_cell; whitespace?; children = sequence_nonempty(locatedM(nestable_block_element)); RIGHT_BRACE; whitespace?; + | cell_kind = Table_cell; whitespace?; children = sequence_nonempty(nestable_block_element); RIGHT_BRACE; whitespace?; { Writer.map (fun c -> (c, cell_kind)) children } | cell_kind = Table_cell; RIGHT_BRACE; whitespace?; { return ([], cell_kind) } - | cell_kind = Table_cell; children = sequence_nonempty(locatedM(nestable_block_element))?; errloc = position(error); + | cell_kind = Table_cell; children = sequence_nonempty(nestable_block_element)?; errloc = position(error); { let warning = fun input -> let (start_pos, end_pos) as loc = errloc in @@ -794,12 +789,16 @@ let row_heavy := |> Writer.warning (Writer.InputNeeded warning) } -let table_heavy := - | TABLE_HEAVY; whitespace?; grid = sequence_nonempty(row_heavy); RIGHT_BRACE; - { Writer.map (fun g -> `Table ((g, None), `Heavy)) grid } - | TABLE_HEAVY; RIGHT_BRACE; - { return (`Table (([], None), `Heavy)) } - | TABLE_HEAVY; whitespace?; grid = sequence_nonempty(row_heavy)?; errloc = position(error); +let table_heavy := + | grid = delimited_location(TABLE_HEAVY, whitespace?; sequence_nonempty(row_heavy), RIGHT_BRACE); { + Writer.map (Loc.map (fun grid -> `Table ((grid, None), `Heavy))) (Writer.sequence_loc grid) + } + | startpos = located(TABLE_HEAVY); endpos = located(RIGHT_BRACE); { + let span = Loc.(span [startpos.location; endpos.location]) in + let inner = Loc.at span @@ `Table (([], None), `Heavy) in + return inner + } + | startpos = located(TABLE_HEAVY); whitespace?; grid = sequence_nonempty(row_heavy)?; errloc = position(error); { let warning = fun input -> let (start_pos, end_pos) as loc = errloc in @@ -808,26 +807,25 @@ let table_heavy := let in_what = Tokens.describe TABLE_HEAVY in Parse_error.illegal ~in_what illegal_input span in + let span = Loc.(span [startpos.location; (Loc.of_position errloc)]) in Option.value ~default:(return []) grid - |> Writer.map (fun grid -> `Table ((grid, None), `Heavy)) + |> Writer.map (fun grid -> Loc.at span @@ `Table ((grid, None), `Heavy)) |> Writer.warning (Writer.InputNeeded warning) } (* LIGHT TABLE *) let table_light_legal_elt := - (* Single token inline elements which are mostly handled in the lexer *) - | s = horizontal_whitespace; { return s } - | c = Code_span; { return @@ `Code_span c } - | m = Raw_markup; { return @@ `Raw_markup m } - | w = Word; { return @@ `Word w } - | m = Math_span; { return @@ `Math_span m } - (* More complex/recursive inline elements should have their own rule *) + | s = located(horizontal_whitespace); { return s } + | c = located(Code_span); { return @@ Loc.map (fun c -> `Code_span c) c } + | m = located(Raw_markup); { return @@ Loc.map (fun m -> `Raw_markup m) m } + | w = located(Word); { return @@ Loc.map (fun w -> `Word w) w } + | m = located(Math_span); { return @@ Loc.map (fun m -> `Math_span m) m } | ~ = style; <> | ~ = reference; <> | ~ = link; <> -let cell_content_light := ~ = sequence_nonempty(locatedM(table_light_legal_elt)); <> +let cell_content_light := ~ = sequence_nonempty(table_light_legal_elt); <> let cell := | ~ = cell_content_light; <> | ~ = cell_content_light; BAR; <> @@ -840,28 +838,35 @@ let row_light := let rows_light := ~ = sequence_nonempty(row_light); <> -let table_start_light := TABLE_LIGHT; whitespace?; {} +let table_start_light := startpos = located(TABLE_LIGHT); whitespace?; { startpos } let table_light := - | table_start_light; data = rows_light; RIGHT_BRACE; { Writer.map construct_table data } - | table_start_light; RIGHT_BRACE; - { return @@ `Table (([[]], None), `Light) } - | table_start_light; data = rows_light; END; - { - let in_what = Tokens.describe TABLE_LIGHT in - let warning = - let span = Loc.of_position $sloc in - Writer.Warning (Parse_error.end_not_allowed ~in_what span) - in - unclosed_table ~data warning - } - | TABLE_LIGHT; any_whitespace?; END; + | startpos = table_start_light; data = rows_light; endpos = located(RIGHT_BRACE); { + let span = Loc.delimited startpos endpos in + Writer.map (construct_table ~span) data + } + | startpos = table_start_light; endpos = located(RIGHT_BRACE); { + let span = Loc.delimited startpos endpos in + let inner = Loc.at span @@ `Table (([[]], None), `Light) in + return inner + } + | startpos = table_start_light; data = rows_light; endpos = located(END); { + let in_what = Tokens.describe TABLE_LIGHT in + let warning = + let span = Loc.of_position $sloc in + Writer.Warning (Parse_error.end_not_allowed ~in_what span) + in + let span = Loc.delimited startpos endpos in + unclosed_table ~span ~data warning + } + | startpos = located(TABLE_LIGHT); any_whitespace?; endpos = located(END); { let in_what = Tokens.describe TABLE_LIGHT in let warning = let span = Loc.of_position $sloc in Writer.Warning (Parse_error.end_not_allowed ~in_what span) in - unclosed_table warning + let span = Loc.delimited startpos endpos in + unclosed_table ~span warning } let table := @@ -878,14 +883,16 @@ let media := let (located_media_kind, media_href) = split_simple_media media in let wrapped_located_kind = Loc.map href_of_media located_media_kind in let kind = media_kind_of_target media_href in - return @@ `Media (`Simple, wrapped_located_kind, "", kind) + let inner = Loc.(at media.location @@ `Media (`Simple, wrapped_located_kind, "", kind)) in + return inner } | media = located(Media_with_replacement); whitespace*; { let (located_media_kind, media_href, content) = split_replacement_media media in let wrapped_located_kind = Loc.map href_of_media located_media_kind in let kind = media_kind_of_target media_href in - return @@ `Media (`With_text, wrapped_located_kind, content, kind) + let inner = Loc.(at media.location @@ `Media (`With_text, wrapped_located_kind, content, kind)) in + return inner } (* TOP-LEVEL ELEMENTS *) @@ -914,84 +921,93 @@ let paragraph_style := Writer.warning warning ws } -let verbatim := v = Verbatim; - { - let what = Tokens.describe @@ Verbatim v in - let warning = - let span = Loc.of_position $sloc in - Writer.Warning (Parse_error.should_not_be_empty ~what span) - in - Writer.ensure has_content warning (return v) - |> Writer.map (fun v -> `Verbatim v) - } - -let paragraph := - | items = sequence_nonempty(locatedM(inline_element)); - { Writer.map (fun i -> `Paragraph (trim_start i)) items } - -let code_block := c = Code_block; - { - - return (`Code_block c) } - -let math_block := m = Math_block; - { - let what = Tokens.describe @@ Math_block m in - let warning = - let span = Loc.of_position $sloc in - Writer.Warning (Parse_error.should_not_be_empty ~what span) - in - Writer.ensure has_content warning (return m) - |> Writer.map (fun m -> `Math_block m) - } - -let modules := - | MODULES; modules = sequence(locatedM(inline_element)); RIGHT_BRACE; - { - let in_what = Tokens.describe MODULES in - let* modules = modules in - let not_allowed = - let span = Loc.span @@ List.map Loc.location modules in - let first_offending = - List.find_opt - (function - | `Word _ | `Space _ -> false - | _ -> true) - (List.map Loc.value modules : Ast.inline_element list) - in - let what = Option.map Tokens.describe_inline first_offending |> Option.value ~default:String.empty in - Writer.Warning (Parse_error.not_allowed ~what ~in_what span) - in - let is_empty = - let span = Loc.span @@ List.map Loc.location modules in - let what = Tokens.describe MODULES in - Writer.Warning (Parse_error.should_not_be_empty ~what span) - in - let inner = `Modules (List.map (Loc.map inline_element_inner) modules) in - List.fold_left (fun writer (f, w) -> Writer.ensure f w writer) (return modules) [(not_empty, is_empty); (legal_module_list, not_allowed)] - |> Writer.map (Fun.const inner) - } - | MODULES; modules = sequence(locatedM(inline_element)); END; - { - let in_what = Tokens.describe MODULES in - let* modules = modules in +let verbatim := verbatim = located(Verbatim); { + let Loc.{ value; location } = verbatim in + let what = Tokens.describe @@ Verbatim value in + let warning = + let span = Loc.of_position $sloc in + Writer.Warning (Parse_error.should_not_be_empty ~what span) + in + let location = Loc.nudge_start (-String.length "{v ") location in + let verbatim = Loc.at location @@ `Verbatim value in + Writer.ensure has_content warning (return value) + |> Writer.map (Fun.const verbatim) +} + +let paragraph := items = sequence_nonempty(inline_element); { + Writer.map paragraph items +} + +let code_block := c = Code_block; { + let span = Loc.of_position $sloc in + return (Loc.at span @@ `Code_block c) +} + +let math_block := m = Math_block; { + let what = Tokens.describe @@ Math_block m in + let span = Loc.of_position $sloc in + let warning = + Writer.Warning (Parse_error.should_not_be_empty ~what span) + in + Writer.ensure has_content warning (return m) + |> Writer.map (fun m -> Loc.at span @@ `Math_block m) +} + +let modules := startpos = located(MODULES); modules = sequence(inline_element); endpos = located(RIGHT_BRACE); { + let in_what = Tokens.describe MODULES in + let* modules = modules in + let not_allowed = let span = Loc.span @@ List.map Loc.location modules in - let not_allowed = - let first_offending = - List.find_opt - (function - | `Word _ | `Space _ -> false - | _ -> true) - (List.map Loc.value modules : Ast.inline_element list) - in - let what = Option.map Tokens.describe_inline first_offending |> Option.value ~default:String.empty in - Writer.Warning (Parse_error.not_allowed ~what ~in_what span) + let first_offending = + List.find_opt + (function + | `Word _ | `Space _ -> false + | _ -> true) + (List.map Loc.value modules : Ast.inline_element list) in - let unexpected_end = - Writer.Warning (Parse_error.end_not_allowed ~in_what:(Tokens.describe MODULES) span) + let what = Option.map Tokens.describe_inline first_offending |> Option.value ~default:String.empty in + Writer.Warning (Parse_error.not_allowed ~what ~in_what span) + in + let is_empty = + let span = Loc.span @@ List.map Loc.location modules in + let what = Tokens.describe MODULES in + Writer.Warning (Parse_error.should_not_be_empty ~what span) + in + let span = Loc.(span [startpos.location; endpos.location]) in + let inner = Loc.at span @@ `Modules (List.map (Loc.map inline_element_inner) modules) in + (* Test the content for errors *) + let* _ = List.fold_left + (fun writer (f, w) -> Writer.ensure f w writer) + (return modules) + [ + (* predicate + error pairs *) + (not_empty, is_empty); + (legal_module_list, not_allowed) + ] + in + return inner + } + | startpos = located(MODULES); modules = sequence(inline_element); endpos = located(END); { + let in_what = Tokens.describe MODULES in + let* modules = modules in + let span = Loc.span @@ List.map Loc.location modules in + let not_allowed = + let first_offending = + List.find_opt + (function + | `Word _ | `Space _ -> false + | _ -> true) + (List.map Loc.value modules : Ast.inline_element list) in - let inner = `Modules (List.map (Loc.map inline_element_inner) modules) in - return inner - |> Writer.warning not_allowed - |> Writer.warning unexpected_end - } + let what = Option.map Tokens.describe_inline first_offending |> Option.value ~default:String.empty in + Writer.Warning (Parse_error.not_allowed ~what ~in_what span) + in + let unexpected_end = + Writer.Warning (Parse_error.end_not_allowed ~in_what:(Tokens.describe MODULES) span) + in + let span = Loc.(span [startpos.location; endpos.location]) in + let inner = Loc.at span @@ `Modules (List.map (Loc.map inline_element_inner) modules) in + return inner + |> Writer.warning not_allowed + |> Writer.warning unexpected_end + } diff --git a/src/parser/writer.ml b/src/parser/writer.ml index 728f4bb186..9e6aaaded5 100644 --- a/src/parser/writer.ml +++ b/src/parser/writer.ml @@ -41,6 +41,9 @@ let sequence : 'a t list -> 'a list t = let xs, ws = List.fold_left go ([], []) xs in Writer (List.rev xs, ws) +let sequence_loc : 'a t Loc.with_location -> 'a Loc.with_location t = + fun { value; location } -> map (Loc.at location) value + let map2 : ('a -> 'b -> 'c) -> 'a t -> 'b t -> 'c t = fun f (Writer (a, ws)) (Writer (b, wsb)) -> Writer (f a b, wsb @ ws) From d2737ebf2685438ee634a5ae0c68ffc3caa8b5ff Mon Sep 17 00:00:00 2001 From: faycarsons Date: Mon, 30 Dec 2024 15:12:04 -0500 Subject: [PATCH 108/150] Fix Tag and Math block/span locations --- src/parser/lexer.mll | 49 ++++---- src/parser/loc.ml | 2 + src/parser/loc.mli | 2 + src/parser/odoc_parser.ml | 1 + src/parser/parser.mly | 236 ++++++++++++++++++++++---------------- src/parser/tokens.ml | 12 +- 6 files changed, 171 insertions(+), 131 deletions(-) diff --git a/src/parser/lexer.mll b/src/parser/lexer.mll index adf337992f..003d0d957f 100644 --- a/src/parser/lexer.mll +++ b/src/parser/lexer.mll @@ -36,16 +36,6 @@ let unescape_word : string -> string = fun s -> scan_word 0; Buffer.contents buffer -type math_kind = - Inline | Block - -let math_constr kind x = - match kind with - | Inline -> Math_span x - | Block -> Math_block x - -let copy : 'a -> 'a = fun t -> - Obj.magic t |> Obj.dup |> Obj.magic let update_content_newlines : content:string -> Lexing.lexbuf -> unit = fun ~content lexbuf -> @@ -160,6 +150,16 @@ type input = { mutable warnings : Warning.t list; } + +type math_kind = + Inline | Block + +let math_constr input kind x start_offset = + let start_pos = input.offset_to_location start_offset in + match kind with + | Inline -> Math_span Tokens.{ start = start_pos; content = x } + | Block -> Math_block Tokens.{ start = start_pos; content = x } + let with_loc : Lexing.lexbuf -> input -> Loc.span -> ('a -> 'a Loc.with_location) = fun _lexbuf _input location -> Loc.at location @@ -473,10 +473,10 @@ and token input = parse { Style Subscript } | "{math" space_char - { math Block (Buffer.create 1024) 0 (Lexing.lexeme_start lexbuf) (copy lexbuf.lex_curr_p) input lexbuf } + { math Block (Buffer.create 1024) 0 (Lexing.lexeme_start lexbuf) input lexbuf } | "{m" horizontal_space - { math Inline (Buffer.create 1024) 0 (Lexing.lexeme_start lexbuf) (copy lexbuf.lex_curr_p) input lexbuf } + { math Inline (Buffer.create 1024) 0 (Lexing.lexeme_start lexbuf) input lexbuf } | "{!modules:" @@ -740,24 +740,23 @@ and code_span buffer nesting_level start_offset input = parse { Buffer.add_char buffer c; code_span buffer nesting_level start_offset input lexbuf } -and math kind buffer nesting_level start_offset start_pos input = parse +and math kind buffer nesting_level start_offset input = parse | '}' { if nesting_level == 0 then ( - lexbuf.lex_start_p <- start_pos; - math_constr kind (Buffer.contents buffer)) + math_constr input kind (Buffer.contents buffer)) start_offset else ( Buffer.add_char buffer '}'; - math kind buffer (pred nesting_level) start_offset start_pos input lexbuf) + math kind buffer (pred nesting_level) start_offset input lexbuf) } | '{' { Buffer.add_char buffer '{'; - math kind buffer (succ nesting_level) start_offset start_pos input lexbuf + math kind buffer (succ nesting_level) start_offset input lexbuf } | ("\\{" | "\\}") as s { Buffer.add_string buffer s; - math kind buffer nesting_level start_offset start_pos input lexbuf } + math kind buffer nesting_level start_offset input lexbuf } | (newline) as s { Lexing.new_line lexbuf; @@ -765,28 +764,28 @@ and math kind buffer nesting_level start_offset start_pos input = parse | Inline -> let not_allowed = Parse_error.not_allowed - ~what:(Tokens.describe (Blank_line "\n")) - ~in_what:(Tokens.describe (math_constr kind "")) + ~what:(Tokens.describe (Single_newline "\n")) + ~in_what:(Tokens.describe (math_constr input kind "" start_offset)) in warning lexbuf input not_allowed; Buffer.add_char buffer '\n'; - math kind buffer nesting_level start_offset start_pos input lexbuf + math kind buffer nesting_level start_offset input lexbuf | Block -> Buffer.add_string buffer s; - math kind buffer nesting_level start_offset start_pos input lexbuf + math kind buffer nesting_level start_offset input lexbuf } | eof { let unexpected_eof = Parse_error.end_not_allowed - ~in_what:(Tokens.describe (math_constr kind "")) + ~in_what:(Tokens.describe (math_constr input kind "" start_offset)) in warning lexbuf input unexpected_eof; - math_constr kind (Buffer.contents buffer) + math_constr input kind (Buffer.contents buffer) start_offset } | _ as c { Buffer.add_char buffer c; - math kind buffer nesting_level start_offset start_pos input lexbuf } + math kind buffer nesting_level start_offset input lexbuf } and media tok_descr buffer nesting_level start_offset input = parse | '}' diff --git a/src/parser/loc.ml b/src/parser/loc.ml index 880e150e93..cf42693b8b 100644 --- a/src/parser/loc.ml +++ b/src/parser/loc.ml @@ -9,6 +9,8 @@ let map f annotated = { annotated with value = f annotated.value } let is predicate { value; _ } = predicate value let same annotated value = { annotated with value } +let dummy_pos : point = { line = -1; column = -1 } + let of_position : ?filename:string -> Lexing.position * Lexing.position -> span = fun ?filename (start, end_) -> diff --git a/src/parser/loc.mli b/src/parser/loc.mli index 007aab2c13..f85c24f185 100644 --- a/src/parser/loc.mli +++ b/src/parser/loc.mli @@ -8,6 +8,8 @@ type point = { line : int; column : int } (** A specific character *) +val dummy_pos : point + type span = { file : string; start : point; end_ : point } (** A range of characters between [start] and [end_] in a particular file *) diff --git a/src/parser/odoc_parser.ml b/src/parser/odoc_parser.ml index ed1b0a19e6..7626e26310 100644 --- a/src/parser/odoc_parser.ml +++ b/src/parser/odoc_parser.ml @@ -120,6 +120,7 @@ let parse_comment : location:Lexing.position -> text:string -> t = (* We cannot directly pass parameters to Menhir without converting our parser to a module functor. So we pass our current filename to the lexbuf here *) Lexing.(set_filename lexbuf location.pos_fname); + Lexing.(set_position lexbuf location); let lexer_state = Lexer. { diff --git a/src/parser/parser.mly b/src/parser/parser.mly index 9a056770af..4aa6c6caf7 100644 --- a/src/parser/parser.mly +++ b/src/parser/parser.mly @@ -202,8 +202,8 @@ let legal_module_list : Ast.inline_element Loc.with_location list -> bool = %token MODULES "{!modules:" -%token Math_span "{m" -%token Math_block "{math" +%token Math_span "{m" +%token Math_block "{math" %token Raw_markup "{%%}" @@ -251,6 +251,10 @@ let legal_module_list : Ast.inline_element Loc.with_location list -> bool = %token END +%type tag_bare +%type tag_with_content +%type tag + %start main %% @@ -297,9 +301,9 @@ let main := let toplevel := | block = nestable_block_element; { Writer.map (Loc.map @@ fun b -> (b :> Ast.block_element) ) block } - | t = locatedM(tag); { Writer.map (Loc.map @@ fun t -> `Tag t) t } + | t = tag; { Writer.map (fun loc -> Loc.{ loc with value = `Tag loc.value }) t } | ~ = locatedM(section_heading); <> - | ~ = locatedM(toplevel_error); <> + | ~ = toplevel_error; <> let toplevel_error := | brace = located(RIGHT_BRACE); @@ -310,41 +314,42 @@ let toplevel_error := Writer.Warning (Parse_error.bad_markup what span) in let as_text = Loc.same brace @@ `Word "{" in - let node = (`Paragraph [ as_text ]) in + let node = (Loc.same brace @@ `Paragraph [ as_text ]) in Writer.with_warning node warning } - | t = tag; RIGHT_BRACE; - { - let* tag_descr = Writer.map Tokens.describe_tag (t : Ast.tag Writer.t) in - let warning = - let span = Loc.of_position $sloc in - let what = Tokens.describe RIGHT_BRACE in - Writer.Warning (Parse_error.not_allowed ~what ~in_what:tag_descr span) - in - let ret = Writer.map (fun t -> ( `Tag t : Ast.block_element )) t - |> Writer.warning warning - in - (ret : Ast.block_element Writer.t) - } - | elt = inline_element; errloc = position(RIGHT_BRACE); - { - let span = Loc.of_position errloc in - let warning = - Writer.Warning (Parse_error.unpaired_right_brace span) - in - let* elt = Writer.warning warning elt in - return @@ `Paragraph [elt; Loc.at span @@ `Word "}"] - } - | elt = inline_element; RIGHT_CODE_DELIMITER; - { let warning = - let span = Loc.of_position $sloc in - let what = Tokens.describe RIGHT_CODE_DELIMITER in - let in_what = Tokens.describe_inline @@ Writer.unwrap_located elt in - Writer.Warning (Parse_error.not_allowed ~what ~in_what span) - in - let* elt = Writer.warning warning elt in - return @@ `Paragraph [elt] - } + | t = tag; RIGHT_BRACE; { + let* tag_descr = Writer.map (fun t -> Tokens.describe_tag @@ t.Loc.value) t in + let warning = + let span = Loc.of_position $sloc in + let what = Tokens.describe RIGHT_BRACE in + Writer.Warning (Parse_error.not_allowed ~what ~in_what:tag_descr span) + in + let ret = + Writer.map (fun loc -> Loc.{ loc with value = ( `Tag loc.value : Ast.block_element ) }) t + |> Writer.warning warning + in + (ret : Ast.block_element Loc.with_location Writer.t) + } + | elt = inline_element; errloc = located(RIGHT_BRACE); { + let* elt = elt in + let span = Loc.delimited elt errloc in + let warning = + Writer.Warning (Parse_error.unpaired_right_brace span) + in + let inner = Loc.at span @@ `Paragraph [elt; Loc.at span @@ `Word "}"] in + Writer.with_warning inner warning + } + | elt = inline_element; endpos = located(RIGHT_CODE_DELIMITER); { + let span = Loc.delimited (Writer.unwrap elt) endpos in + let warning = + let what = Tokens.describe RIGHT_CODE_DELIMITER in + let in_what = Tokens.describe_inline @@ Writer.unwrap_located elt in + Writer.Warning (Parse_error.not_allowed ~what ~in_what span) + in + let* elt = elt in + let inner = Loc.at span @@ `Paragraph [elt] in + Writer.with_warning inner warning + } (* SECTION HEADING *) @@ -379,75 +384,99 @@ let section_heading := (* TAGS *) let tag == - | with_content = tag_with_content; Single_newline?; { (with_content : Ast.tag Writer.t) } - | bare = tag_bare; Single_newline?; { (bare : Ast.tag Writer.t) } + | with_content = tag_with_content; Single_newline?; { with_content } + | bare = tag_bare; Single_newline?; { bare } let tag_with_content := - | DEPRECATED; children = sequence_nonempty(nestable_block_element); - { Writer.map (fun c -> `Deprecated c) children } - | DEPRECATED; - { return @@ `Deprecated [] } - | DEPRECATED; RIGHT_BRACE; - { - let warning = - Writer.Warning (Parse_error.unpaired_right_brace @@ Loc.of_position $sloc) - in - Writer.with_warning (`Deprecated []) warning - } - | RETURN; children = sequence_nonempty(nestable_block_element); - { Writer.map (fun c -> `Return c) children } + | startpos = located(DEPRECATED); children = located(sequence_nonempty(nestable_block_element)); { + let span = Loc.delimited startpos children in + Writer.map (fun c -> Loc.at span @@ `Deprecated c) children.Loc.value + } + | pos = located(DEPRECATED); + { return @@ { pos with Loc.value = `Deprecated [] } } + | pos = located(DEPRECATED); errloc = located(RIGHT_BRACE); { + let warning = + Writer.Warning (Parse_error.unpaired_right_brace @@ errloc.Loc.location) + in + Writer.with_warning ({ pos with Loc.value = `Deprecated [] }) warning + } + | startpos = located(RETURN); children = located(sequence_nonempty(nestable_block_element)); { + let span = Loc.delimited startpos children in + Writer.map (fun c -> Loc.at span @@ `Return c) children.Loc.value + } | ~ = before; <> | ~ = raise; <> | ~ = see; <> | ~ = param; <> let before := - | version = Before; children = sequence_nonempty(nestable_block_element); - { Writer.map (fun c -> `Before (version, c)) children } - | version = Before; - { return @@ `Before (version, []) } + | content = located(Before); children = sequence_nonempty(nestable_block_element); { + let* children = children in + let span = Loc.span @@ content.Loc.location :: List.map Loc.location children in + let inner = Loc.at span @@ `Before (content.Loc.value, children) in + return inner + } + | version = located(Before); { + return { version with value = `Before (version.Loc.value, []) } + } let raise := - | exn = Raise; children = sequence_nonempty(nestable_block_element); - { Writer.map (fun c -> `Raise (exn, c)) children } - | exn = Raise; - { return @@ `Raise (exn, []) } + | exn = located(Raise); children = located(sequence_nonempty(nestable_block_element)); { + let span = Loc.delimited exn children in + let exn = exn.Loc.value in + let* children = children.Loc.value in + let inner = Loc.at span @@ `Raise (exn, children) in + return inner + } + | exn = located(Raise); { + return { exn with Loc.value = `Raise (exn.Loc.value, []) } + } let see := - | (kind, href) = See; children = sequence_nonempty(nestable_block_element); + | content = located(See); children = located(sequence_nonempty(nestable_block_element)); { - let* children = children in - return @@ `See (Tokens.to_ast_ref kind, href, children) + let span = Loc.delimited content children in + let (kind, href) = content.Loc.value in + let* children = children.Loc.value in + let inner = Loc.at span @@ `See (Tokens.to_ast_ref kind, href, children) in + return inner } - | (kind, href) = See; - { return @@ `See (Tokens.to_ast_ref kind, href, []) } + | content = located(See); { + let (kind, href) = content.Loc.value in + return { content with value = `See (Tokens.to_ast_ref kind, href, []) } + } let param := - | ident = Param; children = sequence_nonempty(nestable_block_element); - { Writer.map (fun c -> `Param (ident, c)) children } - | ident = Param; - { return @@ `Param (ident, [])} + | content = located(Param); children = sequence_nonempty(nestable_block_element); { + let* children = children in + let span = Loc.span @@ content.Loc.location :: List.map Loc.location children in + let ident = content.Loc.value in + let inner = Loc.at span @@ `Param (ident, children) in + return inner + } + | content = located(Param); + { return { content with Loc.value = `Param (content.Loc.value, []) }} let tag_bare := - | version = Version; + | content = located(Version); { + let Loc.{ value = version; location } = content in let what = Tokens.describe (Version version) in let warning = - let span = Loc.of_position $sloc in - Writer.Warning (Parse_error.should_not_be_empty ~what span) + Writer.Warning (Parse_error.should_not_be_empty ~what location) in Writer.ensure has_content warning (return version) - |> Writer.map (fun v -> `Version v) + |> Writer.map (fun v -> { content with value = `Version v }) } - | version = Since; + | content = located(Since); { + let Loc.{ value = version; location } = content in let what = Tokens.describe (Since version) in let warning = - let span = Loc.of_position $sloc in - Writer.Warning (Parse_error.should_not_be_empty ~what span) + Writer.Warning (Parse_error.should_not_be_empty ~what location) in Writer.ensure has_content warning (return version) - |> Writer.map (fun v -> `Since v) + |> Writer.map (fun v -> { content with value = `Since v }) } | impl = located(Canonical); { @@ -457,34 +486,41 @@ let tag_bare := Writer.Warning (Parse_error.should_not_be_empty ~what span) in Writer.ensure (Loc.is has_content) warning @@ return impl - |> Writer.map (fun v -> `Canonical v) + |> Writer.map (fun v -> Loc.at impl.Loc.location @@ `Canonical v) } - | author = Author; + | content = located(Author); { - let what = Tokens.describe @@ Author author in + let Loc.{ value; location } = content in + let what = Tokens.describe @@ Author value in let warning = - let span = Loc.of_position $sloc in - Writer.Warning (Parse_error.should_not_be_empty ~what span) + Writer.Warning (Parse_error.should_not_be_empty ~what location) in - Writer.ensure has_content warning @@ return author - |> Writer.map (fun a -> `Author a) + Writer.ensure has_content warning @@ return value + |> Writer.map (fun a -> { content with value = `Author a }) } - | OPEN; { return `Open } - | INLINE; { return `Inline } - | CLOSED; { return `Closed } - | HIDDEN; { return `Hidden } + | pos = position(OPEN); { let loc = Loc.of_position pos in return @@ Loc.at loc `Open } + | pos = position(INLINE); { let loc = Loc.of_position pos in return @@ Loc.at loc `Inline } + | pos = position(CLOSED); { let loc = Loc.of_position pos in return @@ Loc.at loc `Closed } + | pos = position(HIDDEN); { let loc = Loc.of_position pos in return @@ Loc.at loc `Hidden } (* INLINE ELEMENTS *) -let inline_element := - (* Single token inline elements which are mostly handled in the lexer *) +let inline_element := + | ~ = inline_element_without_whitespace; <> | s = located(inline_elt_legal_whitespace); { return s } + +let inline_element_without_whitespace := + (* Single token inline elements which are mostly handled in the lexer *) | c = located(Code_span); { return @@ Loc.map (fun c -> `Code_span c) c } | m = located(Raw_markup); { return @@ Loc.map (fun m -> `Raw_markup m) m } | w = located(Word); { return @@ Loc.map (fun w -> `Word w) w } - | m = located(Math_span); { return @@ Loc.map (fun m -> `Math_span m) m } + | m = located(Math_span); { + let Loc.{ value = Tokens.{ start; content }; location } = m in + let span = { location with start } in + return @@ Loc.at span (`Math_span content) + } (* More complex/recursive inline elements should have their own rule *) | ~ = style; <> | ~ = reference; <> @@ -817,13 +853,7 @@ let table_heavy := let table_light_legal_elt := | s = located(horizontal_whitespace); { return s } - | c = located(Code_span); { return @@ Loc.map (fun c -> `Code_span c) c } - | m = located(Raw_markup); { return @@ Loc.map (fun m -> `Raw_markup m) m } - | w = located(Word); { return @@ Loc.map (fun w -> `Word w) w } - | m = located(Math_span); { return @@ Loc.map (fun m -> `Math_span m) m } - | ~ = style; <> - | ~ = reference; <> - | ~ = link; <> + | ~ = inline_element_without_whitespace; <> let cell_content_light := ~ = sequence_nonempty(table_light_legal_elt); <> let cell := @@ -943,13 +973,15 @@ let code_block := c = Code_block; { return (Loc.at span @@ `Code_block c) } -let math_block := m = Math_block; { - let what = Tokens.describe @@ Math_block m in - let span = Loc.of_position $sloc in +let math_block := inner = located(Math_block); { + let Loc.{ value ; location } = inner in + let Tokens.{start; content} = value in + let span = { location with start } in + let what = Tokens.describe @@ Math_block value in let warning = Writer.Warning (Parse_error.should_not_be_empty ~what span) in - Writer.ensure has_content warning (return m) + Writer.ensure has_content warning (return content) |> Writer.map (fun m -> Loc.at span @@ `Math_block m) } diff --git a/src/parser/tokens.ml b/src/parser/tokens.ml index dab42e504f..7525258a52 100644 --- a/src/parser/tokens.ml +++ b/src/parser/tokens.ml @@ -11,6 +11,7 @@ type table_cell_kind = Header | Data type list_kind = Ordered | Unordered type internal_reference = URL | File | Document +type math = { start : Loc.point; content : string } let ast_list_kind : list_kind -> Ast.list_kind = function | Ordered -> `Ordered @@ -27,8 +28,9 @@ type token = | MODULES | Media of (media * media_target) | Media_with_replacement of (media * media_target * string) - | Math_span of string - | Math_block of string + (* Start location *) + | Math_span of math + | Math_block of math | Code_span of string | Code_block of Ast.code_block | Word of string @@ -242,7 +244,8 @@ let describe_inline : Ast.inline_element -> string = function | `Space _ -> describe @@ Space "" | `Styled (style, _) -> describe @@ Style (of_ast_style style) | `Code_span _ -> describe @@ Code_span "" - | `Math_span _ -> describe @@ Math_span "" + | `Math_span _ -> + describe @@ Math_span { start = Loc.dummy_pos; content = "" } | `Raw_markup x -> describe @@ Raw_markup x | `Link (l, []) -> describe @@ Simple_link l | `Link (l, _ :: _) -> describe @@ Link_with_replacement l @@ -273,7 +276,8 @@ let describe_nestable_block : Ast.nestable_block_element -> string = function | `List (_, _, _) -> "List" | `Table (_, kind) -> describe @@ if kind = `Light then TABLE_LIGHT else TABLE_HEAVY - | `Math_block _ -> describe @@ Math_block "" + | `Math_block _ -> + describe @@ Math_block { start = Loc.dummy_pos; content = "" } | `Media _ as media -> describe @@ of_media media let of_ast_ref : [ `Document | `File | `Url ] -> internal_reference = function From b598a5cc58b68ff69c3f55a9fd3dfc1d5ff5f116 Mon Sep 17 00:00:00 2001 From: faycarsons Date: Tue, 31 Dec 2024 12:50:18 -0500 Subject: [PATCH 109/150] All block elements return spanning location --- src/parser/parser.mly | 33 +++++++++++++++++++-------------- 1 file changed, 19 insertions(+), 14 deletions(-) diff --git a/src/parser/parser.mly b/src/parser/parser.mly index 4aa6c6caf7..7df662a24b 100644 --- a/src/parser/parser.mly +++ b/src/parser/parser.mly @@ -264,6 +264,7 @@ let legal_module_list : Ast.inline_element Loc.with_location list -> bool = (* Utility which wraps the return value of a rule in `Loc.with_location` *) let locatedM(rule) == inner = rule; { Writer.map (wrap_location $sloc) inner } let located(rule) == inner = rule; { wrap_location $sloc inner } +let with_position(rule) == inner = rule; { (inner, $sloc) } let position(rule) == _ = rule; { $sloc } let sequence(rule) == xs = list(rule); { Writer.sequence xs } let sequence_nonempty(rule) == xs = nonempty_list(rule); { Writer.sequence xs } @@ -302,7 +303,7 @@ let main := let toplevel := | block = nestable_block_element; { Writer.map (Loc.map @@ fun b -> (b :> Ast.block_element) ) block } | t = tag; { Writer.map (fun loc -> Loc.{ loc with value = `Tag loc.value }) t } - | ~ = locatedM(section_heading); <> + | ~ = section_heading; <> | ~ = toplevel_error; <> let toplevel_error := @@ -354,31 +355,35 @@ let toplevel_error := (* SECTION HEADING *) let section_heading := - | (num, title) = Section_heading; children = sequence_nonempty(inline_element); RIGHT_BRACE; - { - Writer.map (fun c -> `Heading (num, title, trim_start c)) children + | content = located(Section_heading); children = sequence_nonempty(inline_element); endpos = located(RIGHT_BRACE); + { + let span = Loc.delimited content endpos in + let (num, title) = content.Loc.value in + Writer.map (fun c -> Loc.at span @@ `Heading (num, title, trim_start c)) children } - | (num, title) = Section_heading; RIGHT_BRACE; + | content = located(Section_heading); endpos = located(RIGHT_BRACE); { + let span = Loc.delimited content endpos + and (num, title) = content.Loc.value in let should_not_be_empty = - let span = Loc.of_position $sloc in let what = Tokens.describe @@ Section_heading (num, title) in Writer.Warning (Parse_error.should_not_be_empty ~what span) in - let node = `Heading (num, title, []) in + let node = Loc.at span @@ `Heading (num, title, []) in Writer.with_warning node should_not_be_empty } - | (num, title) = Section_heading; errloc = position(error); + | content = with_position(Section_heading); end_pos = position(error); { + let (num, title), start_pos = content in + let span = Loc.span @@ List.map Loc.of_position [start_pos; end_pos] in + let start_pos = fst start_pos and end_pos = snd end_pos in let illegal = Writer.InputNeeded (fun input -> - let (start_pos, end_pos) = errloc in - let span = Loc.of_position (start_pos, end_pos) in let err = Loc.extract ~input ~start_pos ~end_pos in let in_what = Tokens.describe @@ Section_heading (num, title) in Parse_error.illegal ~in_what err span) in - return @@ `Heading (num, title, []) - |> Writer.warning illegal + let inner = Loc.at span @@ `Heading (num, title, []) in + Writer.with_warning inner illegal } (* TAGS *) @@ -667,7 +672,7 @@ let link := (* LIST *) -let list_light_item_unordered == +let list_light_item_unordered := | MINUS; ~ = nestable_block_element; <> | horizontal_whitespace; MINUS; item = nestable_block_element; { @@ -678,7 +683,7 @@ let list_light_item_unordered == Writer.warning warning item } -let list_light_item_ordered == +let list_light_item_ordered := | PLUS; ~ = nestable_block_element; <> | horizontal_whitespace; PLUS; item = nestable_block_element; { From e75caa21404b8e9fedbe2333974af13afe1a4e74 Mon Sep 17 00:00:00 2001 From: faycarsons Date: Tue, 31 Dec 2024 17:05:08 -0500 Subject: [PATCH 110/150] fix light lists --- src/parser/parser.mly | 151 ++++++++++++------------------- src/parser/test_driver/tester.ml | 6 +- 2 files changed, 64 insertions(+), 93 deletions(-) diff --git a/src/parser/parser.mly b/src/parser/parser.mly index 7df662a24b..bd3f784445 100644 --- a/src/parser/parser.mly +++ b/src/parser/parser.mly @@ -154,7 +154,7 @@ let paragraph : Ast.inline_element Loc.with_location list -> Ast.nestable_block_element Loc.with_location = fun elts -> - let span = List.map Loc.location elts |> Loc.span in + let span = List.map Loc.location (trim_start elts) |> Loc.span in Loc.at span @@ `Paragraph elts let rec inline_element_inner : Ast.inline_element -> string = function @@ -290,101 +290,69 @@ let any_whitespace := | ~ = whitespace; <> | ~ = Blank_line; <`Space> -let line_breaks := - | Blank_line; {} - | Single_newline; {} - (* ENTRY *) let main := - | nodes = sequence_nonempty(toplevel); whitespace?; END; { nodes } + | ~ = sequence_nonempty(toplevel); END; <> | END; { return [] } let toplevel := - | block = nestable_block_element; { Writer.map (Loc.map @@ fun b -> (b :> Ast.block_element) ) block } + | block = nestable_block_element; { (block :> Ast.block_element Loc.with_location Writer.t) } | t = tag; { Writer.map (fun loc -> Loc.{ loc with value = `Tag loc.value }) t } | ~ = section_heading; <> - | ~ = toplevel_error; <> let toplevel_error := - | brace = located(RIGHT_BRACE); - { - let warning = - let span = Loc.of_position $sloc in - let what = Tokens.describe RIGHT_BRACE in - Writer.Warning (Parse_error.bad_markup what span) - in - let as_text = Loc.same brace @@ `Word "{" in - let node = (Loc.same brace @@ `Paragraph [ as_text ]) in - Writer.with_warning node warning - } - | t = tag; RIGHT_BRACE; { - let* tag_descr = Writer.map (fun t -> Tokens.describe_tag @@ t.Loc.value) t in + | errloc = position(RIGHT_BRACE); { + let span = Loc.of_position errloc in let warning = - let span = Loc.of_position $sloc in let what = Tokens.describe RIGHT_BRACE in - Writer.Warning (Parse_error.not_allowed ~what ~in_what:tag_descr span) - in - let ret = - Writer.map (fun loc -> Loc.{ loc with value = ( `Tag loc.value : Ast.block_element ) }) t - |> Writer.warning warning - in - (ret : Ast.block_element Loc.with_location Writer.t) - } - | elt = inline_element; errloc = located(RIGHT_BRACE); { - let* elt = elt in - let span = Loc.delimited elt errloc in - let warning = - Writer.Warning (Parse_error.unpaired_right_brace span) - in - let inner = Loc.at span @@ `Paragraph [elt; Loc.at span @@ `Word "}"] in - Writer.with_warning inner warning + Writer.Warning (Parse_error.bad_markup what span) + in + let as_text = Loc.at span @@ `Word "{" in + let node = (Loc.same as_text @@ `Paragraph [ as_text ]) in + Writer.with_warning node warning } - | elt = inline_element; endpos = located(RIGHT_CODE_DELIMITER); { - let span = Loc.delimited (Writer.unwrap elt) endpos in + | errloc = position(RIGHT_CODE_DELIMITER); { + let span = Loc.of_position errloc in let warning = - let what = Tokens.describe RIGHT_CODE_DELIMITER in - let in_what = Tokens.describe_inline @@ Writer.unwrap_located elt in - Writer.Warning (Parse_error.not_allowed ~what ~in_what span) - in - let* elt = elt in - let inner = Loc.at span @@ `Paragraph [elt] in - Writer.with_warning inner warning + let what = Tokens.describe RIGHT_BRACE in + Writer.Warning (Parse_error.bad_markup what span) + in + let as_text = Loc.at span @@ `Word "{" in + let node = Loc.same as_text @@ `Paragraph [ as_text ] in + Writer.with_warning node warning } (* SECTION HEADING *) let section_heading := - | content = located(Section_heading); children = sequence_nonempty(inline_element); endpos = located(RIGHT_BRACE); - { - let span = Loc.delimited content endpos in - let (num, title) = content.Loc.value in - Writer.map (fun c -> Loc.at span @@ `Heading (num, title, trim_start c)) children - } - | content = located(Section_heading); endpos = located(RIGHT_BRACE); - { - let span = Loc.delimited content endpos - and (num, title) = content.Loc.value in - let should_not_be_empty = - let what = Tokens.describe @@ Section_heading (num, title) in - Writer.Warning (Parse_error.should_not_be_empty ~what span) - in - let node = Loc.at span @@ `Heading (num, title, []) in - Writer.with_warning node should_not_be_empty - } - | content = with_position(Section_heading); end_pos = position(error); - { - let (num, title), start_pos = content in - let span = Loc.span @@ List.map Loc.of_position [start_pos; end_pos] in - let start_pos = fst start_pos and end_pos = snd end_pos in - let illegal = Writer.InputNeeded (fun input -> - let err = Loc.extract ~input ~start_pos ~end_pos in - let in_what = Tokens.describe @@ Section_heading (num, title) in - Parse_error.illegal ~in_what err span) - in - let inner = Loc.at span @@ `Heading (num, title, []) in - Writer.with_warning inner illegal - } + | content = located(Section_heading); children = sequence_nonempty(inline_element); endpos = located(RIGHT_BRACE); { + let span = Loc.delimited content endpos in + let (num, title) = content.Loc.value in + Writer.map (fun c -> Loc.at span @@ `Heading (num, title, trim_start c)) children + } + | content = located(Section_heading); endpos = located(RIGHT_BRACE); { + let span = Loc.delimited content endpos + and (num, title) = content.Loc.value in + let should_not_be_empty = + let what = Tokens.describe @@ Section_heading (num, title) in + Writer.Warning (Parse_error.should_not_be_empty ~what span) + in + let node = Loc.at span @@ `Heading (num, title, []) in + Writer.with_warning node should_not_be_empty + } + | content = with_position(Section_heading); end_pos = position(error); { + let (num, title), start_pos = content in + let span = Loc.span @@ List.map Loc.of_position [start_pos; end_pos] in + let start_pos = fst start_pos and end_pos = snd end_pos in + let illegal = Writer.InputNeeded (fun input -> + let err = Loc.extract ~input ~start_pos ~end_pos in + let in_what = Tokens.describe @@ Section_heading (num, title) in + Parse_error.illegal ~in_what err span) + in + let inner = Loc.at span @@ `Heading (num, title, []) in + Writer.with_warning inner illegal + } (* TAGS *) @@ -536,7 +504,7 @@ let inline_elt_legal_whitespace := | ~ = Single_newline; <`Space> let style := - | style = located(Style); children = sequence(inline_element); endpos = located( RIGHT_BRACE ); { + | style = located(Style); children = sequence(inline_element); endpos = located(RIGHT_BRACE); { let span = Loc.delimited style endpos in let style = style.Loc.value in let warning = @@ -673,7 +641,7 @@ let link := (* LIST *) let list_light_item_unordered := - | MINUS; ~ = nestable_block_element; <> + | MINUS; horizontal_whitespace; ~ = nestable_block_element; <> | horizontal_whitespace; MINUS; item = nestable_block_element; { let warning = @@ -684,24 +652,23 @@ let list_light_item_unordered := } let list_light_item_ordered := - | PLUS; ~ = nestable_block_element; <> - | horizontal_whitespace; PLUS; item = nestable_block_element; - { - let warning = - let span = Loc.of_position $sloc in - Writer.Warning (Parse_error.should_begin_on_its_own_line ~what:(Tokens.describe MINUS) span) - in - Writer.warning warning item - } + | PLUS; horizontal_whitespace; ~ = nestable_block_element; <> + | horizontal_whitespace; PLUS; item = nestable_block_element; { + let warning = + let span = Loc.of_position $sloc in + Writer.Warning (Parse_error.should_begin_on_its_own_line ~what:(Tokens.describe MINUS) span) + in + Writer.warning warning item + } let list_light := - | children = separated_nonempty_sequence(Single_newline, list_light_item_ordered); { + | children = separated_nonempty_sequence(Single_newline, list_light_item_unordered); { let* children = children in let span = Loc.span @@ List.map Loc.location children in let inner = Loc.at span @@ `List (`Ordered, `Light, [ children ]) in return inner } - | children = separated_nonempty_sequence(Single_newline, list_light_item_unordered); { + | children = separated_nonempty_sequence(Single_newline, list_light_item_ordered); { let* children = children in let span = Loc.span @@ List.map Loc.location children in let inner = Loc.at span @@ `List (`Unordered, `Light, [ children ]) in @@ -789,7 +756,7 @@ let list_heavy := Writer.with_warning inner (Writer.InputNeeded warning) } -let list_element := +let odoc_list := | ~ = list_light; <> | ~ = list_heavy; <> @@ -937,7 +904,7 @@ let nestable_block_element := ~ = nestable_block_element_inner; any_whitespace?; let nestable_block_element_inner := | ~ = verbatim; <> | ~ = code_block; <> - | ~ = list_element; <> + | ~ = odoc_list; <> | ~ = table; <> | ~ = media; <> | ~ = math_block; <> diff --git a/src/parser/test_driver/tester.ml b/src/parser/test_driver/tester.ml index fa59cd3d58..98c932b1af 100644 --- a/src/parser/test_driver/tester.ml +++ b/src/parser/test_driver/tester.ml @@ -112,7 +112,11 @@ let bad_markup = ("cr", ""); ] -let isolated = [ ("label", "{2:\xce\xbb Bar}") ] +let isolated = + [ + ("Heavy list", "+ foo bar baz") + (* ("Multiple right brace", "Foo } Bar } Baz") *); + ] (* Cases (mostly) taken from the 'odoc for library authors' document *) let documentation_cases = From a91f1964d42b501f7cf53a0b4030ef2d2a315300 Mon Sep 17 00:00:00 2001 From: faycarsons Date: Wed, 1 Jan 2025 14:17:02 -0500 Subject: [PATCH 111/150] fix '@see' tag --- src/parser/parser.mly | 30 ++++++++++++++---------------- 1 file changed, 14 insertions(+), 16 deletions(-) diff --git a/src/parser/parser.mly b/src/parser/parser.mly index bd3f784445..ae8450bbda 100644 --- a/src/parser/parser.mly +++ b/src/parser/parser.mly @@ -406,14 +406,13 @@ let raise := } let see := - | content = located(See); children = located(sequence_nonempty(nestable_block_element)); - { - let span = Loc.delimited content children in - let (kind, href) = content.Loc.value in - let* children = children.Loc.value in - let inner = Loc.at span @@ `See (Tokens.to_ast_ref kind, href, children) in - return inner - } + | content = located(See); horizontal_whitespace; children = located(sequence_nonempty(nestable_block_element)); { + let span = Loc.delimited content children in + let (kind, href) = content.Loc.value in + let* children = children.Loc.value in + let inner = Loc.at span @@ `See (Tokens.to_ast_ref kind, href, children) in + return inner + } | content = located(See); { let (kind, href) = content.Loc.value in return { content with value = `See (Tokens.to_ast_ref kind, href, []) } @@ -642,14 +641,13 @@ let link := let list_light_item_unordered := | MINUS; horizontal_whitespace; ~ = nestable_block_element; <> - | horizontal_whitespace; MINUS; item = nestable_block_element; - { - let warning = - let span = Loc.of_position $sloc in - Writer.Warning (Parse_error.should_begin_on_its_own_line ~what:(Tokens.describe MINUS) span) - in - Writer.warning warning item - } + | horizontal_whitespace; MINUS; item = nestable_block_element; { + let warning = + let span = Loc.of_position $sloc in + Writer.Warning (Parse_error.should_begin_on_its_own_line ~what:(Tokens.describe MINUS) span) + in + Writer.warning warning item + } let list_light_item_ordered := | PLUS; horizontal_whitespace; ~ = nestable_block_element; <> From 7908e365d41c2cc3cd17fbb25f6d3ee30655bf37 Mon Sep 17 00:00:00 2001 From: faycarsons Date: Thu, 2 Jan 2025 16:49:46 -0500 Subject: [PATCH 112/150] format, add stray tag handling --- src/parser/parser.mly | 89 ++++++++++++++++++++++++++++--------------- 1 file changed, 58 insertions(+), 31 deletions(-) diff --git a/src/parser/parser.mly b/src/parser/parser.mly index ae8450bbda..060347fff3 100644 --- a/src/parser/parser.mly +++ b/src/parser/parser.mly @@ -300,9 +300,10 @@ let toplevel := | block = nestable_block_element; { (block :> Ast.block_element Loc.with_location Writer.t) } | t = tag; { Writer.map (fun loc -> Loc.{ loc with value = `Tag loc.value }) t } | ~ = section_heading; <> + | ~ = toplevel_error; <> let toplevel_error := - | errloc = position(RIGHT_BRACE); { + | errloc = position(RIGHT_BRACE); whitespace?; { let span = Loc.of_position errloc in let warning = let what = Tokens.describe RIGHT_BRACE in @@ -322,6 +323,32 @@ let toplevel_error := let node = Loc.same as_text @@ `Paragraph [ as_text ] in Writer.with_warning node warning } + | list = list_light; { + let* list = list in + let warning = + let Loc.{ value; location } = list in + let what = Tokens.describe @@ + match value with + | `List (`Ordered, `Light, _) -> Tokens.PLUS + | `List (`Unordered, `Light, _) -> Tokens.MINUS + | _ -> assert false (* Unreachable *) + in + Parse_error.should_begin_on_its_own_line ~what location + in + Writer.with_warning (list :> Ast.block_element Loc.with_location) (Writer.Warning warning) + } + | illegal_elt = tag; { + Writer.bind + illegal_elt + (fun illegal_elt -> + let should_begin_on_its_own_line = Writer.Warning ( + let what = Tokens.describe_tag illegal_elt.Loc.value in + Parse_error.should_begin_on_its_own_line ~what illegal_elt.Loc.location) + in + let illegal_elt = Loc.map (fun t -> `Tag t) illegal_elt in + let inner = (illegal_elt :> Ast.block_element Loc.with_location) in + Writer.with_warning inner should_begin_on_its_own_line) + } (* SECTION HEADING *) @@ -356,7 +383,7 @@ let section_heading := (* TAGS *) -let tag == +let tag := | with_content = tag_with_content; Single_newline?; { with_content } | bare = tag_bare; Single_newline?; { bare } @@ -373,17 +400,20 @@ let tag_with_content := in Writer.with_warning ({ pos with Loc.value = `Deprecated [] }) warning } - | startpos = located(RETURN); children = located(sequence_nonempty(nestable_block_element)); { + | startpos = located(RETURN); horizontal_whitespace; children = located(sequence_nonempty(nestable_block_element)); { let span = Loc.delimited startpos children in Writer.map (fun c -> Loc.at span @@ `Return c) children.Loc.value } + | pos = located(RETURN); horizontal_whitespace?; { + return (Loc.same pos @@ `Return []) + } | ~ = before; <> | ~ = raise; <> | ~ = see; <> | ~ = param; <> let before := - | content = located(Before); children = sequence_nonempty(nestable_block_element); { + | content = located(Before); horizontal_whitespace; children = sequence_nonempty(nestable_block_element); { let* children = children in let span = Loc.span @@ content.Loc.location :: List.map Loc.location children in let inner = Loc.at span @@ `Before (content.Loc.value, children) in @@ -394,14 +424,14 @@ let before := } let raise := - | exn = located(Raise); children = located(sequence_nonempty(nestable_block_element)); { + | exn = located(Raise); horizontal_whitespace; children = located(sequence_nonempty(nestable_block_element)); { let span = Loc.delimited exn children in let exn = exn.Loc.value in let* children = children.Loc.value in let inner = Loc.at span @@ `Raise (exn, children) in return inner } - | exn = located(Raise); { + | exn = located(Raise); horizontal_whitespace?; { return { exn with Loc.value = `Raise (exn.Loc.value, []) } } @@ -413,24 +443,25 @@ let see := let inner = Loc.at span @@ `See (Tokens.to_ast_ref kind, href, children) in return inner } - | content = located(See); { + | content = located(See); horizontal_whitespace?; { let (kind, href) = content.Loc.value in return { content with value = `See (Tokens.to_ast_ref kind, href, []) } } let param := - | content = located(Param); children = sequence_nonempty(nestable_block_element); { + | content = located(Param); horizontal_whitespace; children = sequence_nonempty(nestable_block_element); { let* children = children in let span = Loc.span @@ content.Loc.location :: List.map Loc.location children in let ident = content.Loc.value in let inner = Loc.at span @@ `Param (ident, children) in return inner } - | content = located(Param); - { return { content with Loc.value = `Param (content.Loc.value, []) }} + | content = located(Param); horizontal_whitespace?; { + return { content with Loc.value = `Param (content.Loc.value, []) } + } let tag_bare := - | content = located(Version); + | content = located(Version); horizontal_whitespace?; { let Loc.{ value = version; location } = content in let what = Tokens.describe (Version version) in @@ -470,10 +501,10 @@ let tag_bare := Writer.ensure has_content warning @@ return value |> Writer.map (fun a -> { content with value = `Author a }) } - | pos = position(OPEN); { let loc = Loc.of_position pos in return @@ Loc.at loc `Open } - | pos = position(INLINE); { let loc = Loc.of_position pos in return @@ Loc.at loc `Inline } - | pos = position(CLOSED); { let loc = Loc.of_position pos in return @@ Loc.at loc `Closed } - | pos = position(HIDDEN); { let loc = Loc.of_position pos in return @@ Loc.at loc `Hidden } + | pos = position(OPEN); whitespace?; { let loc = Loc.of_position pos in return @@ Loc.at loc `Open } + | pos = position(INLINE); whitespace?; { let loc = Loc.of_position pos in return @@ Loc.at loc `Inline } + | pos = position(CLOSED); whitespace?; { let loc = Loc.of_position pos in return @@ Loc.at loc `Closed } + | pos = position(HIDDEN); whitespace?; { let loc = Loc.of_position pos in return @@ Loc.at loc `Hidden } (* INLINE ELEMENTS *) @@ -654,7 +685,7 @@ let list_light_item_ordered := | horizontal_whitespace; PLUS; item = nestable_block_element; { let warning = let span = Loc.of_position $sloc in - Writer.Warning (Parse_error.should_begin_on_its_own_line ~what:(Tokens.describe MINUS) span) + Writer.Warning (Parse_error.should_begin_on_its_own_line ~what:(Tokens.describe PLUS) span) in Writer.warning warning item } @@ -663,13 +694,13 @@ let list_light := | children = separated_nonempty_sequence(Single_newline, list_light_item_unordered); { let* children = children in let span = Loc.span @@ List.map Loc.location children in - let inner = Loc.at span @@ `List (`Ordered, `Light, [ children ]) in + let inner = Loc.at span @@ `List (`Unordered, `Light, [ children ]) in return inner } | children = separated_nonempty_sequence(Single_newline, list_light_item_ordered); { let* children = children in let span = Loc.span @@ List.map Loc.location children in - let inner = Loc.at span @@ `List (`Unordered, `Light, [ children ]) in + let inner = Loc.at span @@ `List (`Ordered, `Light, [ children ]) in return inner } @@ -897,9 +928,7 @@ let media := (* TOP-LEVEL ELEMENTS *) -let nestable_block_element := ~ = nestable_block_element_inner; any_whitespace?; <> - -let nestable_block_element_inner := +let nestable_block_element := | ~ = verbatim; <> | ~ = code_block; <> | ~ = odoc_list; <> @@ -910,16 +939,14 @@ let nestable_block_element_inner := | ~ = modules; <> | ~ = paragraph_style; <> -let paragraph_style := - | style = Paragraph_style; ws = paragraph; RIGHT_BRACE; - { - let warning = - let span = Loc.of_position $sloc in - let what = Tokens.describe @@ Paragraph_style style in - Writer.Warning (Parse_error.markup_should_not_be_used span ~what) - in - Writer.warning warning ws - } +let paragraph_style := | style = Paragraph_style; ws = paragraph; RIGHT_BRACE; { + let warning = + let span = Loc.of_position $sloc in + let what = Tokens.describe @@ Paragraph_style style in + Writer.Warning (Parse_error.markup_should_not_be_used span ~what) + in + Writer.warning warning ws + } let verbatim := verbatim = located(Verbatim); { let Loc.{ value; location } = verbatim in From 66bb9ac39bca0dceb3088c4fd62480a06a4d9149 Mon Sep 17 00:00:00 2001 From: faycarsons Date: Mon, 6 Jan 2025 15:52:48 -0500 Subject: [PATCH 113/150] Split `Code_block` into regular and `Code_block_with_output` --- src/parser/dune | 2 +- src/parser/lexer.mll | 170 +++++++++++++++++++++++++++--------------- src/parser/parser.mly | 22 ++++-- src/parser/tokens.ml | 40 +++++----- 4 files changed, 145 insertions(+), 89 deletions(-) diff --git a/src/parser/dune b/src/parser/dune index 69d306aed1..902a6a576b 100644 --- a/src/parser/dune +++ b/src/parser/dune @@ -2,7 +2,7 @@ (menhir (modules parser) - (flags --table --external-tokens Tokens --explain)) + (flags --table --external-tokens Tokens --explain )) (library (name odoc_parser) diff --git a/src/parser/lexer.mll b/src/parser/lexer.mll index 003d0d957f..0e1fafd78a 100644 --- a/src/parser/lexer.mll +++ b/src/parser/lexer.mll @@ -160,10 +160,6 @@ let math_constr input kind x start_offset = | Inline -> Math_span Tokens.{ start = start_pos; content = x } | Block -> Math_block Tokens.{ start = start_pos; content = x } -let with_loc : Lexing.lexbuf -> input -> Loc.span -> ('a -> 'a Loc.with_location) = - fun _lexbuf _input location -> - Loc.at location - let with_location_adjustments : (Lexing.lexbuf -> input -> Loc.span -> 'a) -> Lexing.lexbuf @@ -271,26 +267,33 @@ let emit_verbatim lexbuf input start_offset buffer = Note that the location reflects the content _without_ stripping of whitespace, whereas the value of the content in the tree has whitespace stripped from the beginning, and trailing empty lines removed. *) -let emit_code_block ~content_offset ~lexbuf input meta delimiter terminator buffer output = +let emit_code_block lexbuf input ~content_offset ~metadata ~delimiter ~terminator ~content has_output = + let content = Buffer.contents content |> trim_trailing_blank_lines in let content_location = input.offset_to_location content_offset in + let content = + with_location_adjustments + (fun _ _location _ c -> + let first_line_offset = content_location.column in + trim_leading_whitespace ~first_line_offset c) + lexbuf + input + content + in + let content = trim_leading_blank_lines content in let content = - Buffer.contents buffer - |> trim_trailing_blank_lines - |> with_location_adjustments - (fun _ _ _ -> - let first_line_offset = content_location.column in - trim_leading_whitespace ~first_line_offset) - lexbuf - input - |> trim_leading_blank_lines - |> with_location_adjustments - ~adjust_end_by:terminator - ~start_offset:content_offset - with_loc - lexbuf - input + with_location_adjustments + ~adjust_end_by:terminator + ~start_offset:content_offset + (fun _ _ -> Loc.at) + lexbuf + input + content in - Code_block { meta; delimiter; content; output } + let inner = { metadata; delimiter; content } in + if has_output then + Code_block_with_output inner + else + Code_block inner let heading_level lexbuf input level = if String.length level >= 2 && level.[0] = '0' then begin @@ -492,42 +495,43 @@ and token input = parse } | "{[" - { code_block (Lexing.lexeme_start lexbuf) (Lexing.lexeme_end lexbuf) None (Buffer.create 256) "" input lexbuf } + { code_block false (Lexing.lexeme_start lexbuf) (Lexing.lexeme_end lexbuf) None (Buffer.create 256) "" input lexbuf } - | (("{" (delim_char* as delimiter) "@" horizontal_space*) as prefix) (language_tag_char+ as lang_tag_) + | (("{" (delim_char* as delimiter) "@" horizontal_space*) as prefix) (language_tag_char+ as language_tag) { let start_offset = Lexing.lexeme_start lexbuf in - let lang_tag = - with_location_adjustments - ~adjust_start_by:prefix - with_loc - lexbuf - input - lang_tag_ + let language_tag = + with_location_adjustments ~adjust_start_by:prefix (fun _ _ -> Loc.at) lexbuf input language_tag in let emit_truncated_code_block () = - let empty_content = with_location_adjustments with_loc lexbuf input "" in - Code_block { meta = Some { language = lang_tag; tags = None }; delimiter = Some delimiter; content = empty_content; output = None} + let empty_content = with_location_adjustments (fun _ _ -> Loc.at) lexbuf input "" in + Code_block { metadata = Some { language_tag; tags = None }; delimiter = Some delimiter; content = empty_content } + in + (* Disallow result block sections for code blocks without a delimiter. + This avoids the surprising parsing of '][' ending the code block. *) + let allow_result_block = delimiter <> "" in + let code_block_with_metadata metadata = + let content_offset = Lexing.lexeme_end lexbuf in + let metadata = Some { language_tag; tags = metadata } in + let prefix = Buffer.create 256 in + code_block allow_result_block start_offset content_offset metadata + prefix delimiter input lexbuf in match code_block_metadata_tail input lexbuf with - | Ok metadata -> code_block start_offset (Lexing.lexeme_end lexbuf) (Some metadata) (Buffer.create 256) delimiter input lexbuf + | Ok metadata -> code_block_with_metadata metadata | Error `Eof -> warning lexbuf input ~start_offset Parse_error.truncated_code_block_meta; emit_truncated_code_block () | Error (`Invalid_char c) -> - let language_tag_invalid_char = - Parse_error.language_tag_invalid_char lang_tag_ c - in - warning lexbuf input ~start_offset language_tag_invalid_char; - - (* NOTE : (@faycarsons) Metadata should not be `None` *) - code_block start_offset (Lexing.lexeme_end lexbuf) None (Buffer.create 256) delimiter input lexbuf + warning lexbuf input ~start_offset + (Parse_error.language_tag_invalid_char language_tag.Loc.value c); + code_block_with_metadata None } | "{@" horizontal_space* '[' { warning lexbuf input Parse_error.no_language_tag_in_meta; - code_block (Lexing.lexeme_start lexbuf) (Lexing.lexeme_end lexbuf) None (Buffer.create 256) "" input lexbuf + code_block false (Lexing.lexeme_start lexbuf) (Lexing.lexeme_end lexbuf) None (Buffer.create 256) "" input lexbuf } | "{v" @@ -872,13 +876,7 @@ and code_block_metadata_tail input = parse ((space_char* '[') as suffix) { let meta = - with_location_adjustments - ~adjust_start_by:prefix - ~adjust_end_by:suffix - with_loc - lexbuf - input - meta + with_location_adjustments ~adjust_start_by:prefix ~adjust_end_by:suffix (fun _ _ -> Loc.at) lexbuf input meta in Ok (Some meta) } @@ -889,30 +887,78 @@ and code_block_metadata_tail input = parse | eof { Error `Eof } -(* NOTE : (@faycarsons) This is currently broken!! *) -and code_block start_offset content_offset metadata prefix delim input = parse +and code_block allow_result_block start_offset content_offset metadata buffer delimiter input = parse | ("]" (delim_char* as delim') "[") as terminator - { if delim = delim' then - emit_code_block ~content_offset ~lexbuf input None (Some delim) terminator prefix None - else - (Buffer.add_string prefix terminator; - code_block start_offset content_offset metadata prefix delim input lexbuf) } + { if delimiter = delim' && allow_result_block then + emit_code_block + lexbuf + input + ~content_offset + ~metadata + ~delimiter:(Some delimiter) + ~terminator + ~content:buffer + true + else ( + Buffer.add_string buffer terminator; + code_block + allow_result_block + start_offset + content_offset metadata + buffer + delimiter + input + lexbuf + ) + } | ("]" (delim_char* as delim') "}") as terminator { - if delim = delim' then - emit_code_block ~content_offset ~lexbuf input None (Some delim ) terminator prefix None + if delimiter = delim' then + emit_code_block + lexbuf + input + ~content_offset + ~metadata + ~delimiter:(Some delimiter) + ~terminator + ~content:buffer + false else ( - Buffer.add_string prefix terminator; - code_block start_offset content_offset metadata prefix delim input lexbuf + Buffer.add_string buffer terminator; + code_block + allow_result_block + start_offset + content_offset + metadata + buffer + delimiter + input + lexbuf ) } | eof { warning lexbuf input ~start_offset Parse_error.truncated_code_block; - emit_code_block ~content_offset ~lexbuf input None (Some delim ) "" prefix None + emit_code_block + lexbuf + input + ~content_offset + ~metadata + ~delimiter:(Some delimiter) + ~terminator:"" + ~content:buffer + false } | (_ as c) { - Buffer.add_char prefix c; - code_block start_offset content_offset metadata prefix delim input lexbuf + Buffer.add_char buffer c; + code_block + allow_result_block + start_offset + content_offset + metadata + buffer + delimiter + input + lexbuf } diff --git a/src/parser/parser.mly b/src/parser/parser.mly index 060347fff3..8659078e4e 100644 --- a/src/parser/parser.mly +++ b/src/parser/parser.mly @@ -180,7 +180,6 @@ let legal_module_list : Ast.inline_element Loc.with_location list -> bool = | `Word _ | `Space _ -> true | _ -> false) @@ List.map Loc.value xs - %} %token RIGHT_BRACE "{" @@ -207,7 +206,8 @@ let legal_module_list : Ast.inline_element Loc.with_location list -> bool = %token Raw_markup "{%%}" -%token Code_block "{[]}" +%token Code_block "{[]}" (* Self-contained code-block *) +%token Code_block_with_output "{[][" (* Code block that expects some block elements *) %token Code_span "[]" %token List "{ol" (* or '{ul' *) @@ -965,10 +965,20 @@ let paragraph := items = sequence_nonempty(inline_element); { Writer.map paragraph items } -let code_block := c = Code_block; { - let span = Loc.of_position $sloc in - return (Loc.at span @@ `Code_block c) -} +let code_block := + | content = located(Code_block); { + return @@ Loc.map (fun Tokens.{metadata; delimiter; content} -> + let meta = Option.map (fun Tokens.{language_tag; tags} -> Ast.{ language = language_tag; tags }) metadata in + `Code_block Ast.{ meta; delimiter; content; output = None } + ) content + } + | content = located(Code_block_with_output); output = sequence_nonempty(nestable_block_element); RIGHT_CODE_DELIMITER; { + let* output = output in + return @@ Loc.map (fun Tokens.{metadata; delimiter; content} -> + let meta = Option.map (fun Tokens.{language_tag; tags} -> Ast.{ language = language_tag; tags }) metadata in + `Code_block Ast.{ meta; delimiter; content; output = Some output } + ) content + } let math_block := inner = located(Math_block); { let Loc.{ value ; location } = inner in diff --git a/src/parser/tokens.ml b/src/parser/tokens.ml index 7525258a52..ec3ab8312b 100644 --- a/src/parser/tokens.ml +++ b/src/parser/tokens.ml @@ -17,6 +17,16 @@ let ast_list_kind : list_kind -> Ast.list_kind = function | Ordered -> `Ordered | Unordered -> `Unordered +type code_block = { + metadata : meta option; + delimiter : string option; + content : string Loc.with_location; +} +and meta = { + language_tag : string Loc.with_location; + tags : string Loc.with_location option; +} + type token = | Space of string | Single_newline of string @@ -32,7 +42,8 @@ type token = | Math_span of math | Math_block of math | Code_span of string - | Code_block of Ast.code_block + | Code_block of code_block + | Code_block_with_output of code_block | Word of string | Verbatim of string | RIGHT_CODE_DELIMITER @@ -95,7 +106,7 @@ let print : token -> string = function | Math_span _ -> "{m" | Math_block _ -> "{math" | Code_span _ -> "[" - | Code_block _ -> "{[" + | Code_block _ | Code_block_with_output _ -> "{[" | Word w -> w | Verbatim _ -> "{v" | RIGHT_CODE_DELIMITER -> "]}" @@ -174,7 +185,7 @@ let describe : token -> string = function | Blank_line _ -> "blank line" | RIGHT_BRACE -> "'}'" | RIGHT_CODE_DELIMITER -> "']}'" - | Code_block _ -> "'{[...]}' (code block)" + | Code_block _ | Code_block_with_output _ -> "'{[...]}' (code block)" | Verbatim _ -> "'{v ... v}' (verbatim text)" | MODULES -> "'{!modules ...}'" | List Unordered -> "'{ul ...}' (bulleted list)" @@ -207,23 +218,12 @@ let describe : token -> string = function | HIDDEN -> "'@hidden" let empty_code_block = - Ast. - { - meta = None; - delimiter = None; - content = - Loc. - { - value = ""; - location = - { - file = ""; - start = { line = 0; column = 0 }; - end_ = { line = 0; column = 0 }; - }; - }; - output = None; - } + { + metadata = None; + delimiter = None; + content = + Loc.at Loc.{ start = Loc.dummy_pos; end_ = Loc.dummy_pos; file = "" } ""; + } let of_ast_style : Ast.style -> style = function | `Bold -> Bold From 80a7bd3c2e51ba7efed8e9853c52a22ebac403c6 Mon Sep 17 00:00:00 2001 From: faycarsons Date: Mon, 6 Jan 2025 17:03:14 -0500 Subject: [PATCH 114/150] cover remaining error cases --- src/parser/parser.mly | 50 ++++++++++++++++++++++++++----------------- 1 file changed, 30 insertions(+), 20 deletions(-) diff --git a/src/parser/parser.mly b/src/parser/parser.mly index 8659078e4e..9a0ede029e 100644 --- a/src/parser/parser.mly +++ b/src/parser/parser.mly @@ -245,8 +245,7 @@ let legal_module_list : Ast.inline_element Loc.with_location list -> bool = %token Simple_link "{:" %token Link_with_replacement "{{:" %token Media "{(format)!" -(* where 'format' is audio, video, image *) -%token Media_with_replacement "{(format):" +%token Media_with_replacement "{(format):" (* where 'format' is audio, video, image *) %token Verbatim "{v" %token END @@ -290,11 +289,15 @@ let any_whitespace := | ~ = whitespace; <> | ~ = Blank_line; <`Space> +let line_break := + | ~ = Single_newline; <> + | ~ = Blank_line; <> + (* ENTRY *) let main := | ~ = sequence_nonempty(toplevel); END; <> - | END; { return [] } + | any_whitespace?; END; { return [] } let toplevel := | block = nestable_block_element; { (block :> Ast.block_element Loc.with_location Writer.t) } @@ -384,33 +387,39 @@ let section_heading := (* TAGS *) let tag := - | with_content = tag_with_content; Single_newline?; { with_content } - | bare = tag_bare; Single_newline?; { bare } + | with_content = tag_with_content; line_break?; { with_content } + | bare = tag_bare; line_break?; { bare } let tag_with_content := - | startpos = located(DEPRECATED); children = located(sequence_nonempty(nestable_block_element)); { + | ~ = before; <> + | ~ = raise; <> + | ~ = see; <> + | ~ = param; <> + | ~ = deprecated; <> + | ~ = return; <> + +let return := + | startpos = located(RETURN); horizontal_whitespace; children = located(sequence_nonempty(nestable_block_element)); { + let span = Loc.delimited startpos children in + Writer.map (fun c -> Loc.at span @@ `Return c) children.Loc.value + } + | pos = located(RETURN); horizontal_whitespace?; { + return (Loc.same pos @@ `Return []) + } + +let deprecated := + | startpos = located(DEPRECATED); horizontal_whitespace; children = located(sequence_nonempty(nestable_block_element)); { let span = Loc.delimited startpos children in Writer.map (fun c -> Loc.at span @@ `Deprecated c) children.Loc.value } - | pos = located(DEPRECATED); + | pos = located(DEPRECATED); horizontal_whitespace?; { return @@ { pos with Loc.value = `Deprecated [] } } - | pos = located(DEPRECATED); errloc = located(RIGHT_BRACE); { + | pos = located(DEPRECATED); horizontal_whitespace?; errloc = located(RIGHT_BRACE); { let warning = Writer.Warning (Parse_error.unpaired_right_brace @@ errloc.Loc.location) in Writer.with_warning ({ pos with Loc.value = `Deprecated [] }) warning } - | startpos = located(RETURN); horizontal_whitespace; children = located(sequence_nonempty(nestable_block_element)); { - let span = Loc.delimited startpos children in - Writer.map (fun c -> Loc.at span @@ `Return c) children.Loc.value - } - | pos = located(RETURN); horizontal_whitespace?; { - return (Loc.same pos @@ `Return []) - } - | ~ = before; <> - | ~ = raise; <> - | ~ = see; <> - | ~ = param; <> let before := | content = located(Before); horizontal_whitespace; children = sequence_nonempty(nestable_block_element); { @@ -928,7 +937,8 @@ let media := (* TOP-LEVEL ELEMENTS *) -let nestable_block_element := +let nestable_block_element := ~ = nestable_block_element_inner; any_whitespace?; <> +let nestable_block_element_inner := | ~ = verbatim; <> | ~ = code_block; <> | ~ = odoc_list; <> From 77be167f8d59952c2746da1bbb04f8e5e8a5a4a4 Mon Sep 17 00:00:00 2001 From: faycarsons Date: Mon, 6 Jan 2025 17:05:24 -0500 Subject: [PATCH 115/150] fix leading whitespace error --- src/parser/parser.mly | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/parser/parser.mly b/src/parser/parser.mly index 9a0ede029e..25cbd2a6de 100644 --- a/src/parser/parser.mly +++ b/src/parser/parser.mly @@ -296,7 +296,7 @@ let line_break := (* ENTRY *) let main := - | ~ = sequence_nonempty(toplevel); END; <> + | any_whitespace?; ~ = sequence_nonempty(toplevel); END; <> | any_whitespace?; END; { return [] } let toplevel := From 3d7d8b7465ec93ecf8cff1d4afd4218b380494f9 Mon Sep 17 00:00:00 2001 From: faycarsons Date: Mon, 6 Jan 2025 17:07:36 -0500 Subject: [PATCH 116/150] update tester --- src/parser/test_driver/tester.ml | 77 +++++++++++++++++++++++++++++--- 1 file changed, 72 insertions(+), 5 deletions(-) diff --git a/src/parser/test_driver/tester.ml b/src/parser/test_driver/tester.ml index 98c932b1af..389aec27ee 100644 --- a/src/parser/test_driver/tester.ml +++ b/src/parser/test_driver/tester.ml @@ -53,6 +53,70 @@ let open_t = ("with_list", "@open - foo"); ] +let tags = + let raise_tests = + [ + ("raise basic", "@raise Foo"); + ("raise bare", "@raise"); + ("raise words", "@raise foo bar baz"); + ("raise prefix", "@raisefoo"); + ] + and return_tests = + [ + ("return basic", "@return"); + ("return words", "@return foo bar"); + ("return prefix", "@returnfoo"); + ] + and before_tests = + [ + ("before basic", "@before Foo"); + ("before bare", "@before"); + ("before words", "@before foo bar baz"); + ("before prefix", "@beforefoo"); + ] + and param_tests = + [ + ("param basic", "@param foo"); + ("param bare", "@param"); + ("param bare_with_whitespace", "@param"); + ("param immediate_newline", "@param\nfoo"); + ("param followed_by_whitespace", "@param foo"); + ("param extra_whitespace", "@param foo"); + ("param words", "@param foo bar baz"); + ("param multiline", "@param foo\nbar\nbaz"); + ("param paragraphs", "@param foo bar\n\nbaz"); + ("param two", "@param foo\n@param bar"); + ("param nested", "@param foo @param bar"); + ("param preceded_by_paragraph", "foo\n@param bar"); + ("param prefix", "@paramfoo"); + ("param after_code_block", "{[foo]} @param foo"); + ] + and deprecated_tests = + [ + ("basic", "@deprecated"); + ("words", "@deprecated foo bar"); + ("multiline", "@deprecated foo\nbar"); + ("paragraphs", "@deprecated foo\n\nbar"); + ("whitespace_only", "@deprecated"); + ("immediate_newline", "@deprecated\nfoo"); + ("immediate_cr_lf", "@deprecated\r\nfoo"); + ("immediate_blank_line", "@deprecated\n\nfoo"); + ("extra_whitespace", "@deprecated foo"); + ("followed_by_deprecated", "@deprecated foo\n@deprecated bar"); + ("followed_by_deprecated_cr_lf", "@deprecated foo\r\n@deprecated bar"); + ("nested_in_self", "@deprecated foo @deprecated bar"); + ("nested_in_self_at_start", "@deprecated @deprecated foo"); + ("preceded_by_paragraph", "foo\n@deprecated"); + ("preceded_by_shorthand_list", "- foo\n@deprecated"); + ("with_shorthand_list", "@deprecated - foo"); + ("with_shorthand_list_after_newline", "@deprecated\n- foo"); + ("prefix", "@deprecatedfoo"); + ("after_code_block", "{[foo]} @deprecated"); + ("followed_by_section", "@deprecated foo\n{2 Bar}"); + ] + in + raise_tests @ return_tests @ before_tests @ param_tests @ deprecated_tests + let utf8 = [ ("lambda", "\xce\xbb"); @@ -112,11 +176,7 @@ let bad_markup = ("cr", ""); ] -let isolated = - [ - ("Heavy list", "+ foo bar baz") - (* ("Multiple right brace", "Foo } Bar } Baz") *); - ] +let isolated = [ ("empty line", " \n @foo") ] (* Cases (mostly) taken from the 'odoc for library authors' document *) let documentation_cases = @@ -158,6 +218,10 @@ let documentation_cases = ("Code block", "{[\n let foo = 0 \n]}"); ] +let all_tests = + code_cases @ error_recovery @ open_t @ tags @ utf8 @ bad_markup + @ documentation_cases + open Test.Serialize let error err = Atom (Odoc_parser.Warning.to_string err) @@ -259,6 +323,9 @@ let () = | "open" | "o" -> open_t | "utf8" | "u" -> utf8 | "isolated" | "i" -> isolated + | "tags" | "t" -> tags + | "all" -> all_tests + | "deprecated" | "p" -> deprecated_tests | _ -> print_endline "unrecognized argument - running documentation_cases"; documentation_cases) From ae835ad68ff6f5b38f2cac2b2717ca0406b3731b Mon Sep 17 00:00:00 2001 From: faycarsons Date: Mon, 6 Jan 2025 17:18:57 -0500 Subject: [PATCH 117/150] Handle stray dashes, '+', and '-' --- src/parser/parser.mly | 35 ++++++++++++++++++++++++++++++-- src/parser/test_driver/tester.ml | 18 +++++++++++++++- 2 files changed, 50 insertions(+), 3 deletions(-) diff --git a/src/parser/parser.mly b/src/parser/parser.mly index 25cbd2a6de..d28fff747f 100644 --- a/src/parser/parser.mly +++ b/src/parser/parser.mly @@ -153,8 +153,9 @@ | xs -> xs let paragraph : Ast.inline_element Loc.with_location list -> Ast.nestable_block_element Loc.with_location = - fun elts -> - let span = List.map Loc.location (trim_start elts) |> Loc.span in + fun elts -> + let elts = trim_start elts in + let span = Loc.span @@ List.map Loc.location elts in Loc.at span @@ `Paragraph elts let rec inline_element_inner : Ast.inline_element -> string = function @@ -306,6 +307,36 @@ let toplevel := | ~ = toplevel_error; <> let toplevel_error := + | errloc = position(PLUS); whitespace?; { + let span = Loc.of_position errloc in + let warning = + let what = Tokens.describe PLUS in + Writer.Warning (Parse_error.bad_markup what span) + in + let as_text = Loc.at span @@ `Word "{" in + let node = (Loc.same as_text @@ `Paragraph [ as_text ]) in + Writer.with_warning node warning + } + | errloc = position(MINUS); whitespace?; { + let span = Loc.of_position errloc in + let warning = + let what = Tokens.describe MINUS in + Writer.Warning (Parse_error.bad_markup what span) + in + let as_text = Loc.at span @@ `Word "{" in + let node = (Loc.same as_text @@ `Paragraph [ as_text ]) in + Writer.with_warning node warning + } + | errloc = position(BAR); whitespace?; { + let span = Loc.of_position errloc in + let warning = + let what = Tokens.describe BAR in + Writer.Warning (Parse_error.bad_markup what span) + in + let as_text = Loc.at span @@ `Word "{" in + let node = (Loc.same as_text @@ `Paragraph [ as_text ]) in + Writer.with_warning node warning + } | errloc = position(RIGHT_BRACE); whitespace?; { let span = Loc.of_position errloc in let warning = diff --git a/src/parser/test_driver/tester.ml b/src/parser/test_driver/tester.ml index 389aec27ee..99c1b51c07 100644 --- a/src/parser/test_driver/tester.ml +++ b/src/parser/test_driver/tester.ml @@ -176,6 +176,22 @@ let bad_markup = ("cr", ""); ] +let dashes = + [ + ("minus_in_word", "foo-bar"); + ("minus_as_word", "foo -"); + ("plus_in_word", "foo+bar"); + ("plus_as_word", "foo +"); + ("bar_in_word", "foo|bar"); + ("escaped_bar_in_word", "foo\\|bar"); + ("bar_as_word", "foo |"); + ("negative_number", "-3.14 -1337"); + ("n_em_dash", "-- ---"); + ("minus_at", "-@"); + ("at_minus", "-@-"); + ("option", "--option"); + ] + let isolated = [ ("empty line", " \n @foo") ] (* Cases (mostly) taken from the 'odoc for library authors' document *) @@ -325,7 +341,7 @@ let () = | "isolated" | "i" -> isolated | "tags" | "t" -> tags | "all" -> all_tests - | "deprecated" | "p" -> deprecated_tests + | "dashes" | "ds" -> dashes | _ -> print_endline "unrecognized argument - running documentation_cases"; documentation_cases) From 78ec1a41a2ba0fca435c819432895bed1f33772e Mon Sep 17 00:00:00 2001 From: faycarsons Date: Tue, 7 Jan 2025 13:22:24 -0500 Subject: [PATCH 118/150] improve `Paragraph_style` location --- src/parser/parser.mly | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/parser/parser.mly b/src/parser/parser.mly index d28fff747f..2d84a69f2f 100644 --- a/src/parser/parser.mly +++ b/src/parser/parser.mly @@ -980,10 +980,10 @@ let nestable_block_element_inner := | ~ = modules; <> | ~ = paragraph_style; <> -let paragraph_style := | style = Paragraph_style; ws = paragraph; RIGHT_BRACE; { +let paragraph_style := content = located(Paragraph_style); ws = paragraph; endpos = located(RIGHT_BRACE); { + let span = Loc.delimited content endpos in let warning = - let span = Loc.of_position $sloc in - let what = Tokens.describe @@ Paragraph_style style in + let what = Tokens.describe @@ Paragraph_style content.Loc.value in Writer.Warning (Parse_error.markup_should_not_be_used span ~what) in Writer.warning warning ws From 6f75b2fa48e7886d544079d61b6f6fe69d6004e9 Mon Sep 17 00:00:00 2001 From: faycarsons Date: Tue, 7 Jan 2025 13:45:24 -0500 Subject: [PATCH 119/150] Fix paragraph splitting on newlines --- src/parser/parser.mly | 40 ++++++++++++++++++---------------------- 1 file changed, 18 insertions(+), 22 deletions(-) diff --git a/src/parser/parser.mly b/src/parser/parser.mly index 2d84a69f2f..d978783b0e 100644 --- a/src/parser/parser.mly +++ b/src/parser/parser.mly @@ -387,7 +387,7 @@ let toplevel_error := (* SECTION HEADING *) let section_heading := - | content = located(Section_heading); children = sequence_nonempty(inline_element); endpos = located(RIGHT_BRACE); { + | content = located(Section_heading); children = sequence_nonempty(inline_element(whitespace)); endpos = located(RIGHT_BRACE); { let span = Loc.delimited content endpos in let (num, title) = content.Loc.value in Writer.map (fun c -> Loc.at span @@ `Heading (num, title, trim_start c)) children @@ -548,9 +548,9 @@ let tag_bare := (* INLINE ELEMENTS *) -let inline_element := +let inline_element(ws) := | ~ = inline_element_without_whitespace; <> - | s = located(inline_elt_legal_whitespace); { return s } + | s = located(ws); { return s } let inline_element_without_whitespace := (* Single token inline elements which are mostly handled in the lexer *) @@ -569,12 +569,8 @@ let inline_element_without_whitespace := | ~ = reference; <> | ~ = link; <> -let inline_elt_legal_whitespace := - | ~ = Space; <`Space> - | ~ = Single_newline; <`Space> - let style := - | style = located(Style); children = sequence(inline_element); endpos = located(RIGHT_BRACE); { + | style = located(Style); children = sequence(inline_element(whitespace)); endpos = located(RIGHT_BRACE); { let span = Loc.delimited style endpos in let style = style.Loc.value in let warning = @@ -625,14 +621,14 @@ let style := (* LINKS + REFS *) let reference := - | ref_body = located(Simple_ref); children = sequence(inline_element); { + | ref_body = located(Simple_ref); children = sequence(inline_element(whitespace)); { let+ children = children in let startpos = Loc.nudge_start (-2) ref_body.Loc.location in let span = Loc.span @@ startpos :: List.map Loc.location children in let ref_body = Loc.at startpos ref_body.Loc.value in Loc.at span @@ `Reference (`Simple, ref_body, trim_start children) } - | ref_body = located(Ref_with_replacement); children = sequence_nonempty(inline_element); endpos = located(RIGHT_BRACE); { + | ref_body = located(Ref_with_replacement); children = sequence_nonempty(inline_element(whitespace)); endpos = located(RIGHT_BRACE); { let+ children = children in let startpos = Loc.nudge_map_start (-3) ref_body in let ref_body = Loc.same startpos ref_body.Loc.value in @@ -649,7 +645,7 @@ let reference := in Writer.with_warning node warning } - | ref_body = located(Ref_with_replacement); children = sequence_nonempty(inline_element)?; endpos = located(END); { + | ref_body = located(Ref_with_replacement); children = sequence_nonempty(inline_element(whitespace))?; endpos = located(END); { let Loc.{value = ref_body_value; location} = ref_body in let startpos = Loc.nudge_start (-3) location in let ref_body : string Loc.with_location = Loc.at startpos ref_body_value in @@ -679,7 +675,7 @@ let link := else return node } - | link_body = located(Link_with_replacement); children = sequence_nonempty(inline_element); endpos = located(RIGHT_BRACE); + | link_body = located(Link_with_replacement); children = sequence_nonempty(inline_element(whitespace)); endpos = located(RIGHT_BRACE); { let* c = children in let span = Loc.delimited link_body endpos in @@ -981,13 +977,13 @@ let nestable_block_element_inner := | ~ = paragraph_style; <> let paragraph_style := content = located(Paragraph_style); ws = paragraph; endpos = located(RIGHT_BRACE); { - let span = Loc.delimited content endpos in - let warning = - let what = Tokens.describe @@ Paragraph_style content.Loc.value in - Writer.Warning (Parse_error.markup_should_not_be_used span ~what) - in - Writer.warning warning ws - } + let span = Loc.delimited content endpos in + let warning = + let what = Tokens.describe @@ Paragraph_style content.Loc.value in + Writer.Warning (Parse_error.markup_should_not_be_used span ~what) + in + Writer.warning warning ws +} let verbatim := verbatim = located(Verbatim); { let Loc.{ value; location } = verbatim in @@ -1002,7 +998,7 @@ let verbatim := verbatim = located(Verbatim); { |> Writer.map (Fun.const verbatim) } -let paragraph := items = sequence_nonempty(inline_element); { +let paragraph := items = sequence_nonempty(inline_element(horizontal_whitespace)); Single_newline?; { Writer.map paragraph items } @@ -1033,7 +1029,7 @@ let math_block := inner = located(Math_block); { |> Writer.map (fun m -> Loc.at span @@ `Math_block m) } -let modules := startpos = located(MODULES); modules = sequence(inline_element); endpos = located(RIGHT_BRACE); { +let modules := startpos = located(MODULES); modules = sequence(inline_element(whitespace)); endpos = located(RIGHT_BRACE); { let in_what = Tokens.describe MODULES in let* modules = modules in let not_allowed = @@ -1067,7 +1063,7 @@ let modules := startpos = located(MODULES); modules = sequence(inline_element); in return inner } - | startpos = located(MODULES); modules = sequence(inline_element); endpos = located(END); { + | startpos = located(MODULES); modules = sequence(inline_element(whitespace)); endpos = located(END); { let in_what = Tokens.describe MODULES in let* modules = modules in let span = Loc.span @@ List.map Loc.location modules in From 441285c722db84c0ab7ee3c3c045ea88e8c9c377 Mon Sep 17 00:00:00 2001 From: faycarsons Date: Thu, 9 Jan 2025 10:25:50 -0500 Subject: [PATCH 120/150] finish location refactor --- src/parser/lexer.mll | 183 ++++++++++-------- src/parser/loc.ml | 10 +- src/parser/loc.mli | 1 + src/parser/parser.mly | 424 ++++++++++++++++++++++-------------------- src/parser/tokens.ml | 123 ++++++------ 5 files changed, 399 insertions(+), 342 deletions(-) diff --git a/src/parser/lexer.mll b/src/parser/lexer.mll index 0e1fafd78a..592ff31641 100644 --- a/src/parser/lexer.mll +++ b/src/parser/lexer.mll @@ -154,11 +154,11 @@ type input = { type math_kind = Inline | Block -let math_constr input kind x start_offset = +let math_constr input kind inner start_offset = let start_pos = input.offset_to_location start_offset in match kind with - | Inline -> Math_span Tokens.{ start = start_pos; content = x } - | Block -> Math_block Tokens.{ start = start_pos; content = x } + | Inline -> Math_span Tokens.{ start = start_pos; inner } + | Block -> Math_block Tokens.{ start = start_pos; inner } let with_location_adjustments : (Lexing.lexbuf -> input -> Loc.span -> 'a) @@ -203,34 +203,35 @@ let warning = with_location_adjustments @@ fun _lexbuf input location error -> input.warnings <- error location :: input.warnings -let reference_token media start ( target : string ) input lexbuf = - match start with - | "{!" -> Simple_ref target - | "{{!" -> Ref_with_replacement target - | "{:" -> Simple_link (target) - | "{{:" -> Link_with_replacement (target) - - | "{image!" -> Media (Reference target, Image) - | "{image:" -> Media (Link target, Image) - | "{audio!" -> Media (Reference target, Audio) - | "{audio:" -> Media (Link target, Audio) - | "{video!" -> Media (Reference target, Video) - | "{video:" -> Media (Link target, Video) +let reference_token lexbuf input media ~opening_delimiter ~start_offset ~inner = + let start = input.offset_to_location start_offset in + match opening_delimiter with + | "{!" -> Simple_ref { inner; start } + | "{{!" -> Ref_with_replacement { inner; start } + | "{:" -> Simple_link { inner; start } + | "{{:" -> Link_with_replacement { inner; start } + + | "{image!" -> Media { inner = (Reference inner, Image); start } + | "{image:" -> Media { inner = (Link inner, Image); start } + | "{audio!" -> Media { inner = (Reference inner, Audio); start } + | "{audio:" -> Media { inner = (Link inner, Audio); start } + | "{video!" -> Media { inner = (Reference inner, Video); start } + | "{video:" -> Media { inner = (Link inner, Video); start } | _ -> let target, kind = - match start with - | "{{image!" -> Reference target, Image - | "{{image:" -> Link target, Image - | "{{audio!" -> Reference target, Audio - | "{{audio:" -> Link target, Audio - | "{{video!" -> Reference target, Video - | "{{video:" -> Link target, Video + match opening_delimiter with + | "{{image!" -> Reference inner, Image + | "{{image:" -> Link inner, Image + | "{{audio!" -> Reference inner, Audio + | "{{audio:" -> Link inner, Audio + | "{{video!" -> Reference inner, Video + | "{{video:" -> Link inner, Video | _ -> assert false in - let token_descr = Tokens.describe (Media_with_replacement (target, kind, "")) in + let token_descr = Tokens.describe (Media_with_replacement { inner = (target, kind, ""); start }) in let content = media token_descr (Buffer.create 1024) 0 (Lexing.lexeme_start lexbuf) input lexbuf in - Media_with_replacement (target, kind, content) + Media_with_replacement { inner = (target, kind, content); start } let trim_leading_space_or_accept_whitespace lexbuf input start_offset text = match text.[0] with @@ -253,13 +254,14 @@ let trim_trailing_space_or_accept_whitespace text = | exception Invalid_argument _ -> text let emit_verbatim lexbuf input start_offset buffer = - let t = Buffer.contents buffer + let start = input.offset_to_location start_offset in + let inner = Buffer.contents buffer |> trim_trailing_space_or_accept_whitespace |> trim_leading_space_or_accept_whitespace lexbuf input start_offset |> trim_leading_blank_lines |> trim_trailing_blank_lines in - Verbatim t + Verbatim { inner; start } (* The locations have to be treated carefully in this function. We need to ensure that the []`Code_block] location matches the entirety of the block including the terminator, @@ -267,7 +269,7 @@ let emit_verbatim lexbuf input start_offset buffer = Note that the location reflects the content _without_ stripping of whitespace, whereas the value of the content in the tree has whitespace stripped from the beginning, and trailing empty lines removed. *) -let emit_code_block lexbuf input ~content_offset ~metadata ~delimiter ~terminator ~content has_output = +let emit_code_block lexbuf input ~start_offset ~content_offset ~metadata ~delimiter ~terminator ~content has_output = let content = Buffer.contents content |> trim_trailing_blank_lines in let content_location = input.offset_to_location content_offset in let content = @@ -289,11 +291,12 @@ let emit_code_block lexbuf input ~content_offset ~metadata ~delimiter ~terminato input content in - let inner = { metadata; delimiter; content } in + let inner = { metadata; delimiter; content } + and start = input.offset_to_location start_offset in if has_output then - Code_block_with_output inner + Code_block_with_output { inner; start } else - Code_block inner + Code_block { inner; start } let heading_level lexbuf input level = if String.length level >= 2 && level.[0] = '0' then begin @@ -331,7 +334,7 @@ let horizontal_space = let newline = '\n' | "\r\n" -let media_start = +let reference = "{!" | "{{!" | "{:" | "{{:" | "{image!" | "{{image!" | "{image:" | "{{image:" | "{video!" | "{{video!" | "{video:" | "{{video:" @@ -349,8 +352,7 @@ let language_tag_char = let delim_char = ['a'-'z' 'A'-'Z' '0'-'9' '_' ] -rule reference_paren_content input start ref_offset start_offset depth_paren - buffer = +rule reference_paren_content input start ref_offset start_offset depth_paren buffer = parse | '(' { @@ -413,14 +415,14 @@ and token input = parse { END } | ((horizontal_space* newline as prefix) - horizontal_space* ((newline horizontal_space*)+ as suffix) as ws) + horizontal_space* ((newline)+ as suffix) as ws) { (* Account for the first newline we got *) update_content_newlines ~content:("\n" ^ prefix ^ suffix) lexbuf; Blank_line ws } - | (horizontal_space* newline horizontal_space* as ws) + | (horizontal_space* newline as ws) { Lexing.new_line lexbuf; Single_newline ws @@ -429,7 +431,7 @@ and token input = parse | (horizontal_space+ as ws) { Space ws } - | (horizontal_space* (newline horizontal_space*)? as p) '}' + | (horizontal_space* (newline)? as p) '}' { update_content_newlines ~content:p lexbuf; RIGHT_BRACE } @@ -452,7 +454,7 @@ and token input = parse { PLUS } | "{b" - { Style Bold } + { Style Bold } | "{i" { Style Italic } @@ -485,13 +487,13 @@ and token input = parse | "{!modules:" { MODULES } -| (media_start as start) +| (reference as opening_delimiter) { let start_offset = Lexing.lexeme_start lexbuf in - let target = - reference_content input start start_offset (Buffer.create 16) lexbuf + let inner = + reference_content input opening_delimiter start_offset (Buffer.create 16) lexbuf in - reference_token media start target input lexbuf + reference_token lexbuf input media ~start_offset ~opening_delimiter ~inner } | "{[" @@ -505,7 +507,14 @@ and token input = parse in let emit_truncated_code_block () = let empty_content = with_location_adjustments (fun _ _ -> Loc.at) lexbuf input "" in - Code_block { metadata = Some { language_tag; tags = None }; delimiter = Some delimiter; content = empty_content } + Code_block { + inner = { + metadata = Some { language_tag; tags = None }; + delimiter = Some delimiter; + content = empty_content + }; + start = input.offset_to_location start_offset + } in (* Disallow result block sections for code blocks without a delimiter. This avoids the surprising parsing of '][' ending the code block. *) @@ -539,8 +548,9 @@ and token input = parse | "{%" ((raw_markup_target as target) ':')? (raw_markup as s) ("%}" | eof as e) - { - let token = Raw_markup (target, s) in + { + let start = input.offset_to_location @@ Lexing.lexeme_start lexbuf in + let token = Raw_markup { inner = (target, s); start } in if e <> "%}" then begin let not_allowed = Parse_error.not_allowed @@ -581,57 +591,67 @@ and token input = parse { Table_cell `Data } | '{' (['0'-'9']+ as level) ':' (([^ '}'] # space_char)* as label) - { Section_heading (heading_level lexbuf input level, Some label) } + { let start = input.offset_to_location @@ Lexing.lexeme_start lexbuf in + Section_heading { inner = (heading_level lexbuf input level, Some label); start } } | '{' (['0'-'9']+ as level) - { Section_heading (heading_level lexbuf input level, None) } + { let start = input.offset_to_location @@ Lexing.lexeme_start lexbuf in + Section_heading { inner = (heading_level lexbuf input level, None); start } } | "@author" horizontal_space+ (([^ '\r' '\n']*)? as author) - { Author (trim_horizontal_start author ) } + { let start = input.offset_to_location @@ Lexing.lexeme_start lexbuf in + Author { inner = (trim_horizontal_start author); start } } | "@deprecated" { DEPRECATED } - | "@param" horizontal_space+ ((_ # space_char)+ as name) - { Param name } + | "@param" horizontal_space+ ((_ # space_char)+ as inner) + { let start = input.offset_to_location @@ Lexing.lexeme_start lexbuf in + Param { inner; start } } - | ("@raise" | "@raises") horizontal_space+ ((_ # space_char)+ as name) - { Raise name } + | ("@raise" | "@raises") horizontal_space+ ((_ # space_char)+ as inner) + { let start = input.offset_to_location @@ Lexing.lexeme_start lexbuf in + Raise { inner; start } } | ("@return" | "@returns") { RETURN } | "@see" horizontal_space* '<' ([^ '>']* as url) '>' - { See (URL, trim_horizontal_start url) } + { let start = input.offset_to_location @@ Lexing.lexeme_start lexbuf in + See { inner = (URL, trim_horizontal_start url); start } } | "@see" horizontal_space* '\'' ([^ '\'']* as filename) '\'' - { See (File, trim_horizontal_start filename) } + { let start = input.offset_to_location @@ Lexing.lexeme_start lexbuf in + See { inner = (File, trim_horizontal_start filename); start } } | "@see" horizontal_space* '"' ([^ '"']* as name) '"' - { See (Document, trim_horizontal_start name) } + { let start = input.offset_to_location @@ Lexing.lexeme_start lexbuf in + See { inner = (Document, trim_horizontal_start name); start } } (* NOTE: These tags will match the whitespace preceding the content and pass that to the token. I've tried to match on the whitespace as a separate thing from the token body but that seems to cause problems. This is (maybe?) an issue because the tests expect the token body to have no leading whitespace. What do we do here? *) - | "@since" horizontal_space+ (([^ '\r' '\n']+) as version) - { Since version } + | "@since" horizontal_space+ (([^ '\r' '\n']+) as inner) + { let start = input.offset_to_location @@ Lexing.lexeme_start lexbuf in + Since { inner; start } } | "@since" - { Since "" } + { Since { inner = ""; start = input.offset_to_location @@ Lexing.lexeme_start lexbuf } } - | "@before" horizontal_space+ ((_ # space_char)+ as version) - { Before version } + | "@before" horizontal_space+ ((_ # space_char)+ as inner) + { let start = input.offset_to_location @@ Lexing.lexeme_start lexbuf in + Before { inner; start } } - | "@version" horizontal_space+ (([^ '\r' '\n']+) as version) - { Version version } + | "@version" horizontal_space+ (([^ '\r' '\n']+) as inner) + { Version { inner; start = input.offset_to_location @@ Lexing.lexeme_start lexbuf } } | "@version" - { Version "" } + { Version { inner = ""; start = input.offset_to_location @@ Lexing.lexeme_start lexbuf } } - | "@canonical" horizontal_space+ (([^ '\r' '\n']+) as identifier) - { Canonical identifier } + | "@canonical" horizontal_space+ (([^ '\r' '\n']+) as inner) + { Canonical { inner; start = input.offset_to_location @@ Lexing.lexeme_start lexbuf } } | "@canonical" - { Canonical "" } + { Canonical { inner = ""; start = input.offset_to_location @@ Lexing.lexeme_start lexbuf } } | "@inline" { INLINE } @@ -666,15 +686,15 @@ and token input = parse | "@param" { warning lexbuf input Parse_error.truncated_param; - Param "" } + Param { inner = ""; start = input.offset_to_location @@ Lexing.lexeme_start lexbuf } } | ("@raise" | "@raises") as tag { warning lexbuf input (Parse_error.truncated_raise tag); - Raise "" } + Raise { inner = ""; start = input.offset_to_location @@ Lexing.lexeme_start lexbuf } } | "@before" { warning lexbuf input Parse_error.truncated_before; - Before "" } + Before { inner = ""; start = input.offset_to_location @@ Lexing.lexeme_start lexbuf } } | "@see" { warning lexbuf input Parse_error.truncated_see; @@ -696,7 +716,7 @@ and code_span buffer nesting_level start_offset input = parse | ']' { if nesting_level = 0 then - Code_span (Buffer.contents buffer) + Code_span { inner = (Buffer.contents buffer); start = input.offset_to_location start_offset } else begin Buffer.add_char buffer ']'; code_span buffer (nesting_level - 1) start_offset input lexbuf @@ -716,7 +736,7 @@ and code_span buffer nesting_level start_offset input = parse let not_allowed = Parse_error.not_allowed ~what:(Tokens.describe (Blank_line "\n\n")) - ~in_what:(Tokens.describe (Code_span "")) + ~in_what:(Tokens.describe (Code_span {inner = ""; start = Loc.dummy_pos})) in warning lexbuf input not_allowed; update_content_newlines ~content:("\n" ^ ws) lexbuf; @@ -734,10 +754,10 @@ and code_span buffer nesting_level start_offset input = parse let not_allowed = Parse_error.not_allowed ~what:(Tokens.describe END) - ~in_what:(Tokens.describe (Code_span "")) + ~in_what:(Tokens.describe (Code_span {inner = ""; start = Loc.dummy_pos})) in warning lexbuf input not_allowed; - Code_span (Buffer.contents buffer) + Code_span { inner = (Buffer.contents buffer); start = input.offset_to_location start_offset } } | _ as c @@ -839,7 +859,7 @@ and verbatim buffer last_false_terminator start_offset input = parse input (Parse_error.not_allowed ~what:(Tokens.describe END) - ~in_what:(Tokens.describe (Verbatim ""))) + ~in_what:(Tokens.describe (Verbatim {inner = ""; start = Loc.dummy_pos}))) | Some location -> warning lexbuf @@ -857,14 +877,14 @@ and verbatim buffer last_false_terminator start_offset input = parse and bad_markup_recovery start_offset input = parse - | [^ '}']+ as text '}' as rest + | [^ '}']+ as inner '}' as rest { let suggestion = - Printf.sprintf "did you mean '{!%s}' or '[%s]'?" text text + Printf.sprintf "did you mean '{!%s}' or '[%s]'?" inner inner in let bad_markup = Parse_error.bad_markup ("{" ^ rest) ~suggestion in warning lexbuf input ~start_offset bad_markup; - Code_span text + Code_span { inner; start = input.offset_to_location start_offset } } (* The second field of the metadata. @@ -892,7 +912,8 @@ and code_block allow_result_block start_offset content_offset metadata buffer de { if delimiter = delim' && allow_result_block then emit_code_block lexbuf - input + input + ~start_offset ~content_offset ~metadata ~delimiter:(Some delimiter) @@ -916,7 +937,8 @@ and code_block allow_result_block start_offset content_offset metadata buffer de if delimiter = delim' then emit_code_block lexbuf - input + input + ~start_offset ~content_offset ~metadata ~delimiter:(Some delimiter) @@ -941,7 +963,8 @@ and code_block allow_result_block start_offset content_offset metadata buffer de warning lexbuf input ~start_offset Parse_error.truncated_code_block; emit_code_block lexbuf - input + input + ~start_offset ~content_offset ~metadata ~delimiter:(Some delimiter) diff --git a/src/parser/loc.ml b/src/parser/loc.ml index cf42693b8b..e12e8d3107 100644 --- a/src/parser/loc.ml +++ b/src/parser/loc.ml @@ -1,4 +1,9 @@ type point = { line : int; column : int } + +let make_point : Lexing.position -> point = + fun Lexing.{ pos_lnum; pos_cnum; pos_bol; _ } -> + { line = pos_lnum; column = pos_cnum - pos_bol } + type span = { file : string; start : point; end_ : point } type +'a with_location = { location : span; value : 'a } @@ -14,10 +19,7 @@ let dummy_pos : point = { line = -1; column = -1 } let of_position : ?filename:string -> Lexing.position * Lexing.position -> span = fun ?filename (start, end_) -> - let to_point Lexing.{ pos_lnum; pos_cnum; pos_bol; _ } = - { line = pos_lnum; column = pos_cnum - pos_bol } - in - let start_point = to_point start and end_point = to_point end_ in + let start_point = make_point start and end_point = make_point end_ in { file = Option.value ~default:start.pos_fname filename; start = start_point; diff --git a/src/parser/loc.mli b/src/parser/loc.mli index f85c24f185..aa14e9ed97 100644 --- a/src/parser/loc.mli +++ b/src/parser/loc.mli @@ -8,6 +8,7 @@ type point = { line : int; column : int } (** A specific character *) +val make_point : Lexing.position -> point val dummy_pos : point type span = { file : string; start : point; end_ : point } diff --git a/src/parser/parser.mly b/src/parser/parser.mly index d978783b0e..311464bdd4 100644 --- a/src/parser/parser.mly +++ b/src/parser/parser.mly @@ -1,6 +1,5 @@ %{ open Writer.Prelude - let ( let+ ) = Fun.flip Writer.map let wrap_location : Lexing.position * Lexing.position -> 'a -> 'a Loc.with_location = fun pos value -> let location = Loc.of_position pos in @@ -202,14 +201,14 @@ let legal_module_list : Ast.inline_element Loc.with_location list -> bool = %token MODULES "{!modules:" -%token Math_span "{m" -%token Math_block "{math" +%token Math_span "{m" +%token Math_block "{math" -%token Raw_markup "{%%}" +%token <(string option * string) Tokens.with_start_pos> Raw_markup "{%%}" -%token Code_block "{[]}" (* Self-contained code-block *) -%token Code_block_with_output "{[][" (* Code block that expects some block elements *) -%token Code_span "[]" +%token Code_block "{[]}" (* Self-contained code-block *) +%token Code_block_with_output "{[][" (* Code block that expects some block elements *) +%token Code_span "[]" %token List "{ol" (* or '{ul' *) %token LI "{li" @@ -223,31 +222,31 @@ let legal_module_list : Ast.inline_element Loc.with_location list -> bool = %token Table_cell "{td" (* or '{th' for header *) (* Where N is an integer *) -%token Section_heading "{N:" +%token <(int * string option) Tokens.with_start_pos> Section_heading "{N:" (* Tags *) -%token Author "@author" +%token Author "@author" %token DEPRECATED "@deprecated" -%token Param "@param" -%token Raise "@raise(s)" +%token Param "@param" +%token Raise "@raise(s)" %token RETURN "@return" -%token See "@see" -%token Since "@since" -%token Before "@before" -%token Version "@version" -%token Canonical "@canonical" +%token <(Tokens.internal_reference * string) Tokens.with_start_pos> See "@see" +%token Since "@since" +%token Before "@before" +%token Version "@version" +%token Canonical "@canonical" %token INLINE "@inline" %token OPEN "@open" %token CLOSED "@closed" %token HIDDEN "@hidden" -%token Simple_ref "{!" -%token Ref_with_replacement "{{!" -%token Simple_link "{:" -%token Link_with_replacement "{{:" -%token Media "{(format)!" -%token Media_with_replacement "{(format):" (* where 'format' is audio, video, image *) -%token Verbatim "{v" +%token Simple_ref "{!" +%token Ref_with_replacement "{{!" +%token Simple_link "{:" +%token Link_with_replacement "{{:" +%token <(Tokens.media * Tokens.media_target) Tokens.with_start_pos> Media "{(format)!" +%token <(Tokens.media * Tokens.media_target * string) Tokens.with_start_pos> Media_with_replacement "{(format):" (* where 'format' is audio, video, image *) +%token Verbatim "{v" %token END @@ -387,28 +386,31 @@ let toplevel_error := (* SECTION HEADING *) let section_heading := - | content = located(Section_heading); children = sequence_nonempty(inline_element(whitespace)); endpos = located(RIGHT_BRACE); { - let span = Loc.delimited content endpos in - let (num, title) = content.Loc.value in + | content = Section_heading; children = sequence_nonempty(inline_element(whitespace)); endpos = located(RIGHT_BRACE); { + let Tokens.{ inner = (num, title); start } = content in + let span = { endpos.Loc.location with start } in Writer.map (fun c -> Loc.at span @@ `Heading (num, title, trim_start c)) children } - | content = located(Section_heading); endpos = located(RIGHT_BRACE); { - let span = Loc.delimited content endpos - and (num, title) = content.Loc.value in + | content = Section_heading; endpos = located(RIGHT_BRACE); { + let Tokens.{ inner = (num, title); start } = content in + let span = { endpos.Loc.location with start } in let should_not_be_empty = - let what = Tokens.describe @@ Section_heading (num, title) in + let what = Tokens.describe @@ Section_heading content in Writer.Warning (Parse_error.should_not_be_empty ~what span) in let node = Loc.at span @@ `Heading (num, title, []) in Writer.with_warning node should_not_be_empty } | content = with_position(Section_heading); end_pos = position(error); { - let (num, title), start_pos = content in - let span = Loc.span @@ List.map Loc.of_position [start_pos; end_pos] in + let Tokens.{ inner; start }, start_pos = content in + let (num, title) = inner in + let end_ = Loc.make_point @@ snd end_pos in + let file = (fst end_pos).Lexing.pos_fname in + let span = Loc.{ file; start; end_; } in let start_pos = fst start_pos and end_pos = snd end_pos in let illegal = Writer.InputNeeded (fun input -> let err = Loc.extract ~input ~start_pos ~end_pos in - let in_what = Tokens.describe @@ Section_heading (num, title) in + let in_what = Tokens.describe @@ Section_heading (fst content) in Parse_error.illegal ~in_what err span) in let inner = Loc.at span @@ `Heading (num, title, []) in @@ -439,9 +441,10 @@ let return := } let deprecated := - | startpos = located(DEPRECATED); horizontal_whitespace; children = located(sequence_nonempty(nestable_block_element)); { + | startpos = located(DEPRECATED); horizontal_whitespace; children = locatedM(sequence_nonempty(nestable_block_element)); { + let* children = children in let span = Loc.delimited startpos children in - Writer.map (fun c -> Loc.at span @@ `Deprecated c) children.Loc.value + return @@ Loc.at span (`Deprecated children.Loc.value) } | pos = located(DEPRECATED); horizontal_whitespace?; { return @@ { pos with Loc.value = `Deprecated [] } } @@ -453,94 +456,106 @@ let deprecated := } let before := - | content = located(Before); horizontal_whitespace; children = sequence_nonempty(nestable_block_element); { + | content = Before; horizontal_whitespace; children = sequence_nonempty(nestable_block_element); { let* children = children in - let span = Loc.span @@ content.Loc.location :: List.map Loc.location children in - let inner = Loc.at span @@ `Before (content.Loc.value, children) in + let Tokens.{ inner; start } = content in + let child_span = Loc.span @@ List.map Loc.location children in + let span = { child_span with start } in + let inner = Loc.at span @@ `Before (inner, children) in return inner } - | version = located(Before); { - return { version with value = `Before (version.Loc.value, []) } + | content = located(Before); { + let Loc.{ value = Tokens.{ inner; start }; location } = content in + return { Loc.value = `Before (inner, []); location = { location with start } } } let raise := - | exn = located(Raise); horizontal_whitespace; children = located(sequence_nonempty(nestable_block_element)); { - let span = Loc.delimited exn children in - let exn = exn.Loc.value in + | content = located(Raise); horizontal_whitespace; children = located(sequence_nonempty(nestable_block_element)); { + let Loc.{ value = Tokens.{ inner; start }; location } = content in + let location = { location with start } in let* children = children.Loc.value in - let inner = Loc.at span @@ `Raise (exn, children) in + let span = Loc.span @@ location :: List.map Loc.location children in + let inner = Loc.at span @@ `Raise (inner, children) in return inner } - | exn = located(Raise); horizontal_whitespace?; { - return { exn with Loc.value = `Raise (exn.Loc.value, []) } + | content = located(Raise); horizontal_whitespace?; { + let Loc.{ value = Tokens.{ inner; start }; location } = content in + let location = { location with start } in + return @@ Loc.at location (`Raise ( inner, [])) } let see := | content = located(See); horizontal_whitespace; children = located(sequence_nonempty(nestable_block_element)); { - let span = Loc.delimited content children in - let (kind, href) = content.Loc.value in + let Loc.{ value = Tokens.{ inner = (kind, href); start }; location } = content in + let span = Loc.delimited { content with location = { location with start }} children in let* children = children.Loc.value in let inner = Loc.at span @@ `See (Tokens.to_ast_ref kind, href, children) in return inner } | content = located(See); horizontal_whitespace?; { - let (kind, href) = content.Loc.value in - return { content with value = `See (Tokens.to_ast_ref kind, href, []) } + let Loc.{ value = Tokens.{ inner = (kind, href); start }; location } = content in + let location = { location with start } in + return @@ Loc.at location @@ `See (Tokens.to_ast_ref kind, href, []) } let param := | content = located(Param); horizontal_whitespace; children = sequence_nonempty(nestable_block_element); { let* children = children in - let span = Loc.span @@ content.Loc.location :: List.map Loc.location children in - let ident = content.Loc.value in - let inner = Loc.at span @@ `Param (ident, children) in + let Loc.{ value = Tokens.{ inner; start }; location } = content in + let span = Loc.span @@ { location with start } :: List.map Loc.location children in + let inner = Loc.at span @@ `Param (inner, children) in return inner } | content = located(Param); horizontal_whitespace?; { - return { content with Loc.value = `Param (content.Loc.value, []) } + let Loc.{ value = Tokens.{ inner; start }; location } = content in + return @@ Loc.at { location with start } (`Param (inner, [])) } let tag_bare := - | content = located(Version); horizontal_whitespace?; - { - let Loc.{ value = version; location } = content in - let what = Tokens.describe (Version version) in - let warning = - Writer.Warning (Parse_error.should_not_be_empty ~what location) - in - Writer.ensure has_content warning (return version) - |> Writer.map (fun v -> { content with value = `Version v }) - } - | content = located(Since); - { - let Loc.{ value = version; location } = content in - let what = Tokens.describe (Since version) in - let warning = - Writer.Warning (Parse_error.should_not_be_empty ~what location) - in - Writer.ensure has_content warning (return version) - |> Writer.map (fun v -> { content with value = `Since v }) - } - | impl = located(Canonical); - { - let what = Tokens.describe @@ Canonical "" in - let warning = - let span = Loc.of_position $sloc in - Writer.Warning (Parse_error.should_not_be_empty ~what span) - in - Writer.ensure (Loc.is has_content) warning @@ return impl - |> Writer.map (fun v -> Loc.at impl.Loc.location @@ `Canonical v) - } - | content = located(Author); - { - let Loc.{ value; location } = content in - let what = Tokens.describe @@ Author value in - let warning = - Writer.Warning (Parse_error.should_not_be_empty ~what location) - in - Writer.ensure has_content warning @@ return value - |> Writer.map (fun a -> { content with value = `Author a }) - } + | content = located(Version); horizontal_whitespace?; { + let Loc.{ value; location } = content in + let Tokens.{ inner; start } = value in + let span = { location with start } in + let what = Tokens.describe (Version value) in + let warning = + Writer.Warning (Parse_error.should_not_be_empty ~what span) + in + Writer.ensure has_content warning (return inner) + |> Writer.map (fun v -> Loc.at location @@ `Version v ) + } + | content = located(Since); { + let Loc.{ value; location } = content in + let Tokens.{ inner; start } = value in + let location = { location with start } in + let what = Tokens.describe (Since value) in + let warning = + Writer.Warning (Parse_error.should_not_be_empty ~what location) + in + Writer.ensure has_content warning (return inner) + |> Writer.map (fun v -> Loc.at location @@ `Since v ) + } + | content = located(Canonical); { + let Loc.{ value; location } = content in + let Tokens.{ inner; start } = value in + let what = Tokens.describe @@ Canonical value in + let warning = + let span = Loc.of_position $sloc in + Writer.Warning (Parse_error.should_not_be_empty ~what span) + in + let location = { location with start } in + Writer.ensure has_content warning @@ return inner + |> Writer.map (fun value -> Loc.at location @@ `Canonical (Loc.at location value)) + } + | content = located(Author); { + let Loc.{ value; location } = content in + let Tokens.{ inner; start } = value in + let what = Tokens.describe @@ Author value in + let warning = + Writer.Warning (Parse_error.should_not_be_empty ~what location) + in + Writer.ensure has_content warning @@ return inner + |> Writer.map (fun a -> { Loc.value = `Author a; location = { location with start } }) + } | pos = position(OPEN); whitespace?; { let loc = Loc.of_position pos in return @@ Loc.at loc `Open } | pos = position(INLINE); whitespace?; { let loc = Loc.of_position pos in return @@ Loc.at loc `Inline } | pos = position(CLOSED); whitespace?; { let loc = Loc.of_position pos in return @@ Loc.at loc `Closed } @@ -548,21 +563,30 @@ let tag_bare := (* INLINE ELEMENTS *) +(* + If we're calling `inline_element` from another rule higher up the parse tree, + it should only accept horizontal whitespace. + If we're calling it recursively from an inline element with some delimiters, + we can allow any whitespace +*) let inline_element(ws) := | ~ = inline_element_without_whitespace; <> | s = located(ws); { return s } let inline_element_without_whitespace := (* Single token inline elements which are mostly handled in the lexer *) - | c = located(Code_span); { - return @@ Loc.map (fun c -> `Code_span c) c + | content = located(Code_span); { + let Loc.{ value = Tokens.{ inner; start }; location } = content in + return @@ Loc.at { location with start } (`Code_span inner) + } + | m = located(Raw_markup); { + let Loc.{ value = Tokens.{ inner; start }; location } = m in + return @@ Loc.at { location with start } (`Raw_markup inner) } - | m = located(Raw_markup); { return @@ Loc.map (fun m -> `Raw_markup m) m } | w = located(Word); { return @@ Loc.map (fun w -> `Word w) w } | m = located(Math_span); { - let Loc.{ value = Tokens.{ start; content }; location } = m in - let span = { location with start } in - return @@ Loc.at span (`Math_span content) + let Loc.{ value = Tokens.{ start; inner }; location } = m in + return @@ Loc.at { location with start } (`Math_span inner) } (* More complex/recursive inline elements should have their own rule *) | ~ = style; <> @@ -610,7 +634,6 @@ let style := let span = Loc.delimited style endpos in let style = style.Loc.value in let warning = - let span = Loc.of_position $sloc in let in_what = Tokens.describe @@ Style style in Writer.Warning (Parse_error.end_not_allowed ~in_what span) in @@ -622,87 +645,78 @@ let style := let reference := | ref_body = located(Simple_ref); children = sequence(inline_element(whitespace)); { - let+ children = children in - let startpos = Loc.nudge_start (-2) ref_body.Loc.location in - let span = Loc.span @@ startpos :: List.map Loc.location children in - let ref_body = Loc.at startpos ref_body.Loc.value in - Loc.at span @@ `Reference (`Simple, ref_body, trim_start children) + let* children = children in + let Loc.{ value = Tokens.{ inner; start }; location } = ref_body in + let span = Loc.span @@ { location with start } :: List.map Loc.location children in + return @@ Loc.at span @@ `Reference (`Simple, Loc.at location inner, trim_start children) } - | ref_body = located(Ref_with_replacement); children = sequence_nonempty(inline_element(whitespace)); endpos = located(RIGHT_BRACE); { - let+ children = children in - let startpos = Loc.nudge_map_start (-3) ref_body in - let ref_body = Loc.same startpos ref_body.Loc.value in - let span = Loc.delimited startpos endpos in - Loc.at span @@ `Reference (`With_text, ref_body, trim_start children) + | ref_body = Ref_with_replacement; children = sequence_nonempty(inline_element(whitespace)); endpos = located(RIGHT_BRACE); { + let* children = children in + let Tokens.{ inner; start } = ref_body in + let span = { endpos.Loc.location with start } in + return @@ Loc.at span @@ `Reference (`With_text, Loc.at span inner, trim_start children) } - | ref_body = located(Ref_with_replacement); endpos = located(RIGHT_BRACE); { - let startpos = Loc.nudge_map_start (-3) ref_body in - let span = Loc.delimited startpos endpos in - let node = Loc.at span @@ `Reference (`With_text, ref_body, []) in + | ref_body = Ref_with_replacement; endpos = located(RIGHT_BRACE); { + let Tokens.{ inner; start } = ref_body in + let span = { endpos.Loc.location with start } in + let node = Loc.at span @@ `Reference (`With_text, Loc.at span inner, []) in let warning = - let what = Tokens.describe @@ Ref_with_replacement (Loc.value ref_body) in + let what = Tokens.describe @@ Ref_with_replacement ref_body in Writer.Warning (Parse_error.should_not_be_empty ~what span) in Writer.with_warning node warning } - | ref_body = located(Ref_with_replacement); children = sequence_nonempty(inline_element(whitespace))?; endpos = located(END); { - let Loc.{value = ref_body_value; location} = ref_body in - let startpos = Loc.nudge_start (-3) location in - let ref_body : string Loc.with_location = Loc.at startpos ref_body_value in - let span = Loc.delimited ref_body endpos in + | ref_body = Ref_with_replacement; children = sequence_nonempty(inline_element(whitespace))?; endpos = located(END); { + let Tokens.{ inner; start } = ref_body in + let span = { endpos.Loc.location with start } in let not_allowed = - let in_what = Tokens.describe (Ref_with_replacement ref_body_value) in + let in_what = Tokens.describe (Ref_with_replacement ref_body) in Writer.Warning (Parse_error.end_not_allowed ~in_what span) in let* children = Option.value ~default:(return []) children in - let node = Loc.at span @@ `Reference (`With_text, ref_body, children) in + let node = Loc.at span @@ `Reference (`With_text, Loc.at span inner, children) in Writer.with_warning node not_allowed } let link := - | link_body = located(Simple_link); endpos = located(RIGHT_BRACE); - { - let span = Loc.delimited (Loc.nudge_map_start (-2) link_body) endpos in - let link_body = link_body.Loc.value in - let node = Loc.at span @@ `Link (link_body, []) in - let url = String.trim link_body in - if "" = url then - let what = Tokens.describe @@ Simple_link link_body in - let warning = - Writer.Warning (Parse_error.should_not_be_empty ~what span) - in - Writer.with_warning node warning - else - return node - } - | link_body = located(Link_with_replacement); children = sequence_nonempty(inline_element(whitespace)); endpos = located(RIGHT_BRACE); - { - let* c = children in - let span = Loc.delimited link_body endpos in - let link_body = link_body.Loc.value in - let node = Loc.at span @@ `Link (link_body, c) in - if "" = link_body then - let what = Tokens.describe @@ Link_with_replacement link_body in - let warning = - Writer.Warning (Parse_error.should_not_be_empty ~what span) - in - Writer.with_warning node warning - else - return node - } - | link_body = located(Link_with_replacement); endpos = located(RIGHT_BRACE); - { - let span = - Loc.delimited link_body endpos + | content = Simple_link; endpos = located(RIGHT_BRACE); { + let Tokens.{ inner; start } = content in + let span = { endpos.Loc.location with start } in + let node = Loc.at span @@ `Link (inner, []) in + let url = String.trim inner in + if "" = url then + let what = Tokens.describe @@ Simple_link content in + let warning = + Writer.Warning (Parse_error.should_not_be_empty ~what span) in - let link_body = link_body.Loc.value in - let node = Loc.at span @@ `Link (link_body, []) in - let what = Tokens.describe @@ Link_with_replacement link_body in - let warning = - Writer.Warning (Parse_error.should_not_be_empty ~what span) + Writer.with_warning node warning + else + return node + } + | content = Link_with_replacement; children = sequence_nonempty(inline_element(whitespace)); endpos = located(RIGHT_BRACE); { + let* c = children in + let Tokens.{ inner; start } = content in + let span = { endpos.Loc.location with start } in + let node = Loc.at span @@ `Link (inner, c) in + if "" = inner then + let what = Tokens.describe @@ Link_with_replacement content in + let warning = + Writer.Warning (Parse_error.should_not_be_empty ~what span) in Writer.with_warning node warning - } + else + return node + } + | content = Link_with_replacement; endpos = located(RIGHT_BRACE); { + let Tokens.{ inner; start } = content in + let span = { endpos.Loc.location with start } in + let node = Loc.at span @@ `Link (inner, []) in + let what = Tokens.describe @@ Link_with_replacement content in + let warning = + Writer.Warning (Parse_error.should_not_be_empty ~what span) + in + Writer.with_warning node warning + } (* LIST *) @@ -828,9 +842,9 @@ let odoc_list := (* TABLES *) let cell_heavy := - | cell_kind = Table_cell; whitespace?; children = sequence_nonempty(nestable_block_element); RIGHT_BRACE; whitespace?; + | cell_kind = Table_cell; whitespace*; children = sequence_nonempty(nestable_block_element); RIGHT_BRACE; whitespace*; { Writer.map (fun c -> (c, cell_kind)) children } - | cell_kind = Table_cell; RIGHT_BRACE; whitespace?; + | cell_kind = Table_cell; RIGHT_BRACE; whitespace*; { return ([], cell_kind) } | cell_kind = Table_cell; children = sequence_nonempty(nestable_block_element)?; errloc = position(error); { @@ -847,8 +861,8 @@ let cell_heavy := } let row_heavy := - | TABLE_ROW; whitespace?; ~ = sequence_nonempty(cell_heavy); RIGHT_BRACE; whitespace?; <> - | TABLE_ROW; whitespace?; RIGHT_BRACE; whitespace?; { return [] } + | TABLE_ROW; whitespace*; ~ = sequence_nonempty(cell_heavy); RIGHT_BRACE; whitespace*; <> + | TABLE_ROW; whitespace*; RIGHT_BRACE; whitespace*; { return [] } | TABLE_ROW; children = sequence_nonempty(cell_heavy)?; errloc = position(error); { let warning = fun input -> @@ -863,7 +877,7 @@ let row_heavy := } let table_heavy := - | grid = delimited_location(TABLE_HEAVY, whitespace?; sequence_nonempty(row_heavy), RIGHT_BRACE); { + | grid = delimited_location(TABLE_HEAVY, whitespace*; sequence_nonempty(row_heavy), RIGHT_BRACE); { Writer.map (Loc.map (fun grid -> `Table ((grid, None), `Heavy))) (Writer.sequence_loc grid) } | startpos = located(TABLE_HEAVY); endpos = located(RIGHT_BRACE); { @@ -871,7 +885,7 @@ let table_heavy := let inner = Loc.at span @@ `Table (([], None), `Heavy) in return inner } - | startpos = located(TABLE_HEAVY); whitespace?; grid = sequence_nonempty(row_heavy)?; errloc = position(error); + | startpos = located(TABLE_HEAVY); whitespace*; grid = sequence_nonempty(row_heavy)?; errloc = position(error); { let warning = fun input -> let (start_pos, end_pos) as loc = errloc in @@ -942,25 +956,25 @@ let table := (* MEDIA *) -(* TODO: Naming needs some work here, multiple fields labeled as `kind` which - doesn't make sense *) let media := - | media = located(Media); whitespace*; - { - let (located_media_kind, media_href) = split_simple_media media in - let wrapped_located_kind = Loc.map href_of_media located_media_kind in - let kind = media_kind_of_target media_href in - let inner = Loc.(at media.location @@ `Media (`Simple, wrapped_located_kind, "", kind)) in - return inner - } - | media = located(Media_with_replacement); whitespace*; - { - let (located_media_kind, media_href, content) = split_replacement_media media in - let wrapped_located_kind = Loc.map href_of_media located_media_kind in - let kind = media_kind_of_target media_href in - let inner = Loc.(at media.location @@ `Media (`With_text, wrapped_located_kind, content, kind)) in - return inner - } + | content = located(Media); whitespace*; { + let Loc.{ value = Tokens.{ inner; start }; location } = content in + let span = { location with start } in + let (located_media_kind, media_href) = split_simple_media @@ Loc.at span inner in + let wrapped_located_kind = Loc.map href_of_media located_media_kind in + let kind = media_kind_of_target media_href in + let inner = Loc.at span @@ `Media (`Simple, wrapped_located_kind, "", kind) in + return inner + } + | content = located(Media_with_replacement); whitespace*; { + let Loc.{ value = Tokens.{ inner; start }; location } = content in + let span = { location with start } in + let (located_media_kind, media_href, content) = split_replacement_media @@ Loc.at span inner in + let wrapped_located_kind = Loc.map href_of_media located_media_kind in + let kind = media_kind_of_target media_href in + let inner = Loc.at span @@ `Media (`With_text, wrapped_located_kind, content, kind) in + return inner + } (* TOP-LEVEL ELEMENTS *) @@ -987,14 +1001,14 @@ let paragraph_style := content = located(Paragraph_style); ws = paragraph; endpo let verbatim := verbatim = located(Verbatim); { let Loc.{ value; location } = verbatim in + let Tokens.{ start; inner } = value in + let span = { location with start } in let what = Tokens.describe @@ Verbatim value in let warning = - let span = Loc.of_position $sloc in Writer.Warning (Parse_error.should_not_be_empty ~what span) in - let location = Loc.nudge_start (-String.length "{v ") location in - let verbatim = Loc.at location @@ `Verbatim value in - Writer.ensure has_content warning (return value) + let verbatim = Loc.at span @@ `Verbatim inner in + Writer.ensure has_content warning (return inner) |> Writer.map (Fun.const verbatim) } @@ -1004,28 +1018,30 @@ let paragraph := items = sequence_nonempty(inline_element(horizontal_whitespace) let code_block := | content = located(Code_block); { - return @@ Loc.map (fun Tokens.{metadata; delimiter; content} -> - let meta = Option.map (fun Tokens.{language_tag; tags} -> Ast.{ language = language_tag; tags }) metadata in - `Code_block Ast.{ meta; delimiter; content; output = None } - ) content + let Loc.{ value = Tokens.{ inner; start }; location } = content in + let Tokens.{metadata; delimiter; content} = inner in + let meta = Option.map (fun Tokens.{language_tag; tags} -> Ast.{ language = language_tag; tags }) metadata in + let node = `Code_block Ast.{ meta; delimiter; content; output = None } in + return @@ Loc.at { location with start } node } | content = located(Code_block_with_output); output = sequence_nonempty(nestable_block_element); RIGHT_CODE_DELIMITER; { - let* output = output in - return @@ Loc.map (fun Tokens.{metadata; delimiter; content} -> - let meta = Option.map (fun Tokens.{language_tag; tags} -> Ast.{ language = language_tag; tags }) metadata in - `Code_block Ast.{ meta; delimiter; content; output = Some output } - ) content + let* output = Writer.map Option.some output in + let Loc.{ value = Tokens.{ inner; start }; location } = content in + let Tokens.{metadata; delimiter; content} = inner in + let meta = Option.map (fun Tokens.{language_tag; tags} -> Ast.{ language = language_tag; tags }) metadata in + let node = `Code_block Ast.{ meta; delimiter; content; output } in + return @@ Loc.at { location with start } node } let math_block := inner = located(Math_block); { - let Loc.{ value ; location } = inner in - let Tokens.{start; content} = value in + let Loc.{ value; location } = inner in + let Tokens.{ start; inner } = value in let span = { location with start } in let what = Tokens.describe @@ Math_block value in let warning = Writer.Warning (Parse_error.should_not_be_empty ~what span) in - Writer.ensure has_content warning (return content) + Writer.ensure has_content warning (return inner) |> Writer.map (fun m -> Loc.at span @@ `Math_block m) } diff --git a/src/parser/tokens.ml b/src/parser/tokens.ml index ec3ab8312b..a84b3a4a0f 100644 --- a/src/parser/tokens.ml +++ b/src/parser/tokens.ml @@ -17,6 +17,10 @@ let ast_list_kind : list_kind -> Ast.list_kind = function | Ordered -> `Ordered | Unordered -> `Unordered +type 'a with_start_pos = { start : Loc.point; inner : 'a } +let with_start_pos : Lexing.position -> 'a -> 'a with_start_pos = + fun start inner -> { start = Loc.make_point start; inner } + type code_block = { metadata : meta option; delimiter : string option; @@ -31,21 +35,21 @@ type token = | Space of string | Single_newline of string | Blank_line of string - | Simple_ref of string - | Ref_with_replacement of string - | Simple_link of string - | Link_with_replacement of string + | Simple_ref of string with_start_pos + | Ref_with_replacement of string with_start_pos + | Simple_link of string with_start_pos + | Link_with_replacement of string with_start_pos | MODULES - | Media of (media * media_target) - | Media_with_replacement of (media * media_target * string) + | Media of (media * media_target) with_start_pos + | Media_with_replacement of (media * media_target * string) with_start_pos (* Start location *) - | Math_span of math - | Math_block of math - | Code_span of string - | Code_block of code_block - | Code_block_with_output of code_block + | Math_span of string with_start_pos + | Math_block of string with_start_pos + | Code_span of string with_start_pos + | Code_block of code_block with_start_pos + | Code_block_with_output of code_block with_start_pos | Word of string - | Verbatim of string + | Verbatim of string with_start_pos | RIGHT_CODE_DELIMITER | RIGHT_BRACE | Paragraph_style of alignment @@ -60,22 +64,22 @@ type token = | MINUS | PLUS | BAR - | Section_heading of (int * string option) - | Author of string + | Section_heading of (int * string option) with_start_pos + | Author of string with_start_pos | DEPRECATED - | Param of string - | Raise of string + | Param of string with_start_pos + | Raise of string with_start_pos | RETURN - | See of (internal_reference * string) - | Since of string - | Before of string - | Version of string - | Canonical of string + | See of (internal_reference * string) with_start_pos + | Since of string with_start_pos + | Before of string with_start_pos + | Version of string with_start_pos + | Canonical of string with_start_pos | INLINE | OPEN | CLOSED | HIDDEN - | Raw_markup of (string option * string) + | Raw_markup of (string option * string) with_start_pos | END let media_description ref_kind media_kind = @@ -97,10 +101,10 @@ let print : token -> string = function | Simple_link _ -> "{:" | Link_with_replacement _ -> "{{:" | MODULES -> "{!modules:" - | Media (ref_kind, media_kind) -> + | Media { inner = ref_kind, media_kind; _ } -> let ref_kind, media_kind = media_description ref_kind media_kind in Printf.sprintf "{%s%s" media_kind ref_kind - | Media_with_replacement (ref_kind, media_kind, _) -> + | Media_with_replacement { inner = ref_kind, media_kind, _; _ } -> let ref_kind, media_kind = media_description ref_kind media_kind in Printf.sprintf "{{%s%s" media_kind ref_kind | Math_span _ -> "{m" @@ -131,7 +135,7 @@ let print : token -> string = function | MINUS -> "'-'" | PLUS -> "'+'" | BAR -> "'|'" - | Section_heading (level, label) -> + | Section_heading { inner = level, label; _ } -> let label = match label with None -> "" | Some label -> ":" ^ label in Printf.sprintf "'{%i%s'" level label | Author _ -> "'@author'" @@ -148,8 +152,8 @@ let print : token -> string = function | OPEN -> "'@open'" | CLOSED -> "'@closed'" | HIDDEN -> "'@hidden" - | Raw_markup (None, _) -> "'{%...%}'" - | Raw_markup (Some target, _) -> "'{%" ^ target ^ ":...%}'" + | Raw_markup { inner = None, _; _ } -> "'{%...%}'" + | Raw_markup { inner = Some target, _; _ } -> "'{%" ^ target ^ ":...%}'" | END -> "EOI" (* [`Minus] and [`Plus] are interpreted as if they start list items. Therefore, @@ -157,10 +161,10 @@ let print : token -> string = function [`Minus] and [`Plus] should always be plausibly list item bullets. *) let describe : token -> string = function | Space _ -> "(horizontal space)" - | Media (ref_kind, media_kind) -> + | Media { inner = ref_kind, media_kind; _ } -> let ref_kind, media_kind = media_description ref_kind media_kind in Printf.sprintf "{%s%s" media_kind ref_kind - | Media_with_replacement (ref_kind, media_kind, _) -> + | Media_with_replacement { inner = ref_kind, media_kind, _; _ } -> let ref_kind, media_kind = media_description ref_kind media_kind in Printf.sprintf "{{%s%s" media_kind ref_kind | Word w -> Printf.sprintf "'%s'" w @@ -200,7 +204,7 @@ let describe : token -> string = function | MINUS -> "'-' (bulleted list item)" | PLUS -> "'+' (numbered list item)" | BAR -> "'|'" - | Section_heading (level, _) -> + | Section_heading { inner = level, _; _ } -> Printf.sprintf "'{%i ...}' (section heading)" level | Author _ -> "'@author'" | DEPRECATED -> "'@deprecated'" @@ -219,10 +223,16 @@ let describe : token -> string = function let empty_code_block = { - metadata = None; - delimiter = None; - content = - Loc.at Loc.{ start = Loc.dummy_pos; end_ = Loc.dummy_pos; file = "" } ""; + inner = + { + metadata = None; + delimiter = None; + content = + Loc.at + Loc.{ start = Loc.dummy_pos; end_ = Loc.dummy_pos; file = "" } + ""; + }; + start = Loc.dummy_pos; } let of_ast_style : Ast.style -> style = function @@ -243,15 +253,17 @@ let describe_inline : Ast.inline_element -> string = function | `Word w -> describe @@ Word w | `Space _ -> describe @@ Space "" | `Styled (style, _) -> describe @@ Style (of_ast_style style) - | `Code_span _ -> describe @@ Code_span "" - | `Math_span _ -> - describe @@ Math_span { start = Loc.dummy_pos; content = "" } - | `Raw_markup x -> describe @@ Raw_markup x - | `Link (l, []) -> describe @@ Simple_link l - | `Link (l, _ :: _) -> describe @@ Link_with_replacement l - | `Reference (`Simple, { value; _ }, _) -> describe @@ Simple_ref value - | `Reference (`With_text, { value; _ }, _) -> - describe @@ Ref_with_replacement value + | `Code_span _ -> describe @@ Code_span { inner = ""; start = Loc.dummy_pos } + | `Math_span _ -> describe @@ Math_span { start = Loc.dummy_pos; inner = "" } + | `Raw_markup inner -> describe @@ Raw_markup { inner; start = Loc.dummy_pos } + | `Link (inner, []) -> + describe @@ Simple_link { inner; start = Loc.dummy_pos } + | `Link (inner, _ :: _) -> + describe @@ Link_with_replacement { inner; start = Loc.dummy_pos } + | `Reference (`Simple, { value = inner; _ }, _) -> + describe @@ Simple_ref { inner; start = Loc.dummy_pos } + | `Reference (`With_text, { value = inner; _ }, _) -> + describe @@ Ref_with_replacement { inner; start = Loc.dummy_pos } let of_href = function `Reference s -> Reference s | `Link s -> Link s @@ -262,7 +274,8 @@ let of_media_kind = function let of_media = function | `Media (_, Loc.{ value; _ }, _, kind) -> - Media (of_href value, of_media_kind kind) + Media + { inner = (of_href value, of_media_kind kind); start = Loc.dummy_pos } (* NOTE: Fix list *) let describe_nestable_block : Ast.nestable_block_element -> string = function @@ -271,13 +284,13 @@ let describe_nestable_block : Ast.nestable_block_element -> string = function | Loc.{ value; _ } :: _ -> describe_inline value | [] -> describe @@ Word "") | `Code_block _ -> describe @@ Code_block empty_code_block - | `Verbatim _ -> describe @@ Verbatim "" + | `Verbatim _ -> describe @@ Verbatim { inner = ""; start = Loc.dummy_pos } | `Modules _ -> describe MODULES | `List (_, _, _) -> "List" | `Table (_, kind) -> describe @@ if kind = `Light then TABLE_LIGHT else TABLE_HEAVY | `Math_block _ -> - describe @@ Math_block { start = Loc.dummy_pos; content = "" } + describe @@ Math_block { start = Loc.dummy_pos; inner = "" } | `Media _ as media -> describe @@ of_media media let of_ast_ref : [ `Document | `File | `Url ] -> internal_reference = function @@ -291,17 +304,19 @@ let to_ast_ref : internal_reference -> [ `Url | `File | `Document ] = function | Document -> `Document let describe_tag : Ast.tag -> string = function - | `See (kind, _, _) -> describe @@ See (of_ast_ref kind, "") - | `Author s -> describe @@ Author s + | `See (kind, _, _) -> + describe @@ See { inner = (of_ast_ref kind, ""); start = Loc.dummy_pos } + | `Author inner -> describe @@ Author { inner; start = Loc.dummy_pos } | `Deprecated _ -> describe DEPRECATED - | `Param (s, _) -> describe @@ Param s - | `Raise (s, _) -> describe @@ Raise s + | `Param (inner, _) -> describe @@ Param { inner; start = Loc.dummy_pos } + | `Raise (inner, _) -> describe @@ Raise { inner; start = Loc.dummy_pos } | `Return _ -> describe RETURN - | `Since v -> describe @@ Since v - | `Before (v, _) -> describe @@ Before v - | `Version v -> describe @@ Version v + | `Since inner -> describe @@ Since { inner; start = Loc.dummy_pos } + | `Before (inner, _) -> describe @@ Before { inner; start = Loc.dummy_pos } + | `Version inner -> describe @@ Version { inner; start = Loc.dummy_pos } | `Closed -> describe CLOSED | `Open -> describe OPEN - | `Canonical Loc.{ value; _ } -> describe @@ Canonical value + | `Canonical Loc.{ value = inner; _ } -> + describe @@ Canonical { inner; start = Loc.dummy_pos } | `Hidden -> describe HIDDEN | `Inline -> describe INLINE From 5166ec7cc6f060415144d45e66ec4c77c9e17b5c Mon Sep 17 00:00:00 2001 From: faycarsons Date: Thu, 9 Jan 2025 11:46:50 -0500 Subject: [PATCH 121/150] cover style errors --- src/parser/parser.mly | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/src/parser/parser.mly b/src/parser/parser.mly index 311464bdd4..1007802827 100644 --- a/src/parser/parser.mly +++ b/src/parser/parser.mly @@ -370,6 +370,8 @@ let toplevel_error := in Writer.with_warning (list :> Ast.block_element Loc.with_location) (Writer.Warning warning) } + (* + | illegal_elt = tag; { Writer.bind illegal_elt @@ -382,6 +384,7 @@ let toplevel_error := let inner = (illegal_elt :> Ast.block_element Loc.with_location) in Writer.with_warning inner should_begin_on_its_own_line) } + *) (* SECTION HEADING *) @@ -630,6 +633,17 @@ let style := |> Writer.warning not_allowed |> Writer.warning should_not_be_empty } + | style = located(Style); errloc = position(error); { + let span = Loc.span [style.Loc.location; Loc.of_position errloc] in + let illegal = Writer.InputNeeded (fun input -> + let in_what = Tokens.describe @@ Style style.Loc.value in + let (start_pos, end_pos) = errloc in + let illegal_section = Loc.extract ~input ~start_pos ~end_pos in + Parse_error.illegal ~in_what illegal_section span + ) in + let inner = Loc.at span @@ `Styled (Tokens.to_ast_style style.Loc.value, []) in + Writer.with_warning inner illegal + } | style = located(Style); endpos = located(END); { let span = Loc.delimited style endpos in let style = style.Loc.value in From 3bf452d30c55bb8624db2ea0d3dc801bb7551ae7 Mon Sep 17 00:00:00 2001 From: faycarsons Date: Thu, 9 Jan 2025 11:53:33 -0500 Subject: [PATCH 122/150] improve light table whitespace handling --- src/parser/parser.mly | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/src/parser/parser.mly b/src/parser/parser.mly index 1007802827..9b3df33460 100644 --- a/src/parser/parser.mly +++ b/src/parser/parser.mly @@ -278,8 +278,7 @@ let separated_sequence(sep, rule) := xs = separated_list(sep, rule); { Writer.se (* WHITESPACE *) -let horizontal_whitespace := - | ~ = Space; <`Space> +let horizontal_whitespace := ~ = Space; <`Space> let whitespace := | ~ = horizontal_whitespace; <> @@ -916,11 +915,7 @@ let table_heavy := (* LIGHT TABLE *) -let table_light_legal_elt := - | s = located(horizontal_whitespace); { return s } - | ~ = inline_element_without_whitespace; <> - -let cell_content_light := ~ = sequence_nonempty(table_light_legal_elt); <> +let cell_content_light := ~ = sequence_nonempty(inline_element(horizontal_whitespace)); <> let cell := | ~ = cell_content_light; <> | ~ = cell_content_light; BAR; <> @@ -930,10 +925,13 @@ let cells := BAR?; ~ = sequence_nonempty(cell); <> let row_light := | ~ = cells; <> | ~ = cells; Single_newline; <> + | ~ = cells; Single_newline; Space; <> + | ~ = cells; Blank_line; <> + | ~ = cells; Blank_line; Single_newline; <> let rows_light := ~ = sequence_nonempty(row_light); <> -let table_start_light := startpos = located(TABLE_LIGHT); whitespace?; { startpos } +let table_start_light := startpos = located(TABLE_LIGHT); any_whitespace*; { startpos } let table_light := | startpos = table_start_light; data = rows_light; endpos = located(RIGHT_BRACE); { let span = Loc.delimited startpos endpos in From 16ad51e149255575f536e84ddb32eb9330e6c84f Mon Sep 17 00:00:00 2001 From: faycarsons Date: Thu, 9 Jan 2025 15:36:53 -0500 Subject: [PATCH 123/150] make `light_table` more robust --- src/parser/parser.mly | 122 ++++++++++++++++++++++++++---------------- 1 file changed, 77 insertions(+), 45 deletions(-) diff --git a/src/parser/parser.mly b/src/parser/parser.mly index 9b3df33460..29e1ffddaf 100644 --- a/src/parser/parser.mly +++ b/src/parser/parser.mly @@ -8,6 +8,16 @@ let not_empty : 'a list -> bool = function _ :: _ -> true | _ -> false let has_content : string -> bool = fun s -> String.length s > 0 + let trim_start = function + | Loc.{value = `Space _; _ } :: xs -> xs + | xs -> xs + + let paragraph : Ast.inline_element Loc.with_location list -> Ast.nestable_block_element Loc.with_location = + fun elts -> + let elts = trim_start elts in + let span = Loc.span @@ List.map Loc.location elts in + Loc.at span @@ `Paragraph elts + type align_error = | Invalid_align (* An invalid align cell *) | Not_align (* Not an align cell *) @@ -74,28 +84,18 @@ | _ :: _, [] -> sequence align | _ -> Error Not_align - let to_paragraph : - Ast.inline_element Loc.with_location list -> - Ast.nestable_block_element Loc.with_location = - fun words -> - let span = Loc.span @@ List.map Loc.location words in - Loc.at span (`Paragraph words) - (* Merges inline elements within a cell into a single paragraph element, and tags cells w/ tag *) let merged_tagged_row tag : 'a Loc.with_location list list -> 'b = - List.map (fun elts -> ([ to_paragraph elts ], tag)) + List.map (fun elts -> ([ paragraph elts ], tag)) let as_data = merged_tagged_row `Data let as_header = merged_tagged_row `Header let is_valid_align row = Result.is_ok @@ valid_align_row row - -(* - + (* - If the first row is the alignment row then the rest should be data - Otherwise the first should be the headers, the second align, and the rest data - If there's only one row and it's not the align row, then it's data -*) - + *) let construct_table : span:Loc.span -> Ast.inline_element Loc.with_location list list list -> @@ -147,16 +147,6 @@ let split_replacement_media Loc.{ location; value = media, target, content } = (Loc.at location media, target, content) - let trim_start = function - | Loc.{value = `Space _; _ } :: xs -> xs - | xs -> xs - - let paragraph : Ast.inline_element Loc.with_location list -> Ast.nestable_block_element Loc.with_location = - fun elts -> - let elts = trim_start elts in - let span = Loc.span @@ List.map Loc.location elts in - Loc.at span @@ `Paragraph elts - let rec inline_element_inner : Ast.inline_element -> string = function | `Space s -> s | `Word s -> s @@ -325,6 +315,31 @@ let toplevel_error := let node = (Loc.same as_text @@ `Paragraph [ as_text ]) in Writer.with_warning node warning } + | err = located(list_opening); horizontal_whitespace; children = sequence_nonempty(inline_element(horizontal_whitespace)); endpos = located(RIGHT_BRACE)?; { + let endloc = Option.value ~default:(Writer.unwrap children |> List.rev |> List.hd |> Loc.map @@ Fun.const ()) endpos in + let span = Loc.delimited err endloc in + let not_allowed = Writer.Warning ( + let what = Tokens.describe err.Loc.value in + let in_what = "top-level text" in + let suggestion = + Printf.sprintf + "Move %s into %s or %s" + what + (Tokens.describe @@ List Ordered) + (Tokens.describe @@ List Unordered) + in + Parse_error.not_allowed ~what ~in_what ~suggestion span) + in + let unclosed = Writer.Warning ( + Parse_error.unclosed_bracket ~bracket:(Tokens.print err.Loc.value) span) + in + let* children = children in + let inner = { (paragraph children :> Ast.block_element Loc.with_location) with location = span } in + let m = Writer.with_warning inner not_allowed in + if Option.is_some endpos then m + else + Writer.warning unclosed m + } | errloc = position(BAR); whitespace?; { let span = Loc.of_position errloc in let warning = @@ -657,11 +672,10 @@ let style := (* LINKS + REFS *) let reference := - | ref_body = located(Simple_ref); children = sequence(inline_element(whitespace)); { - let* children = children in + | ref_body = located(Simple_ref); { let Loc.{ value = Tokens.{ inner; start }; location } = ref_body in - let span = Loc.span @@ { location with start } :: List.map Loc.location children in - return @@ Loc.at span @@ `Reference (`Simple, Loc.at location inner, trim_start children) + let span = { location with start } in + return @@ Loc.at span @@ `Reference (`Simple, Loc.at location inner, []) } | ref_body = Ref_with_replacement; children = sequence_nonempty(inline_element(whitespace)); endpos = located(RIGHT_BRACE); { let* children = children in @@ -692,13 +706,13 @@ let reference := } let link := - | content = Simple_link; endpos = located(RIGHT_BRACE); { - let Tokens.{ inner; start } = content in - let span = { endpos.Loc.location with start } in + | content = located(Simple_link); { + let Loc.{ value = Tokens.{ inner; start }; location } = content in + let span = { location with start } in let node = Loc.at span @@ `Link (inner, []) in let url = String.trim inner in if "" = url then - let what = Tokens.describe @@ Simple_link content in + let what = Tokens.describe @@ Simple_link content.Loc.value in let warning = Writer.Warning (Parse_error.should_not_be_empty ~what span) in @@ -767,15 +781,19 @@ let list_light := return inner } +let list_opening := + | LI; { Tokens.LI } + | DASH; { Tokens.DASH } + let item_heavy := - | startpos = located(LI); whitespace; items = sequence(nestable_block_element); endpos = located(RIGHT_BRACE); whitespace?; { - let span = Loc.delimited startpos endpos in + | start_pos = located(list_opening); whitespace; items = sequence(nestable_block_element); RIGHT_BRACE; any_whitespace*; { + let Loc.{ value = token; location } = start_pos in let warning = - Writer.Warning (Parse_error.should_not_be_empty ~what:(Tokens.describe LI) span) + Writer.Warning (Parse_error.should_not_be_empty ~what:(Tokens.describe token) location) in - Writer.ensure not_empty warning items + Writer.ensure not_empty warning items } - | startpos = located(LI); items = sequence(nestable_block_element); endpos = located(RIGHT_BRACE); whitespace?; { + | startpos = located(list_opening); items = sequence(nestable_block_element); endpos = located(RIGHT_BRACE); any_whitespace*; { let span = Loc.delimited startpos endpos in let should_be_followed_by_whitespace = Writer.Warning (Parse_error.should_be_followed_by_whitespace ~what:(Tokens.describe LI) span) @@ -786,14 +804,7 @@ let item_heavy := Writer.ensure not_empty should_not_be_empty items |> Writer.warning should_be_followed_by_whitespace } - | startpos = located(DASH); whitespace?; items = sequence(nestable_block_element); endpos = located(RIGHT_BRACE); whitespace?; { - let warning = - let span = Loc.delimited startpos endpos in - Writer.Warning (Parse_error.should_not_be_empty ~what:(Tokens.describe LI) span) - in - Writer.ensure not_empty warning items - } - | startpos = located(DASH); whitespace?; items = sequence_nonempty(nestable_block_element)?; endpos = located(END); { + | startpos = located(DASH); items = sequence_nonempty(nestable_block_element)?; endpos = located(END); { let end_not_allowed = Writer.Warning (Parse_error.end_not_allowed ~in_what:(Tokens.describe DASH) endpos.Loc.location) in @@ -915,10 +926,31 @@ let table_heavy := (* LIGHT TABLE *) -let cell_content_light := ~ = sequence_nonempty(inline_element(horizontal_whitespace)); <> +let cell_inner := + | ~ = inline_element(horizontal_whitespace); <> + | (start_pos, end_pos) = position(error); { + let span = Loc.of_position (start_pos, end_pos) in + let illegal = Writer.InputNeeded (fun input -> + let text_span = Loc.extract ~start_pos ~end_pos ~input in + Parse_error.illegal ~in_what:(Tokens.describe TABLE_LIGHT) text_span span) + in + + (* NOTE: this is the best we can do right now, accepting a `nestable_block_element` + for example, causes a reduce/reduce conflict. So we have to lose some + information via the `error` keyword and return an empty word. + Maybe if we refactored the way that block elements/tags/sections handle + whitespace we could remove the conflict while still being to match on those + elements + *) + Writer.with_warning (Loc.at span @@ `Word "") illegal + } + +let cell_content_light := ~ = sequence_nonempty(cell_inner); <> + let cell := | ~ = cell_content_light; <> | ~ = cell_content_light; BAR; <> + | BAR; { return [] } let cells := BAR?; ~ = sequence_nonempty(cell); <> From 34b01ccd80b0339b8c51f37ea1fff7e7ab35dd08 Mon Sep 17 00:00:00 2001 From: faycarsons Date: Thu, 9 Jan 2025 16:05:18 -0500 Subject: [PATCH 124/150] Cleanup, break helper functions out into `parser_aux.ml` --- src/parser/TODO.md | 31 - src/parser/parser.mly | 179 +---- src/parser/parser_aux.ml | 171 +++++ src/parser/syntax.ml | 1532 -------------------------------------- src/parser/syntax.mli | 8 - 5 files changed, 176 insertions(+), 1745 deletions(-) delete mode 100644 src/parser/TODO.md create mode 100644 src/parser/parser_aux.ml delete mode 100644 src/parser/syntax.ml delete mode 100644 src/parser/syntax.mli diff --git a/src/parser/TODO.md b/src/parser/TODO.md deleted file mode 100644 index 82ad4abcd3..0000000000 --- a/src/parser/TODO.md +++ /dev/null @@ -1,31 +0,0 @@ -- Leading white-space on tags - - Tests expect tag bodies (for i.e. "@version" or "@canonical", Lexer line 610) to have no - leading white-space. Fiddling with the lexer's rules to fix this seems to - break its ability to match on these rules correctly. - - Worst case we trim the start of the tag body -- Locations - - Not sure what the problem is here, not specifically at least - - The lexer has a, maybe unnecessarily, complex location handling system - with lots of curried functions, which was written with the intention of - wrapping tokens in `Loc.with_location` - - Do we rewrite it from scratch? (~week of work) - - Or fix it as it is now - - Or! move location adjustment to parser - - This is the approach I'm taking. Everywhere the parser uses the - `Inline_element` and `nestable_block_element` rules, they're wrapped in - a location, so we can instead emit them wrapped in a location initially - instead of afterwards with a higher-order rule. - - This looks like getting the location of opening and closing delimiters - for delimited elements, and setting their location to the span between - those two points -- Code blocks - - Code blocks do not work now. The lexer relies on cooperation from the parser - which is not possible with Menhir. - - - Our only option is to break the `Code_block` token up in multiple tokens, - the question is how: - - Do we split it into `Code_block` and `Code_block_w_output`? - - Or into its delimiters, i.e. `RIGHT_CODE_BLOCK`, `RIGHT_BRACKET`, - `CODE_BLOCK_META` etc? - - Seeing as the AST expects a string for its content I can see a benefit - in parsing said content within the lexer diff --git a/src/parser/parser.mly b/src/parser/parser.mly index 29e1ffddaf..77f0fbd5b6 100644 --- a/src/parser/parser.mly +++ b/src/parser/parser.mly @@ -1,175 +1,6 @@ %{ + open Parser_aux open Writer.Prelude - let wrap_location : Lexing.position * Lexing.position -> 'a -> 'a Loc.with_location = - fun pos value -> - let location = Loc.of_position pos in - { location; value } - - let not_empty : 'a list -> bool = function _ :: _ -> true | _ -> false - let has_content : string -> bool = fun s -> String.length s > 0 - - let trim_start = function - | Loc.{value = `Space _; _ } :: xs -> xs - | xs -> xs - - let paragraph : Ast.inline_element Loc.with_location list -> Ast.nestable_block_element Loc.with_location = - fun elts -> - let elts = trim_start elts in - let span = Loc.span @@ List.map Loc.location elts in - Loc.at span @@ `Paragraph elts - - type align_error = - | Invalid_align (* An invalid align cell *) - | Not_align (* Not an align cell *) - - (* This could be made a bit more specific by allowing Space elements only at - the beginning and end *) - let valid_elements (cell : Ast.inline_element list) : string option = - let rec go acc = function - | `Word _ :: _ when Option.is_some acc -> None - | `Word word :: rest -> go (Some word) rest - | `Space _ :: rest -> go acc rest - | _ :: _ -> None - | [] -> acc - in - go None cell - - let valid_word word = - match String.length word with - | 0 -> Ok None - | 1 -> ( - match word.[0] with - | ':' -> Ok (Some `Center) - | '-' -> Ok None - | _ -> Error Not_align) - | len -> - if String.for_all (Char.equal '-') (String.sub word 1 (len - 2)) then - match (word.[0], word.[pred len]) with - | ':', '-' -> Ok (Some `Left) - | '-', ':' -> Ok (Some `Right) - | ':', ':' -> Ok (Some `Center) - | '-', '-' -> Ok None - | _ -> Error Invalid_align - else Error Not_align - - let valid_align_cell (cell : Ast.inline_element Loc.with_location list) = - match List.map Loc.value cell |> valid_elements with - | Some word -> valid_word word - | None -> Error Not_align - - let sequence : ('elt, 'err) result list -> ('elt list, 'err) result = - fun list -> - let rec go acc : ('elt, 'err) result list -> ('elt list, 'err) result = - function - | Ok x :: xs -> go (x :: acc) xs - | Error err :: _ -> Error err - | [] -> Ok (List.rev acc) - in - go [] list - - (* NOTE: (@FayCarsons) - When we get something that doesn't look like an align at all, we check to see if we've gotten - any valid aligns, if so we assume that the cell being considered is supposed to be an align and treat it as an error, - otherwise we assume the row is not supposed to be an align row *) - let valid_align_row (row : Ast.inline_element Loc.with_location list list) : - (Ast.alignment option list, align_error) result = - let align, not_align = - List.map valid_align_cell row - |> List.partition (function - | Ok _ | Error Invalid_align -> true - | _ -> false) - in - match (align, not_align) with - | _ :: _, _ :: _ -> Error Invalid_align - | _ :: _, [] -> sequence align - | _ -> Error Not_align - - (* Merges inline elements within a cell into a single paragraph element, and tags cells w/ tag *) - let merged_tagged_row tag : 'a Loc.with_location list list -> 'b = - List.map (fun elts -> ([ paragraph elts ], tag)) - let as_data = merged_tagged_row `Data - let as_header = merged_tagged_row `Header - - let is_valid_align row = Result.is_ok @@ valid_align_row row - (* - - If the first row is the alignment row then the rest should be data - - Otherwise the first should be the headers, the second align, and the rest data - - If there's only one row and it's not the align row, then it's data - *) - let construct_table : - span:Loc.span -> - Ast.inline_element Loc.with_location list list list -> - Ast.nestable_block_element Loc.with_location = - fun ~span grid -> - match grid with - | [only_row] -> ( - match valid_align_row only_row with - | Ok align -> - Loc.at span @@ `Table (([[]], Some align), `Light) - | _ -> - Loc.at span @@ `Table (([as_data only_row], None), `Light)) - | align :: data when is_valid_align align -> - let align = Result.get_ok @@ valid_align_row align in - Loc.at span @@ `Table ( (List.map as_data data, Some align) , `Light) - | header :: align :: data when is_valid_align align -> - let align = Result.get_ok @@ valid_align_row align in - Loc.at span @@ `Table ((as_header header :: List.map as_data data, Some align), `Light) - | data -> Loc.at span @@ `Table ((List.map as_data data, None), `Light) - - let unclosed_table - ?(data : - Ast.inline_element Loc.with_location list list list Writer.t option) - ~span - warning : Ast.nestable_block_element Loc.with_location Writer.t = - let node = - match data with - | Some data -> - Writer.map - (fun data -> Loc.at span @@ `Table ((List.map as_data data, None), `Light)) - data - | None -> - let inner = Loc.at span @@ `Table (([], None), `Light) in - return inner - in - Writer.warning warning node - - let media_kind_of_target = - let open Tokens in - function Audio -> `Audio | Video -> `Video | Image -> `Image - - let href_of_media = - let open Tokens in - function Reference name -> `Reference name | Link uri -> `Link uri - - let split_simple_media Loc.{ location; value = media, target } = - (Loc.at location media, target) - - let split_replacement_media Loc.{ location; value = media, target, content } = - (Loc.at location media, target, content) - - let rec inline_element_inner : Ast.inline_element -> string = function - | `Space s -> s - | `Word s -> s - | `Styled (_, s) -> children s - | `Code_span s -> s - | `Raw_markup (_, s) -> s - | `Reference (_, _, s) -> children s - | `Link (_, s) -> children s - | `Math_span s -> s - and children s = - List.fold_left - (fun acc elt -> acc ^ inline_element_inner (Loc.value elt)) - "" - s - -let legal_module_list : Ast.inline_element Loc.with_location list -> bool = - fun xs -> - not_empty xs - && List.for_all - (function - | `Word _ | `Space _ -> true - | _ -> false) - @@ List.map Loc.value xs %} %token RIGHT_BRACE "{" @@ -301,7 +132,7 @@ let toplevel_error := let what = Tokens.describe PLUS in Writer.Warning (Parse_error.bad_markup what span) in - let as_text = Loc.at span @@ `Word "{" in + let as_text = Loc.at span @@ `Word "+" in let node = (Loc.same as_text @@ `Paragraph [ as_text ]) in Writer.with_warning node warning } @@ -311,7 +142,7 @@ let toplevel_error := let what = Tokens.describe MINUS in Writer.Warning (Parse_error.bad_markup what span) in - let as_text = Loc.at span @@ `Word "{" in + let as_text = Loc.at span @@ `Word "-" in let node = (Loc.same as_text @@ `Paragraph [ as_text ]) in Writer.with_warning node warning } @@ -346,7 +177,7 @@ let toplevel_error := let what = Tokens.describe BAR in Writer.Warning (Parse_error.bad_markup what span) in - let as_text = Loc.at span @@ `Word "{" in + let as_text = Loc.at span @@ `Word "|" in let node = (Loc.same as_text @@ `Paragraph [ as_text ]) in Writer.with_warning node warning } @@ -356,7 +187,7 @@ let toplevel_error := let what = Tokens.describe RIGHT_BRACE in Writer.Warning (Parse_error.bad_markup what span) in - let as_text = Loc.at span @@ `Word "{" in + let as_text = Loc.at span @@ `Word "}" in let node = (Loc.same as_text @@ `Paragraph [ as_text ]) in Writer.with_warning node warning } diff --git a/src/parser/parser_aux.ml b/src/parser/parser_aux.ml new file mode 100644 index 0000000000..e1cdd09ead --- /dev/null +++ b/src/parser/parser_aux.ml @@ -0,0 +1,171 @@ +let wrap_location : + Lexing.position * Lexing.position -> 'a -> 'a Loc.with_location = + fun pos value -> + let location = Loc.of_position pos in + { location; value } + +let not_empty : 'a list -> bool = function _ :: _ -> true | _ -> false +let has_content : string -> bool = fun s -> String.length s > 0 + +let trim_start = function Loc.{ value = `Space _; _ } :: xs -> xs | xs -> xs + +(* Wrap a list of `inline_element` in a `Paragraph *) +let paragraph : + Ast.inline_element Loc.with_location list -> + Ast.nestable_block_element Loc.with_location = + fun elts -> + let span = Loc.span @@ List.map Loc.location elts in + let elts = trim_start elts in + Loc.at span @@ `Paragraph elts + +type align_error = + | Invalid_align (* An invalid align cell *) + | Not_align (* Not an align cell *) + +(* This could be made a bit more specific by allowing Space elements only at + the beginning and end *) +let valid_elements (cell : Ast.inline_element list) : string option = + let rec go acc = function + | `Word _ :: _ when Option.is_some acc -> None + | `Word word :: rest -> go (Some word) rest + | `Space _ :: rest -> go acc rest + | _ :: _ -> None + | [] -> acc + in + go None cell + +let valid_word word = + match String.length word with + | 0 -> Ok None + | 1 -> ( + match word.[0] with + | ':' -> Ok (Some `Center) + | '-' -> Ok None + | _ -> Error Not_align) + | len -> + if String.for_all (Char.equal '-') (String.sub word 1 (len - 2)) then + match (word.[0], word.[pred len]) with + | ':', '-' -> Ok (Some `Left) + | '-', ':' -> Ok (Some `Right) + | ':', ':' -> Ok (Some `Center) + | '-', '-' -> Ok None + | _ -> Error Invalid_align + else Error Not_align + +let valid_align_cell (cell : Ast.inline_element Loc.with_location list) = + match List.map Loc.value cell |> valid_elements with + | Some word -> valid_word word + | None -> Error Not_align + +let sequence : ('elt, 'err) result list -> ('elt list, 'err) result = + fun list -> + let rec go acc : ('elt, 'err) result list -> ('elt list, 'err) result = + function + | Ok x :: xs -> go (x :: acc) xs + | Error err :: _ -> Error err + | [] -> Ok (List.rev acc) + in + go [] list + +(* NOTE: (@FayCarsons) + When we get something that doesn't look like an align at all, we check to see if we've gotten + any valid aligns, if so we assume that the cell being considered is supposed to be an align and treat it as an error, + otherwise we assume the row is not supposed to be an align row + This is somewhat error prone, using i.e. '--' to denote an empty table cell seems reasonable +*) +let valid_align_row (row : Ast.inline_element Loc.with_location list list) : + (Ast.alignment option list, align_error) result = + let align, not_align = + List.map valid_align_cell row + |> List.partition (function + | Ok _ | Error Invalid_align -> true + | _ -> false) + in + match (align, not_align) with + | _ :: _, _ :: _ -> Error Invalid_align + | _ :: _, [] -> sequence align + | _ -> Error Not_align + +(* Merges inline elements within a cell into a single paragraph element, and tags cells w/ tag *) +let merged_tagged_row tag : 'a Loc.with_location list list -> 'b = + List.map (fun elts -> ([ paragraph elts ], tag)) +let as_data = merged_tagged_row `Data +let as_header = merged_tagged_row `Header + +let is_valid_align row = Result.is_ok @@ valid_align_row row + +(* + - If the first row is the alignment row then the rest should be data + - Otherwise the first should be the headers, the second align, and the rest data + - If there's only one row and it's not the align row, then it's data + *) +let construct_table : + span:Loc.span -> + Ast.inline_element Loc.with_location list list list -> + Ast.nestable_block_element Loc.with_location = + fun ~span grid -> + match grid with + | [ only_row ] -> ( + match valid_align_row only_row with + | Ok align -> Loc.at span @@ `Table (([ [] ], Some align), `Light) + | _ -> Loc.at span @@ `Table (([ as_data only_row ], None), `Light)) + | align :: data when is_valid_align align -> + let align = Result.get_ok @@ valid_align_row align in + Loc.at span @@ `Table ((List.map as_data data, Some align), `Light) + | header :: align :: data when is_valid_align align -> + let align = Result.get_ok @@ valid_align_row align in + Loc.at span + @@ `Table ((as_header header :: List.map as_data data, Some align), `Light) + | data -> Loc.at span @@ `Table ((List.map as_data data, None), `Light) + +let unclosed_table + ?(data : + Ast.inline_element Loc.with_location list list list Writer.t option) + ~span warning : Ast.nestable_block_element Loc.with_location Writer.t = + let node = + match data with + | Some data -> + Writer.map + (fun data -> + Loc.at span @@ `Table ((List.map as_data data, None), `Light)) + data + | None -> + let inner = Loc.at span @@ `Table (([], None), `Light) in + Writer.return inner + in + Writer.warning warning node + +let media_kind_of_target = + let open Tokens in + function Audio -> `Audio | Video -> `Video | Image -> `Image + +let href_of_media = + let open Tokens in + function Reference name -> `Reference name | Link uri -> `Link uri + +let split_simple_media Loc.{ location; value = media, target } = + (Loc.at location media, target) + +let split_replacement_media Loc.{ location; value = media, target, content } = + (Loc.at location media, target, content) + +let rec inline_element_inner : Ast.inline_element -> string = function + | `Space s -> s + | `Word s -> s + | `Styled (_, s) -> children s + | `Code_span s -> s + | `Raw_markup (_, s) -> s + | `Reference (_, _, s) -> children s + | `Link (_, s) -> children s + | `Math_span s -> s + +and children s = + List.fold_left + (fun acc elt -> acc ^ inline_element_inner (Loc.value elt)) + "" s + +let legal_module_list : Ast.inline_element Loc.with_location list -> bool = + fun xs -> + not_empty xs + && List.for_all (function `Word _ | `Space _ -> true | _ -> false) + @@ List.map Loc.value xs diff --git a/src/parser/syntax.ml b/src/parser/syntax.ml deleted file mode 100644 index a49feddb5e..0000000000 --- a/src/parser/syntax.ml +++ /dev/null @@ -1,1532 +0,0 @@ -(* - - -(* This module is a recursive descent parser for the ocamldoc syntax. The parser - consumes a token stream of type [Token.t Stream.t], provided by the lexer, - and produces a comment AST of the type defined in [Parser_.Ast]. - - The AST has two main levels: inline elements, which can appear inside - paragraphs, and are spaced horizontally when presented, and block elements, - such as paragraphs and lists, which are spaced vertically when presented. - Block elements contain inline elements, but not vice versa. - - Corresponding to this, the parser has three "main" functions: - - - [delimited_inline_element_list] parses a run of inline elements that is - delimited by curly brace markup ([{...}]). - - [paragraph] parses a run of inline elements that make up a paragraph, and - is not explicitly delimited with curly braces. - - [block_element_list] parses a sequence of block elements. A comment is a - sequence of block elements, so [block_element_list] is the top-level - parser. It is also used for list item and tag content. *) - -open! Compat - -type 'a with_location = 'a Loc.with_location - -(* {2 Input} *) - -type input = { - tokens : Token.t Loc.with_location Stream.t; - warnings : Warning.t list ref; -} - -(* {2 Output} *) - -let add_warning input warning = input.warnings := warning :: !(input.warnings) -let junk input = Stream.junk input.tokens - -let peek input = - match Stream.peek input.tokens with - | Some token -> token - | None -> assert false - -module Table = struct - module Light_syntax = struct - let valid_align = function - | [ { Loc.value = `Word w; _ } ] -> ( - match String.length w with - | 0 -> `Valid None - | 1 -> ( - match w with - | "-" -> `Valid None - | ":" -> `Valid (Some `Center) - | _ -> `Invalid) - | len -> - if String.for_all (Char.equal '-') (String.sub w 1 (len - 2)) then - match (String.get w 0, String.get w (len - 1)) with - | ':', ':' -> `Valid (Some `Center) - | ':', '-' -> `Valid (Some `Left) - | '-', ':' -> `Valid (Some `Right) - | '-', '-' -> `Valid None - | _ -> `Invalid - else `Invalid) - | _ -> `Invalid - - let valid_align_row lx = - let rec loop acc = function - | [] -> Some (List.rev acc) - | x :: q -> ( - match valid_align x with - | `Invalid -> None - | `Valid alignment -> loop (alignment :: acc) q) - in - loop [] lx - - let create ~grid ~align : Ast.table = - let cell_to_block (x, k) = - let whole_loc = Loc.span (List.map (fun x -> x.Loc.location) x) in - match x with - | [] -> ([], k) - | _ -> ([ Loc.at whole_loc (`Paragraph x) ], k) - in - let row_to_block = List.map cell_to_block in - let grid_to_block = List.map row_to_block in - ((grid_to_block grid, align), `Light) - - let with_kind kind : 'a with_location list list -> 'a Ast.row = - List.map (fun c -> (c, kind)) - - let from_raw_data grid : Ast.table = - match grid with - | [] -> create ~grid:[] ~align:None - | row1 :: rows2_N -> ( - match valid_align_row row1 with - (* If the first line is the align row, everything else is data. *) - | Some _ as align -> - create ~grid:(List.map (with_kind `Data) rows2_N) ~align - | None -> ( - match rows2_N with - (* Only 1 line, if this is not the align row this is data. *) - | [] -> create ~grid:[ with_kind `Data row1 ] ~align:None - | row2 :: rows3_N -> ( - match valid_align_row row2 with - (* If the second line is the align row, the first one is the - header and the rest is data. *) - | Some _ as align -> - let header = with_kind `Header row1 in - let data = List.map (with_kind `Data) rows3_N in - create ~grid:(header :: data) ~align - (* No align row in the first 2 lines, everything is considered - data. *) - | None -> - create ~grid:(List.map (with_kind `Data) grid) ~align:None - ))) - end - - module Heavy_syntax = struct - let create ~grid : Ast.table = ((grid, None), `Heavy) - let from_grid grid : Ast.table = create ~grid - end -end - -module Reader = struct - let until_rbrace_or_eof input acc = - let rec consume () = - let next_token = peek input in - match next_token.value with - | `Right_brace -> - junk input; - `End (acc, next_token.location) - | `End -> - Parse_error.end_not_allowed next_token.location ~in_what:"table" - |> add_warning input; - junk input; - `End (acc, next_token.location) - | `Space _ | `Single_newline _ | `Blank_line _ -> - junk input; - consume () - | _ -> `Token next_token - in - consume () - - module Infix = struct - let ( >>> ) consume if_token = - match consume with - | `End (ret, loc) -> (ret, loc) - | `Token t -> if_token t - end -end - -open Reader.Infix - -(* The last token in the stream is always [`End], and it is never consumed by - the parser, so the [None] case is impossible. *) - -let npeek n input = Stream.npeek n input.tokens - -(* {2 Non-link inline elements} *) -type style = [ `Bold | `Italic | `Emphasis | `Superscript | `Subscript ] - -(* Convenient abbreviation for use in patterns. *) -type token_that_always_begins_an_inline_element = - [ `Word of string - | `Code_span of string - | `Raw_markup of string option * string - | `Begin_style of style - | `Simple_reference of string - | `Begin_reference_with_replacement_text of string - | `Simple_link of string - | `Begin_link_with_replacement_text of string - | `Math_span of string ] - -(* Check that the token constructors above actually are all in [Token.t]. *) -let _check_subset : token_that_always_begins_an_inline_element -> Token.t = - fun t -> (t :> Token.t) - -(* Consumes tokens that make up a single non-link inline element: - - - a horizontal space ([`Space], significant in inline elements), - - a word (see [word]), - - a code span ([...], [`Code_span _]), or - - styled text ({e ...}). - - The latter requires a recursive call to [delimited_inline_element_list], - defined below. - - This should be part of [delimited_inline_element_list]; however, it is also - called by function [paragraph]. As a result, it is factored out, and made - mutually-recursive with [delimited_inline_element_list]. - - This is called only when it is known that the first token in the list is the - beginning of an inline element. In the case of [`Minus] and [`Plus], that - means the caller has determined that they are not a list bullet (i.e., not - the first non-whitespace tokens on their line). - - This function consumes exactly the tokens that make up the element. *) -let rec inline_element : - input -> Loc.span -> _ -> Ast.inline_element with_location = - fun input location next_token -> - match next_token with - | `Space _ as token -> - junk input; - Loc.at location token - | `Word _ as token -> - junk input; - Loc.at location token - (* This is actually the same memory representation as the token, complete - with location, and is probably the most common case. Perhaps the token - can be reused somehow. The same is true of [`Space], [`Code_span]. *) - | `Minus -> - junk input; - Loc.at location (`Word "-") - | `Plus -> - junk input; - Loc.at location (`Word "+") - | `Bar -> - junk input; - Loc.at location (`Word "|") - | (`Code_span _ | `Math_span _ | `Raw_markup _) as token -> - junk input; - Loc.at location token - | `Begin_style s as parent_markup -> - junk input; - - let requires_leading_whitespace = - match s with - | `Bold | `Italic | `Emphasis -> true - | `Superscript | `Subscript -> false - in - let content, brace_location = - delimited_inline_element_list ~parent_markup - ~parent_markup_location:location ~requires_leading_whitespace input - in - - let location = Loc.span [ location; brace_location ] in - - if content = [] then - Parse_error.should_not_be_empty - ~what:(Token.describe parent_markup) - location - |> add_warning input; - - Loc.at location (`Styled (s, content)) - | `Simple_reference r -> - junk input; - - let r_location = Loc.nudge_start (String.length "{!") location in - let r = Loc.at r_location r in - - Loc.at location (`Reference (`Simple, r, [])) - | `Begin_reference_with_replacement_text r as parent_markup -> - junk input; - - let r_location = Loc.nudge_start (String.length "{{!") location in - let r = Loc.at r_location r in - - let content, brace_location = - delimited_inline_element_list ~parent_markup - ~parent_markup_location:location ~requires_leading_whitespace:false - input - in - - let location = Loc.span [ location; brace_location ] in - - if content = [] then - Parse_error.should_not_be_empty - ~what:(Token.describe parent_markup) - location - |> add_warning input; - - Loc.at location (`Reference (`With_text, r, content)) - | `Simple_link u -> - junk input; - - let u = String.trim u in - - if u = "" then - Parse_error.should_not_be_empty - ~what:(Token.describe next_token) - location - |> add_warning input; - - Loc.at location (`Link (u, [])) - | `Begin_link_with_replacement_text u as parent_markup -> - junk input; - - let u = String.trim u in - - if u = "" then - Parse_error.should_not_be_empty - ~what:(Token.describe parent_markup) - location - |> add_warning input; - - let content, brace_location = - delimited_inline_element_list ~parent_markup - ~parent_markup_location:location ~requires_leading_whitespace:false - input - in - - `Link (u, content) |> Loc.at (Loc.span [ location; brace_location ]) - -(* Consumes tokens that make up a sequence of inline elements that is ended by - a '}', a [`Right_brace] token. The brace token is also consumed. - - The sequences are also preceded by some markup like '{b'. Some of these - markup tokens require whitespace immediately after the token, and others not. - The caller indicates which way that is through the - [~requires_leading_whitespace] argument. - - Whitespace is significant in inline element lists. In particular, "foo [bar]" - is represented as [`Word "foo"; `Space; `Code_span "bar"], while "foo[bar]" - is [`Word "foo"; `Code_span "bar"]. It doesn't matter how much whitespace is - there, just whether it is present or not. Single newlines and horizontal - space in any amount are allowed. Blank lines are not, as these are separators - for {e block} elements. - - In correct input, the first and last elements emitted will not be [`Space], - i.e. [`Space] appears only between other non-link inline elements. In - incorrect input, there might be [`Space] followed immediately by something - like an @author tag. - - The [~parent_markup] and [~parent_markup_location] arguments are used for - generating error messages. *) -and delimited_inline_element_list : - parent_markup:[< Token.t ] -> - parent_markup_location:Loc.span -> - requires_leading_whitespace:bool -> - input -> - Ast.inline_element with_location list * Loc.span = - fun ~parent_markup ~parent_markup_location ~requires_leading_whitespace input -> - (* [~at_start_of_line] is used to interpret [`Minus] and [`Plus]. These are - word tokens if not the first non-whitespace tokens on their line. Then, - they are allowed in a non-link element list. *) - let rec consume_elements : - at_start_of_line:bool -> - Ast.inline_element with_location list -> - Ast.inline_element with_location list * Loc.span = - fun ~at_start_of_line acc -> - let next_token = peek input in - match next_token.value with - | `Right_brace -> - junk input; - (List.rev acc, next_token.location) - (* The [`Space] token is not space at the beginning or end of line, because - that is combined into [`Single_newline] or [`Blank_line] tokens. It is - also not at the beginning of markup (after e.g. '{b'), because that is - handled separately before calling - [consume_non_link_inline_elements], and not immediately before '}', - because that is combined into the [`Right_brace] token by the lexer. So, - it is an internal space, and we want to add it to the non-link inline - element list. *) - | (`Space _ | #token_that_always_begins_an_inline_element) as token -> - let acc = inline_element input next_token.location token :: acc in - consume_elements ~at_start_of_line:false acc - | `Single_newline ws -> - junk input; - let element = Loc.same next_token (`Space ws) in - consume_elements ~at_start_of_line:true (element :: acc) - | `Blank_line ws as blank -> - Parse_error.not_allowed ~what:(Token.describe blank) - ~in_what:(Token.describe parent_markup) - next_token.location - |> add_warning input; - - junk input; - let element = Loc.same next_token (`Space ws) in - consume_elements ~at_start_of_line:true (element :: acc) - | `Bar as token -> - let acc = inline_element input next_token.location token :: acc in - consume_elements ~at_start_of_line:false acc - | (`Minus | `Plus) as bullet -> - (if at_start_of_line then - let suggestion = - Printf.sprintf "move %s so it isn't the first thing on the line." - (Token.print bullet) - in - Parse_error.not_allowed ~what:(Token.describe bullet) - ~in_what:(Token.describe parent_markup) - ~suggestion next_token.location - |> add_warning input); - - let acc = inline_element input next_token.location bullet :: acc in - consume_elements ~at_start_of_line:false acc - | other_token -> - Parse_error.not_allowed - ~what:(Token.describe other_token) - ~in_what:(Token.describe parent_markup) - next_token.location - |> add_warning input; - - let last_location = - match acc with - | last_token :: _ -> last_token.location - | [] -> parent_markup_location - in - - (List.rev acc, last_location) - in - - let first_token = peek input in - match first_token.value with - | `Space _ -> - junk input; - consume_elements ~at_start_of_line:false [] - (* [~at_start_of_line] is [false] here because the preceding token was some - some markup like '{b', and we didn't move to the next line, so the next - token will not be the first non-whitespace token on its line. *) - | `Single_newline _ -> - junk input; - consume_elements ~at_start_of_line:true [] - | `Blank_line _ as blank -> - (* In case the markup is immediately followed by a blank line, the error - message printed by the catch-all case below can be confusing, as it will - suggest that the markup must be followed by a newline (which it is). It - just must not be followed by two newlines. To explain that clearly, - handle that case specifically. *) - Parse_error.not_allowed ~what:(Token.describe blank) - ~in_what:(Token.describe parent_markup) - first_token.location - |> add_warning input; - - junk input; - consume_elements ~at_start_of_line:true [] - | `Right_brace -> - junk input; - ([], first_token.location) - | _ -> - if requires_leading_whitespace then - Parse_error.should_be_followed_by_whitespace - ~what:(Token.print parent_markup) - parent_markup_location - |> add_warning input; - consume_elements ~at_start_of_line:false [] - -(* {2 Paragraphs} *) - -(* Consumes tokens that make up a paragraph. - - A paragraph is a sequence of inline elements that ends on a blank line, or - explicit block markup such as a verbatim block on a new line. - - Because of the significance of newlines, paragraphs are parsed line-by-line. - The function [paragraph] is called only when the current token is the first - non-whitespace token on its line, and begins an inline element. [paragraph] - then parses a line of inline elements. Afterwards, it looks ahead to the next - line. If that line also begins with an inline element, it parses that line, - and so on. *) -let paragraph : input -> Ast.nestable_block_element with_location = - fun input -> - (* Parses a single line of a paragraph, consisting of inline elements. The - only valid ways to end a paragraph line are with [`End], [`Single_newline], - [`Blank_line], and [`Right_brace]. Everything else either belongs in the - paragraph, or signifies an attempt to begin a block element inside a - paragraph line, which is an error. These errors are caught elsewhere; the - paragraph parser just stops. *) - let rec paragraph_line : - Ast.inline_element with_location list -> - Ast.inline_element with_location list = - fun acc -> - let next_token = peek input in - match next_token.value with - | ( `Space _ | `Minus | `Plus | `Bar - | #token_that_always_begins_an_inline_element ) as token -> - let element = inline_element input next_token.location token in - paragraph_line (element :: acc) - | _ -> acc - in - - (* After each line is parsed, decides whether to parse more lines. *) - let rec additional_lines : - Ast.inline_element with_location list -> - Ast.inline_element with_location list = - fun acc -> - match npeek 2 input with - | { value = `Single_newline ws; location } - :: { value = #token_that_always_begins_an_inline_element | `Bar; _ } - :: _ -> - junk input; - let acc = Loc.at location (`Space ws) :: acc in - let acc = paragraph_line acc in - additional_lines acc - | _ -> List.rev acc - in - - let elements = paragraph_line [] |> additional_lines in - `Paragraph elements |> Loc.at (Loc.span (List.map Loc.location elements)) - -(* {2 Block elements} *) - -(* {3 Helper types} *) - -(* The interpretation of tokens in the block parser depends on where on a line - each token appears. The six possible "locations" are: - - - [`At_start_of_line], when only whitespace has been read on the current - line. - - [`After_tag], when a valid tag token, such as [@deprecated], has been read, - and only whitespace has been read since. - - [`After_shorthand_bullet], when a valid shorthand list item bullet, such as - [-], has been read, and only whitespace has been read since. - - [`After_explicit_list_bullet], when a valid explicit bullet, such as [{li], - has been read, and only whitespace has been read since. - - [`After_table_cell], when a table cell opening markup ('{th' or '{td') has been read. - - [`After_text], when any other valid non-whitespace token has already been - read on the current line. - - Here are some examples of how this affects the interpretation of tokens: - - - A paragraph can start anywhere except [`After_text] (two paragraphs cannot - be on the same line, but paragraphs can be nested in just about anything). - - [`Minus] is interpreted as a list item bullet [`At_start_of_line], - [`After_tag], and [`After_explicit_list_bullet]. - - Tags are only allowed [`At_start_of_line]. - - To track the location accurately, the functions that make up the block parser - pass explicit [where_in_line] values around and return them. - - In a few cases, [where_in_line] can be inferred from what helper was called. - For example, the [paragraph] parser always stops on the same line as the last - significant token that is in the paragraph it consumed, so the location must - be [`After_text]. *) -type where_in_line = - [ `At_start_of_line - | `After_tag - | `After_shorthand_bullet - | `After_explicit_list_bullet - | `After_table_cell - | `After_text ] - -(* The block parsing loop, function [block_element_list], stops when it - encounters certain tokens. - - When it is called for the whole comment, or for in explicit list item - ([{li foo}]), it can only stop on end of input or a right brace. - - When it is called inside a shorthand list item ([- foo]), it stops on end of - input, right brace, a blank line (indicating end of shorthand list), plus or - minus (indicating the start of the next list item), or a section heading or - tag, which cannot be nested in list markup. - - The block parser [block_element_list] explicitly returns the token that - stopped it, with a type more precise than [Token.t stream_head]: if it was - called for the whole comment or an explicit list item, the stop token will - have type [stops_at_delimiters stream_head], and if it was called for a - shorthand list item, the stop token will have type - [implicit_stop stream_head]. This allows the calling parsers to write precise - cases for exactly the tokens that might be at the front of the stream after - the block parser returns. *) -type stops_at_delimiters = [ `End | `Right_brace ] -type code_stop = [ `End | `Right_code_delimiter ] - -type stopped_implicitly = - [ `End - | `Blank_line of string - | `Right_brace - | `Minus - | `Plus - | Token.section_heading - | Token.media_markup - | Token.tag ] - -(* Ensure that the above two types are really subsets of [Token.t]. *) -let _check_subset : stops_at_delimiters -> Token.t = fun t -> (t :> Token.t) -let _check_subset : stopped_implicitly -> Token.t = fun t -> (t :> Token.t) - -(* The different contexts in which the block parser [block_element_list] can be - called. The block parser's behavior depends somewhat on the context. For - example, while paragraphs are allowed anywhere, shorthand lists are not - allowed immediately inside other shorthand lists, while tags are not allowed - anywhere except at the comment top level. - - Besides telling the block parser how to behave, each context also carries two - types, which determine the return type of the block parser: - - - The type of blocks the parser returns. Note that [nestable_block_element] - is included in [block_element]. However, the extra block kinds in - [block_element] are only allowed at the comment top level. - - The type of token that the block parser stops at. See discussion above. *) -type ('block, 'stops_at_which_tokens) context = - | Top_level : (Ast.block_element, stops_at_delimiters) context - | In_shorthand_list : (Ast.nestable_block_element, stopped_implicitly) context - | In_explicit_list : (Ast.nestable_block_element, stops_at_delimiters) context - | In_table_cell : (Ast.nestable_block_element, stops_at_delimiters) context - | In_code_results : (Ast.nestable_block_element, code_stop) context - | In_tag : (Ast.nestable_block_element, Token.t) context - -(* This is a no-op. It is needed to prove to the type system that nestable block - elements are acceptable block elements in all contexts. *) -let accepted_in_all_contexts : - type block stops_at_which_tokens. - (block, stops_at_which_tokens) context -> - Ast.nestable_block_element -> - block = - fun context block -> - match context with - | Top_level -> (block :> Ast.block_element) - | In_shorthand_list -> block - | In_explicit_list -> block - | In_table_cell -> block - | In_code_results -> block - | In_tag -> block - -(* Converts a tag to a series of words. This is used in error recovery, when a - tag cannot be generated. *) -let tag_to_words = function - | `Author s -> [ `Word "@author"; `Space " "; `Word s ] - | `Before s -> [ `Word "@before"; `Space " "; `Word s ] - | `Canonical s -> [ `Word "@canonical"; `Space " "; `Word s ] - | `Deprecated -> [ `Word "@deprecated" ] - | `Inline -> [ `Word "@inline" ] - | `Open -> [ `Word "@open" ] - | `Closed -> [ `Word "@closed" ] - | `Hidden -> [ `Word "@hidden" ] - | `Param s -> [ `Word "@param"; `Space " "; `Word s ] - | `Raise s -> [ `Word "@raise"; `Space " "; `Word s ] - | `Return -> [ `Word "@return" ] - | `See (`Document, s) -> [ `Word "@see"; `Space " "; `Word ("\"" ^ s ^ "\"") ] - | `See (`File, s) -> [ `Word "@see"; `Space " "; `Word ("'" ^ s ^ "'") ] - | `See (`Url, s) -> [ `Word "@see"; `Space " "; `Word ("<" ^ s ^ ">") ] - | `Since s -> [ `Word "@since"; `Space " "; `Word s ] - | `Version s -> [ `Word "@version"; `Space " "; `Word s ] - -(* {3 Block element lists} *) - -(* Consumes tokens making up a sequence of block elements. These are: - - - paragraphs, - - code blocks, - - verbatim text blocks, - - tables, - - lists, and - - section headings. *) -let rec block_element_list : - type block stops_at_which_tokens. - (block, stops_at_which_tokens) context -> - parent_markup:[< Token.t | `Comment ] -> - input -> - block with_location list - * stops_at_which_tokens with_location - * where_in_line = - fun context ~parent_markup input -> - let rec consume_block_elements : - parsed_a_tag:bool -> - where_in_line -> - block with_location list -> - block with_location list - * stops_at_which_tokens with_location - * where_in_line = - fun ~parsed_a_tag where_in_line acc -> - let describe token = - match token with - | #token_that_always_begins_an_inline_element -> "paragraph" - | _ -> Token.describe token - in - - let warn_if_after_text { Loc.location; value = token } = - if where_in_line = `After_text then - Parse_error.should_begin_on_its_own_line ~what:(describe token) location - |> add_warning input - in - - let warn_if_after_tags { Loc.location; value = token } = - if parsed_a_tag then - let suggestion = - Printf.sprintf "move %s before any tags." (Token.describe token) - in - Parse_error.not_allowed ~what:(describe token) - ~in_what:"the tags section" ~suggestion location - |> add_warning input - in - - let warn_because_not_at_top_level { Loc.location; value = token } = - let suggestion = - Printf.sprintf "move %s outside of any other markup." - (Token.print token) - in - Parse_error.not_allowed ~what:(Token.describe token) - ~in_what:(Token.describe parent_markup) - ~suggestion location - |> add_warning input - in - - match peek input with - (* Terminators: the two tokens that terminate anything. *) - | { value = `End; _ } as next_token -> ( - match context with - | Top_level -> (List.rev acc, next_token, where_in_line) - | In_shorthand_list -> (List.rev acc, next_token, where_in_line) - | In_explicit_list -> (List.rev acc, next_token, where_in_line) - | In_tag -> (List.rev acc, next_token, where_in_line) - | In_table_cell -> (List.rev acc, next_token, where_in_line) - | In_code_results -> (List.rev acc, next_token, where_in_line)) - | { value = `Right_brace; _ } as next_token -> ( - (* This little absurdity is needed to satisfy the type system. Without it, - OCaml is unable to prove that [stream_head] has the right type for all - possible values of [context]. *) - match context with - | Top_level -> (List.rev acc, next_token, where_in_line) - | In_shorthand_list -> (List.rev acc, next_token, where_in_line) - | In_explicit_list -> (List.rev acc, next_token, where_in_line) - | In_table_cell -> (List.rev acc, next_token, where_in_line) - | In_tag -> (List.rev acc, next_token, where_in_line) - | In_code_results -> - junk input; - consume_block_elements ~parsed_a_tag where_in_line acc) - | { value = `Right_code_delimiter; _ } as next_token -> ( - match context with - | In_code_results -> (List.rev acc, next_token, where_in_line) - | _ -> - junk input; - consume_block_elements ~parsed_a_tag where_in_line acc) - (* Whitespace. This can terminate some kinds of block elements. It is also - necessary to track it to interpret [`Minus] and [`Plus] correctly, as - well as to ensure that all block elements begin on their own line. *) - | { value = `Space _; _ } -> - junk input; - consume_block_elements ~parsed_a_tag where_in_line acc - | { value = `Single_newline _; _ } -> - junk input; - consume_block_elements ~parsed_a_tag `At_start_of_line acc - | { value = `Blank_line _; _ } as next_token -> ( - match context with - (* Blank lines terminate shorthand lists ([- foo]). They also terminate - paragraphs, but the paragraph parser is aware of that internally. *) - | In_shorthand_list -> (List.rev acc, next_token, where_in_line) - (* Otherwise, blank lines are pretty much like single newlines. *) - | _ -> - junk input; - consume_block_elements ~parsed_a_tag `At_start_of_line acc) - (* Explicit list items ([{li ...}] and [{- ...}]) can never appear directly - in block content. They can only appear inside [{ul ...}] and [{ol ...}]. - So, catch those. *) - | { value = `Begin_list_item _ as token; location } -> - let suggestion = - Printf.sprintf "move %s into %s, or use %s." (Token.print token) - (Token.describe (`Begin_list `Unordered)) - (Token.describe `Minus) - in - Parse_error.not_allowed ~what:(Token.describe token) - ~in_what:(Token.describe parent_markup) - ~suggestion location - |> add_warning input; - - junk input; - consume_block_elements ~parsed_a_tag where_in_line acc - (* Table rows ([{tr ...}]) can never appear directly - in block content. They can only appear inside [{table ...}]. *) - | { value = `Begin_table_row as token; location } -> - let suggestion = - Printf.sprintf "move %s into %s." (Token.print token) - (Token.describe `Begin_table_heavy) - in - Parse_error.not_allowed ~what:(Token.describe token) - ~in_what:(Token.describe parent_markup) - ~suggestion location - |> add_warning input; - junk input; - consume_block_elements ~parsed_a_tag where_in_line acc - (* Table cells ([{th ...}] and [{td ...}]) can never appear directly - in block content. They can only appear inside [{tr ...}]. *) - | { value = `Begin_table_cell _ as token; location } -> - let suggestion = - Printf.sprintf "move %s into %s." (Token.print token) - (Token.describe `Begin_table_row) - in - Parse_error.not_allowed ~what:(Token.describe token) - ~in_what:(Token.describe parent_markup) - ~suggestion location - |> add_warning input; - junk input; - consume_block_elements ~parsed_a_tag where_in_line acc - (* Tags. These can appear at the top level only. Also, once one tag is seen, - the only top-level elements allowed are more tags. *) - | { value = `Tag tag as token; location } as next_token -> ( - let recover_when_not_at_top_level context = - warn_because_not_at_top_level next_token; - junk input; - let words = List.map (Loc.at location) (tag_to_words tag) in - let paragraph = - `Paragraph words - |> accepted_in_all_contexts context - |> Loc.at location - in - consume_block_elements ~parsed_a_tag `At_start_of_line - (paragraph :: acc) - in - - match context with - (* Tags cannot make sense in an explicit list ([{ul {li ...}}]). *) - | In_explicit_list -> recover_when_not_at_top_level context - (* If a tag starts at the beginning of a line, it terminates the preceding - tag and/or the current shorthand list. In this case, return to the - caller, and let the caller decide how to interpret the tag token. *) - | In_shorthand_list -> - if where_in_line = `At_start_of_line then - (List.rev acc, next_token, where_in_line) - else recover_when_not_at_top_level context - | In_table_cell -> recover_when_not_at_top_level context - | In_tag -> - if where_in_line = `At_start_of_line then - (List.rev acc, next_token, where_in_line) - else recover_when_not_at_top_level context - | In_code_results -> recover_when_not_at_top_level context - (* If this is the top-level call to [block_element_list], parse the - tag. *) - | Top_level -> ( - if where_in_line <> `At_start_of_line then - Parse_error.should_begin_on_its_own_line - ~what:(Token.describe token) location - |> add_warning input; - - junk input; - - match tag with - | (`Author s | `Since s | `Version s | `Canonical s) as tag -> - let s = String.trim s in - if s = "" then - Parse_error.should_not_be_empty ~what:(Token.describe token) - location - |> add_warning input; - let tag = - match tag with - | `Author _ -> `Author s - | `Since _ -> `Since s - | `Version _ -> `Version s - | `Canonical _ -> - (* TODO The location is only approximate, as we need lexer - cooperation to get the real location. *) - let r_location = - Loc.nudge_start (String.length "@canonical ") location - in - `Canonical (Loc.at r_location s) - in - - let tag = Loc.at location (`Tag tag) in - consume_block_elements ~parsed_a_tag:true `After_text - (tag :: acc) - | (`Deprecated | `Return) as tag -> - let content, _stream_head, where_in_line = - block_element_list In_tag ~parent_markup:token input - in - let tag = - match tag with - | `Deprecated -> `Deprecated content - | `Return -> `Return content - in - let location = - location :: List.map Loc.location content |> Loc.span - in - let tag = Loc.at location (`Tag tag) in - consume_block_elements ~parsed_a_tag:true where_in_line - (tag :: acc) - | (`Param _ | `Raise _ | `Before _) as tag -> - let content, _stream_head, where_in_line = - block_element_list In_tag ~parent_markup:token input - in - let tag = - match tag with - | `Param s -> `Param (s, content) - | `Raise s -> `Raise (s, content) - | `Before s -> `Before (s, content) - in - let location = - location :: List.map Loc.location content |> Loc.span - in - let tag = Loc.at location (`Tag tag) in - consume_block_elements ~parsed_a_tag:true where_in_line - (tag :: acc) - | `See (kind, target) -> - let content, _next_token, where_in_line = - block_element_list In_tag ~parent_markup:token input - in - let location = - location :: List.map Loc.location content |> Loc.span - in - let tag = `Tag (`See (kind, target, content)) in - let tag = Loc.at location tag in - consume_block_elements ~parsed_a_tag:true where_in_line - (tag :: acc) - | (`Inline | `Open | `Closed | `Hidden) as tag -> - let tag = Loc.at location (`Tag tag) in - consume_block_elements ~parsed_a_tag:true `After_text - (tag :: acc))) - | ( { value = #token_that_always_begins_an_inline_element; _ } - | { value = `Bar; _ } ) as next_token -> - warn_if_after_tags next_token; - warn_if_after_text next_token; - - let block = paragraph input in - let block = Loc.map (accepted_in_all_contexts context) block in - let acc = block :: acc in - consume_block_elements ~parsed_a_tag `After_text acc - | { value = `Verbatim s as token; location } as next_token -> - warn_if_after_tags next_token; - warn_if_after_text next_token; - if s = "" then - Parse_error.should_not_be_empty ~what:(Token.describe token) location - |> add_warning input; - - junk input; - let block = accepted_in_all_contexts context token in - let block = Loc.at location block in - let acc = block :: acc in - consume_block_elements ~parsed_a_tag `After_text acc - | { value = `Math_block s as token; location } as next_token -> - warn_if_after_tags next_token; - warn_if_after_text next_token; - if s = "" then - Parse_error.should_not_be_empty ~what:(Token.describe token) location - |> add_warning input; - - junk input; - let block = accepted_in_all_contexts context token in - let block = Loc.at location block in - let acc = block :: acc in - consume_block_elements ~parsed_a_tag `After_text acc - | { - value = - `Code_block (meta, delim, { value = s; location = v_loc }, has_outputs) - as token; - location; - } as next_token -> - warn_if_after_tags next_token; - warn_if_after_text next_token; - junk input; - let delimiter = if delim = "" then None else Some delim in - let output, location = - if not has_outputs then (None, location) - else - let content, next_token, _where_in_line = - block_element_list In_code_results ~parent_markup:token input - in - junk input; - let locations = - location :: List.map (fun content -> content.Loc.location) content - in - let location = Loc.span locations in - let location = { location with end_ = next_token.location.end_ } in - (Some content, location) - in - - if s = "" then - Parse_error.should_not_be_empty ~what:(Token.describe token) location - |> add_warning input; - - let meta = - match meta with - | None -> None - | Some (language, tags) -> Some { Ast.language; tags } - in - let block = - accepted_in_all_contexts context - (`Code_block - { - Ast.meta; - delimiter; - content = { value = s; location = v_loc }; - output; - }) - in - let block = Loc.at location block in - let acc = block :: acc in - consume_block_elements ~parsed_a_tag `After_text acc - | { value = `Modules s as token; location } as next_token -> - warn_if_after_tags next_token; - warn_if_after_text next_token; - - junk input; - - (* TODO Use some library for a splitting function, or move this out into a - Util module. *) - let split_string delimiters s = - let rec scan_delimiters acc index = - if index >= String.length s then List.rev acc - else if String.contains delimiters s.[index] then - scan_delimiters acc (index + 1) - else scan_word acc index (index + 1) - and scan_word acc start_index index = - if index >= String.length s then - let word = String.sub s start_index (index - start_index) in - List.rev (word :: acc) - else if String.contains delimiters s.[index] then - let word = String.sub s start_index (index - start_index) in - scan_delimiters (word :: acc) (index + 1) - else scan_word acc start_index (index + 1) - in - - scan_delimiters [] 0 - in - - (* TODO Correct locations await a full implementation of {!modules} - parsing. *) - let modules = - split_string " \t\r\n" s |> List.map (fun r -> Loc.at location r) - in - - if modules = [] then - Parse_error.should_not_be_empty ~what:(Token.describe token) location - |> add_warning input; - - let block = accepted_in_all_contexts context (`Modules modules) in - let block = Loc.at location block in - let acc = block :: acc in - consume_block_elements ~parsed_a_tag `After_text acc - | { value = `Begin_list kind as token; location } as next_token -> - warn_if_after_tags next_token; - warn_if_after_text next_token; - - junk input; - - let items, brace_location = - explicit_list_items ~parent_markup:token input - in - if items = [] then - Parse_error.should_not_be_empty ~what:(Token.describe token) location - |> add_warning input; - - let location = Loc.span [ location; brace_location ] in - let block = `List (kind, `Heavy, items) in - let block = accepted_in_all_contexts context block in - let block = Loc.at location block in - let acc = block :: acc in - consume_block_elements ~parsed_a_tag `After_text acc - | { value = (`Begin_table_light | `Begin_table_heavy) as token; location } - as next_token -> - warn_if_after_tags next_token; - warn_if_after_text next_token; - junk input; - let block, brace_location = - let parent_markup = token in - let parent_markup_location = location in - match token with - | `Begin_table_light -> - light_table input ~parent_markup ~parent_markup_location - | `Begin_table_heavy -> - heavy_table input ~parent_markup ~parent_markup_location - in - let location = Loc.span [ location; brace_location ] in - let block = accepted_in_all_contexts context (`Table block) in - let block = Loc.at location block in - let acc = block :: acc in - consume_block_elements ~parsed_a_tag `After_text acc - | { value = (`Minus | `Plus) as token; location } as next_token -> ( - (match where_in_line with - | `After_text | `After_shorthand_bullet -> - Parse_error.should_begin_on_its_own_line - ~what:(Token.describe token) location - |> add_warning input - | _ -> ()); - - warn_if_after_tags next_token; - - match context with - | In_shorthand_list -> (List.rev acc, next_token, where_in_line) - | _ -> - let items, where_in_line = - shorthand_list_items next_token where_in_line input - in - let kind = - match token with `Minus -> `Unordered | `Plus -> `Ordered - in - let location = - location :: List.map Loc.location (List.flatten items) |> Loc.span - in - let block = `List (kind, `Light, items) in - let block = accepted_in_all_contexts context block in - let block = Loc.at location block in - let acc = block :: acc in - consume_block_elements ~parsed_a_tag where_in_line acc) - | { value = `Begin_section_heading (level, label) as token; location } as - next_token -> ( - warn_if_after_tags next_token; - - let recover_when_not_at_top_level context = - warn_because_not_at_top_level next_token; - junk input; - let content, brace_location = - delimited_inline_element_list ~parent_markup:token - ~parent_markup_location:location ~requires_leading_whitespace:true - input - in - let location = Loc.span [ location; brace_location ] in - let paragraph = - `Paragraph content - |> accepted_in_all_contexts context - |> Loc.at location - in - consume_block_elements ~parsed_a_tag `At_start_of_line - (paragraph :: acc) - in - - match context with - | In_shorthand_list -> - if where_in_line = `At_start_of_line then - (List.rev acc, next_token, where_in_line) - else recover_when_not_at_top_level context - | In_explicit_list -> recover_when_not_at_top_level context - | In_table_cell -> recover_when_not_at_top_level context - | In_tag -> recover_when_not_at_top_level context - | In_code_results -> recover_when_not_at_top_level context - | Top_level -> - if where_in_line <> `At_start_of_line then - Parse_error.should_begin_on_its_own_line - ~what:(Token.describe token) location - |> add_warning input; - - let label = - match label with - | Some "" -> - Parse_error.should_not_be_empty ~what:"heading label" location - |> add_warning input; - None - | _ -> label - in - - junk input; - - let content, brace_location = - delimited_inline_element_list ~parent_markup:token - ~parent_markup_location:location - ~requires_leading_whitespace:true input - in - if content = [] then - Parse_error.should_not_be_empty ~what:(Token.describe token) - location - |> add_warning input; - - let location = Loc.span [ location; brace_location ] in - let heading = `Heading (level, label, content) in - let heading = Loc.at location heading in - let acc = heading :: acc in - consume_block_elements ~parsed_a_tag `After_text acc) - | { value = `Begin_paragraph_style _ as token; location } -> - junk input; - let content, brace_location = - delimited_inline_element_list ~parent_markup:token - ~parent_markup_location:location ~requires_leading_whitespace:true - input - in - let location = Loc.span [ location; brace_location ] in - - Parse_error.markup_should_not_be_used ~what:(Token.describe token) - location - |> add_warning input; - - let paragraph = - `Paragraph content - |> accepted_in_all_contexts context - |> Loc.at location - in - consume_block_elements ~parsed_a_tag `At_start_of_line (paragraph :: acc) - | { - location; - value = `Media_with_replacement_text (href, media, content) as token; - } as next_token -> - warn_if_after_tags next_token; - - junk input; - - let r_location = - Loc.nudge_start - (String.length @@ Token.s_of_media `Replaced media) - location - |> Loc.nudge_end (String.length content + 1) - (* +1 for closing character *) - in - let c_location = - Loc.nudge_start - (String.length (Token.s_of_media `Replaced media) - + String.length (match href with `Reference s | `Link s -> s)) - location - |> Loc.nudge_end 1 - in - let content = String.trim content in - let href = href |> Loc.at r_location in - - if content = "" then - Parse_error.should_not_be_empty ~what:(Token.describe token) - c_location - |> add_warning input; - - let block = `Media (`Simple, href, content, media) in - let block = accepted_in_all_contexts context block in - let block = Loc.at location block in - let acc = block :: acc in - consume_block_elements ~parsed_a_tag `After_text acc - | { location; value = `Simple_media (href, media) } as next_token -> - warn_if_after_tags next_token; - - junk input; - - let r_location = - Loc.nudge_start - (String.length @@ Token.s_of_media `Simple media) - location - |> Loc.nudge_end 1 - in - let href = href |> Loc.at r_location in - let block = `Media (`Simple, href, "", media) in - let block = accepted_in_all_contexts context block in - let block = Loc.at location block in - let acc = block :: acc in - consume_block_elements ~parsed_a_tag `After_text acc - in - - let where_in_line = - match context with - | Top_level -> `At_start_of_line - | In_shorthand_list -> `After_shorthand_bullet - | In_explicit_list -> `After_explicit_list_bullet - | In_table_cell -> `After_table_cell - | In_code_results -> `After_tag - | In_tag -> `After_tag - in - - consume_block_elements ~parsed_a_tag:false where_in_line [] - -(* {3 Lists} *) - -(* Consumes a sequence of implicit list items. Each one consists of a [`Minus] - or [`Plus] token, followed by block elements until: - - - a blank line, or - - a list bullet of the opposite kind (e.g. [`Plus] for a [`Minus] list). - - This function is called when the next token is known to be [`Minus] or - [`Plus]. It consumes that token, and calls the block element parser (see - above). That parser returns to [implicit_list_items] only on [`Blank_line], - [`End], [`Minus] or [`Plus] at the start of a line, or [`Right_brace]. *) -and shorthand_list_items : - [ `Minus | `Plus ] with_location -> - where_in_line -> - input -> - Ast.nestable_block_element with_location list list * where_in_line = - fun first_token where_in_line input -> - let bullet_token = first_token.value in - - let rec consume_list_items : - [> ] with_location -> - where_in_line -> - Ast.nestable_block_element with_location list list -> - Ast.nestable_block_element with_location list list * where_in_line = - fun next_token where_in_line acc -> - match next_token.value with - | `End | `Right_brace | `Blank_line _ | `Tag _ | `Begin_section_heading _ - | `Simple_media _ | `Media_with_replacement_text _ -> - (List.rev acc, where_in_line) - | (`Minus | `Plus) as bullet -> - if bullet = bullet_token then ( - junk input; - - let content, stream_head, where_in_line = - block_element_list In_shorthand_list ~parent_markup:bullet input - in - if content = [] then - Parse_error.should_not_be_empty ~what:(Token.describe bullet) - next_token.location - |> add_warning input; - - let acc = content :: acc in - consume_list_items stream_head where_in_line acc) - else (List.rev acc, where_in_line) - in - - consume_list_items - (first_token :> stopped_implicitly with_location) - where_in_line [] - -(* Consumes a sequence of explicit list items (starting with '{li ...}' and - '{-...}', which are represented by [`Begin_list_item _] tokens). - - This function is called immediately after '{ul' or '{ol' ([`Begin_list _]) is - read. The only "valid" way to exit is by reading a [`Right_brace] token, - which is consumed. - - Whitespace inside the list, but outside list items, is not significant – this - parsing function consumes all of it. Otherwise, only list item start tokens - are accepted. Everything else is an error. *) -and explicit_list_items : - parent_markup:[< Token.t ] -> - input -> - Ast.nestable_block_element with_location list list * Loc.span = - fun ~parent_markup input -> - let rec consume_list_items : - Ast.nestable_block_element with_location list list -> - Ast.nestable_block_element with_location list list * Loc.span = - fun acc -> - let next_token = peek input in - match next_token.value with - | `End -> - Parse_error.end_not_allowed next_token.location - ~in_what:(Token.describe parent_markup) - |> add_warning input; - (List.rev acc, next_token.location) - | `Right_brace -> - junk input; - (List.rev acc, next_token.location) - | `Space _ | `Single_newline _ | `Blank_line _ -> - junk input; - consume_list_items acc - | `Begin_list_item kind as token -> - junk input; - - (* '{li', represented by [`Begin_list_item `Li], must be followed by - whitespace. *) - (if kind = `Li then - match (peek input).value with - | `Space _ | `Single_newline _ | `Blank_line _ | `Right_brace -> - () - (* The presence of [`Right_brace] above requires some explanation: - - - It is better to be silent about missing whitespace if the next - token is [`Right_brace], because the error about an empty list - item will be generated below, and that error is more important to - the user. - - The [`Right_brace] token also happens to include all whitespace - before it, as a convenience for the rest of the parser. As a - result, not ignoring it could be wrong: there could in fact be - whitespace in the concrete syntax immediately after '{li', just - it is not represented as [`Space], [`Single_newline], or - [`Blank_line]. *) - | _ -> - Parse_error.should_be_followed_by_whitespace next_token.location - ~what:(Token.print token) - |> add_warning input); - - let content, token_after_list_item, _where_in_line = - block_element_list In_explicit_list ~parent_markup:token input - in - - if content = [] then - Parse_error.should_not_be_empty next_token.location - ~what:(Token.describe token) - |> add_warning input; - - (match token_after_list_item.value with - | `Right_brace -> junk input - | `End -> - Parse_error.end_not_allowed token_after_list_item.location - ~in_what:(Token.describe token) - |> add_warning input); - - let acc = content :: acc in - consume_list_items acc - | token -> - let suggestion = - match token with - | `Begin_section_heading _ | `Tag _ -> - Printf.sprintf "move %s outside the list." (Token.describe token) - | _ -> - Printf.sprintf "move %s into a list item, %s or %s." - (Token.describe token) - (Token.print (`Begin_list_item `Li)) - (Token.print (`Begin_list_item `Dash)) - in - Parse_error.not_allowed next_token.location ~what:(Token.describe token) - ~in_what:(Token.describe parent_markup) - ~suggestion - |> add_warning input; - - junk input; - consume_list_items acc - in - - consume_list_items [] - -(* Consumes a sequence of table rows that might start with [`Bar]. - - This function is called immediately after '{t' ([`Begin_table `Light]) is - read. The only "valid" way to exit is by reading a [`Right_brace] token, - which is consumed. *) -and light_table ~parent_markup ~parent_markup_location input = - let rec consume_rows acc ~last_loc = - Reader.until_rbrace_or_eof input acc >>> fun next_token -> - match next_token.Loc.value with - | `Bar | #token_that_always_begins_an_inline_element -> ( - let next, row, last_loc = - light_table_row ~parent_markup ~last_loc input - in - match next with - | `Continue -> consume_rows (row :: acc) ~last_loc - | `Stop -> (row :: acc, last_loc)) - | other_token -> - Parse_error.not_allowed next_token.location - ~what:(Token.describe other_token) - ~in_what:(Token.describe parent_markup) - |> add_warning input; - junk input; - consume_rows acc ~last_loc - in - let rows, brace_location = consume_rows [] ~last_loc:parent_markup_location in - let grid = List.rev rows in - (Table.Light_syntax.from_raw_data grid, brace_location) - -(* Consumes a table row that might start with [`Bar]. *) -and light_table_row ~parent_markup ~last_loc input = - let rec consume_row acc_row acc_cell acc_space ~new_line ~last_loc = - let push_cells row cell = - match cell with [] -> row | _ -> List.rev cell :: row - in - let return row cell = List.rev (push_cells row cell) in - let next_token = peek input in - match next_token.value with - | `End -> - Parse_error.end_not_allowed next_token.location ~in_what:"table" - |> add_warning input; - junk input; - (`Stop, return acc_row acc_cell, next_token.location) - | `Right_brace -> - junk input; - (`Stop, return acc_row acc_cell, next_token.location) - | `Space _ as token -> - junk input; - let i = Loc.at next_token.location token in - consume_row acc_row acc_cell (i :: acc_space) ~new_line ~last_loc - | `Single_newline _ | `Blank_line _ -> - junk input; - (`Continue, return acc_row acc_cell, last_loc) - | `Bar -> - junk input; - let acc_row = if new_line then [] else List.rev acc_cell :: acc_row in - consume_row acc_row [] [] ~new_line:false ~last_loc - | #token_that_always_begins_an_inline_element as token -> - let i = inline_element input next_token.location token in - if Loc.spans_multiple_lines i then - Parse_error.not_allowed - ~what:(Token.describe (`Single_newline "")) - ~in_what:(Token.describe `Begin_table_light) - i.location - |> add_warning input; - let acc_cell = - if acc_cell = [] then [ i ] else (i :: acc_space) @ acc_cell - in - consume_row acc_row acc_cell [] ~new_line:false - ~last_loc:next_token.location - | other_token -> - Parse_error.not_allowed next_token.location - ~what:(Token.describe other_token) - ~in_what:(Token.describe parent_markup) - |> add_warning input; - junk input; - consume_row acc_row acc_cell acc_space ~new_line ~last_loc - in - consume_row [] [] [] ~new_line:true ~last_loc - -(* Consumes a sequence of table rows (starting with '{tr ...}', which are - represented by [`Begin_table_row] tokens). - - This function is called immediately after '{table' ([`Begin_table `Heavy]) is - read. The only "valid" way to exit is by reading a [`Right_brace] token, - which is consumed. *) -and heavy_table ~parent_markup ~parent_markup_location input = - let rec consume_rows acc ~last_loc = - Reader.until_rbrace_or_eof input acc >>> fun next_token -> - match next_token.Loc.value with - | `Begin_table_row as token -> - junk input; - let items, last_loc = heavy_table_row ~parent_markup:token input in - consume_rows (List.rev items :: acc) ~last_loc - | token -> - Parse_error.not_allowed next_token.location ~what:(Token.describe token) - ~in_what:(Token.describe parent_markup) - ~suggestion:"Move outside of {table ...}, or inside {tr ...}" - |> add_warning input; - junk input; - consume_rows acc ~last_loc - in - let rows, brace_location = consume_rows [] ~last_loc:parent_markup_location in - let grid = List.rev rows in - (Table.Heavy_syntax.from_grid grid, brace_location) - -(* Consumes a sequence of table cells (starting with '{th ...}' or '{td ... }', - which are represented by [`Begin_table_cell] tokens). - - This function is called immediately after '{tr' ([`Begin_table_row]) is - read. The only "valid" way to exit is by reading a [`Right_brace] token, - which is consumed. *) -and heavy_table_row ~parent_markup input = - let rec consume_cell_items acc = - Reader.until_rbrace_or_eof input acc >>> fun next_token -> - match next_token.Loc.value with - | `Begin_table_cell kind as token -> - junk input; - let content, token_after_list_item, _where_in_line = - block_element_list In_table_cell ~parent_markup:token input - in - (match token_after_list_item.value with - | `Right_brace -> junk input - | `End -> - Parse_error.not_allowed token_after_list_item.location - ~what:(Token.describe `End) ~in_what:(Token.describe token) - |> add_warning input); - consume_cell_items ((content, kind) :: acc) - | token -> - Parse_error.not_allowed next_token.location ~what:(Token.describe token) - ~in_what:(Token.describe parent_markup) - ~suggestion: - "Move outside of {table ...}, or inside {td ...} or {th ...}" - |> add_warning input; - junk input; - consume_cell_items acc - in - consume_cell_items [] - -(* {2 Entry point} *) - -let parse warnings tokens = - let input : input = { tokens; warnings } in - - let rec parse_block_elements () = - let elements, last_token, _where_in_line = - block_element_list Top_level ~parent_markup:`Comment input - in - - match last_token.value with - | `End -> elements - | `Right_brace -> - Parse_error.unpaired_right_brace last_token.location - |> add_warning input; - - let block = - Loc.same last_token (`Paragraph [ Loc.same last_token (`Word "}") ]) - in - - junk input; - elements @ (block :: parse_block_elements ()) - in - let ast = parse_block_elements () in - (ast, List.rev !(input.warnings)) - - *) diff --git a/src/parser/syntax.mli b/src/parser/syntax.mli deleted file mode 100644 index e89945abf4..0000000000 --- a/src/parser/syntax.mli +++ /dev/null @@ -1,8 +0,0 @@ -(* Internal module, not exposed - -val parse : - Warning.t list ref -> - Token.t Loc.with_location Stream.t -> - Ast.t * Warning.t list - -*) From 31572d019da3c7130d23694fbdd1b75c22438bb1 Mon Sep 17 00:00:00 2001 From: faycarsons Date: Fri, 10 Jan 2025 10:48:18 -0500 Subject: [PATCH 125/150] Fix `Paragraph_style` location --- src/parser/lexer.mll | 6 +++--- src/parser/parser.mly | 8 ++++++-- src/parser/tokens.ml | 14 +++++++------- 3 files changed, 16 insertions(+), 12 deletions(-) diff --git a/src/parser/lexer.mll b/src/parser/lexer.mll index 592ff31641..b512eacd85 100644 --- a/src/parser/lexer.mll +++ b/src/parser/lexer.mll @@ -463,13 +463,13 @@ and token input = parse { Style Emphasis } | "{L" - { Paragraph_style Left } + { Paragraph_style { inner = Left; start = input.offset_to_location @@ Lexing.lexeme_start lexbuf } } | "{C" - { Paragraph_style Center } + { Paragraph_style { inner = Center; start = input.offset_to_location @@ Lexing.lexeme_start lexbuf } } | "{R" - { Paragraph_style Right } + { Paragraph_style { inner = Right; start = input.offset_to_location @@ Lexing.lexeme_start lexbuf } } | "{^" { Style Superscript } diff --git a/src/parser/parser.mly b/src/parser/parser.mly index 77f0fbd5b6..c6e6bd7d9b 100644 --- a/src/parser/parser.mly +++ b/src/parser/parser.mly @@ -18,7 +18,7 @@ %token Style "{i" (* or '{b' etc *) (* or '{C' or '{R', but this syntax has been deprecated and is only kept around so legacy codebases don't break :p *) -%token Paragraph_style "{L" +%token Paragraph_style "{L" %token MODULES "{!modules:" @@ -871,7 +871,11 @@ let paragraph_style := content = located(Paragraph_style); ws = paragraph; endpo let what = Tokens.describe @@ Paragraph_style content.Loc.value in Writer.Warning (Parse_error.markup_should_not_be_used span ~what) in - Writer.warning warning ws + let start = content.Loc.value.Tokens.start in + let endloc = endpos.Loc.location in + Writer.bind ws (fun ws -> + Writer.with_warning { ws with Loc.location = { endloc with start }} warning) + } let verbatim := verbatim = located(Verbatim); { diff --git a/src/parser/tokens.ml b/src/parser/tokens.ml index a84b3a4a0f..d0633244b5 100644 --- a/src/parser/tokens.ml +++ b/src/parser/tokens.ml @@ -52,7 +52,7 @@ type token = | Verbatim of string with_start_pos | RIGHT_CODE_DELIMITER | RIGHT_BRACE - | Paragraph_style of alignment + | Paragraph_style of alignment with_start_pos | Style of style | List of list_kind | LI @@ -115,9 +115,9 @@ let print : token -> string = function | Verbatim _ -> "{v" | RIGHT_CODE_DELIMITER -> "]}" | RIGHT_BRACE -> "}" - | Paragraph_style Left -> "'{L'" - | Paragraph_style Center -> "'{C'" - | Paragraph_style Right -> "'{R'" + | Paragraph_style { inner = Left; _ } -> "'{L'" + | Paragraph_style { inner = Center; _ } -> "'{C'" + | Paragraph_style { inner = Right; _ } -> "'{R'" | Style Bold -> "'{b'" | Style Italic -> "'{i'" | Style Emphasis -> "'{e'" @@ -170,9 +170,9 @@ let describe : token -> string = function | Word w -> Printf.sprintf "'%s'" w | Code_span _ -> "'[...]' (code)" | Raw_markup _ -> "'{%...%}' (raw markup)" - | Paragraph_style Left -> "'{L ...}' (left alignment)" - | Paragraph_style Center -> "'{C ...}' (center alignment)" - | Paragraph_style Right -> "'{R ...}' (right alignment)" + | Paragraph_style { inner = Left; _ } -> "'{L ...}' (left alignment)" + | Paragraph_style { inner = Center; _ } -> "'{C ...}' (center alignment)" + | Paragraph_style { inner = Right; _ } -> "'{R ...}' (right alignment)" | Style Bold -> "'{b ...}' (boldface text)" | Style Italic -> "'{i ...}' (italic text)" | Style Emphasis -> "'{e ...}' (emphasized text)" From 603da728a46bc3f6c0ea21badfdfe44b249fea83 Mon Sep 17 00:00:00 2001 From: faycarsons Date: Fri, 10 Jan 2025 12:33:39 -0500 Subject: [PATCH 126/150] Cleanup, formatting, consistency --- src/parser/parser.mly | 632 +++++++++++++++++++-------------------- src/parser/parser_aux.ml | 19 +- src/parser/writer.ml | 64 ++-- 3 files changed, 352 insertions(+), 363 deletions(-) diff --git a/src/parser/parser.mly b/src/parser/parser.mly index c6e6bd7d9b..a3c07c7dcc 100644 --- a/src/parser/parser.mly +++ b/src/parser/parser.mly @@ -81,19 +81,19 @@ (* UTILITIES *) -(* Utility which wraps the return value of a rule in `Loc.with_location` *) -let locatedM(rule) == inner = rule; { Writer.map (wrap_location $sloc) inner } +(* Utilities which wraps the return value of a rule in `Loc.with_location` *) +let locatedM(rule) == inner = rule; { wrap_location $sloc <$> inner } let located(rule) == inner = rule; { wrap_location $sloc inner } let with_position(rule) == inner = rule; { (inner, $sloc) } let position(rule) == _ = rule; { $sloc } -let sequence(rule) == xs = list(rule); { Writer.sequence xs } -let sequence_nonempty(rule) == xs = nonempty_list(rule); { Writer.sequence xs } - let delimited_location(opening, rule, closing) := startpos = located(opening); inner = rule; endpos = located(closing); { let span = Loc.delimited startpos endpos in Loc.at span inner -} +} +(* Utilities for working inside the Writer.t monad *) +let sequence(rule) == xs = list(rule); { Writer.sequence xs } +let sequence_nonempty(rule) == xs = nonempty_list(rule); { Writer.sequence xs } let separated_nonempty_sequence(sep, rule) := xs = separated_nonempty_list(sep, rule); { Writer.sequence xs } let separated_sequence(sep, rule) := xs = separated_list(sep, rule); { Writer.sequence xs } @@ -121,7 +121,7 @@ let main := let toplevel := | block = nestable_block_element; { (block :> Ast.block_element Loc.with_location Writer.t) } - | t = tag; { Writer.map (fun loc -> Loc.{ loc with value = `Tag loc.value }) t } + | t = tag; { Writer.map ~f:(fun loc -> Loc.{ loc with value = `Tag loc.value }) t } | ~ = section_heading; <> | ~ = toplevel_error; <> @@ -134,7 +134,7 @@ let toplevel_error := in let as_text = Loc.at span @@ `Word "+" in let node = (Loc.same as_text @@ `Paragraph [ as_text ]) in - Writer.with_warning node warning + Writer.return_warning node warning } | errloc = position(MINUS); whitespace?; { let span = Loc.of_position errloc in @@ -144,10 +144,16 @@ let toplevel_error := in let as_text = Loc.at span @@ `Word "-" in let node = (Loc.same as_text @@ `Paragraph [ as_text ]) in - Writer.with_warning node warning + Writer.return_warning node warning } | err = located(list_opening); horizontal_whitespace; children = sequence_nonempty(inline_element(horizontal_whitespace)); endpos = located(RIGHT_BRACE)?; { - let endloc = Option.value ~default:(Writer.unwrap children |> List.rev |> List.hd |> Loc.map @@ Fun.const ()) endpos in + let default = + Writer.get children + |> List.rev + |> List.hd + |> Loc.map (Fun.const ()) + in + let endloc = Option.value ~default endpos in let span = Loc.delimited err endloc in let not_allowed = Writer.Warning ( let what = Tokens.describe err.Loc.value in @@ -164,12 +170,12 @@ let toplevel_error := let unclosed = Writer.Warning ( Parse_error.unclosed_bracket ~bracket:(Tokens.print err.Loc.value) span) in - let* children = children in - let inner = { (paragraph children :> Ast.block_element Loc.with_location) with location = span } in - let m = Writer.with_warning inner not_allowed in - if Option.is_some endpos then m - else - Writer.warning unclosed m + Writer.bind children ~f:(fun c -> + let inner = { (paragraph c :> Ast.block_element Loc.with_location) with location = span } in + let m = Writer.return_warning inner not_allowed in + if Option.is_some endpos then m + else + Writer.warning unclosed m) } | errloc = position(BAR); whitespace?; { let span = Loc.of_position errloc in @@ -179,7 +185,7 @@ let toplevel_error := in let as_text = Loc.at span @@ `Word "|" in let node = (Loc.same as_text @@ `Paragraph [ as_text ]) in - Writer.with_warning node warning + Writer.return_warning node warning } | errloc = position(RIGHT_BRACE); whitespace?; { let span = Loc.of_position errloc in @@ -189,7 +195,7 @@ let toplevel_error := in let as_text = Loc.at span @@ `Word "}" in let node = (Loc.same as_text @@ `Paragraph [ as_text ]) in - Writer.with_warning node warning + Writer.return_warning node warning } | errloc = position(RIGHT_CODE_DELIMITER); { let span = Loc.of_position errloc in @@ -199,37 +205,22 @@ let toplevel_error := in let as_text = Loc.at span @@ `Word "{" in let node = Loc.same as_text @@ `Paragraph [ as_text ] in - Writer.with_warning node warning + Writer.return_warning node warning } | list = list_light; { - let* list = list in - let warning = - let Loc.{ value; location } = list in - let what = Tokens.describe @@ - match value with - | `List (`Ordered, `Light, _) -> Tokens.PLUS - | `List (`Unordered, `Light, _) -> Tokens.MINUS - | _ -> assert false (* Unreachable *) + Writer.bind list ~f:(fun list -> + let warning = + let Loc.{ value; location } = list in + let what = Tokens.describe @@ + match value with + | `List (`Ordered, `Light, _) -> Tokens.PLUS + | `List (`Unordered, `Light, _) -> Tokens.MINUS + | _ -> assert false (* Unreachable *) + in + Parse_error.should_begin_on_its_own_line ~what location in - Parse_error.should_begin_on_its_own_line ~what location - in - Writer.with_warning (list :> Ast.block_element Loc.with_location) (Writer.Warning warning) - } - (* - - | illegal_elt = tag; { - Writer.bind - illegal_elt - (fun illegal_elt -> - let should_begin_on_its_own_line = Writer.Warning ( - let what = Tokens.describe_tag illegal_elt.Loc.value in - Parse_error.should_begin_on_its_own_line ~what illegal_elt.Loc.location) - in - let illegal_elt = Loc.map (fun t -> `Tag t) illegal_elt in - let inner = (illegal_elt :> Ast.block_element Loc.with_location) in - Writer.with_warning inner should_begin_on_its_own_line) - } - *) + Writer.return_warning (list :> Ast.block_element Loc.with_location) (Writer.Warning warning)) + } (* SECTION HEADING *) @@ -237,7 +228,7 @@ let section_heading := | content = Section_heading; children = sequence_nonempty(inline_element(whitespace)); endpos = located(RIGHT_BRACE); { let Tokens.{ inner = (num, title); start } = content in let span = { endpos.Loc.location with start } in - Writer.map (fun c -> Loc.at span @@ `Heading (num, title, trim_start c)) children + Writer.map ~f:(fun c -> Loc.at span @@ `Heading (num, title, trim_start c)) children } | content = Section_heading; endpos = located(RIGHT_BRACE); { let Tokens.{ inner = (num, title); start } = content in @@ -247,7 +238,7 @@ let section_heading := Writer.Warning (Parse_error.should_not_be_empty ~what span) in let node = Loc.at span @@ `Heading (num, title, []) in - Writer.with_warning node should_not_be_empty + Writer.return_warning node should_not_be_empty } | content = with_position(Section_heading); end_pos = position(error); { let Tokens.{ inner; start }, start_pos = content in @@ -262,7 +253,7 @@ let section_heading := Parse_error.illegal ~in_what err span) in let inner = Loc.at span @@ `Heading (num, title, []) in - Writer.with_warning inner illegal + Writer.return_warning inner illegal } (* TAGS *) @@ -282,7 +273,7 @@ let tag_with_content := let return := | startpos = located(RETURN); horizontal_whitespace; children = located(sequence_nonempty(nestable_block_element)); { let span = Loc.delimited startpos children in - Writer.map (fun c -> Loc.at span @@ `Return c) children.Loc.value + Writer.map ~f:(fun c -> Loc.at span @@ `Return c) children.Loc.value } | pos = located(RETURN); horizontal_whitespace?; { return (Loc.same pos @@ `Return []) @@ -290,9 +281,9 @@ let return := let deprecated := | startpos = located(DEPRECATED); horizontal_whitespace; children = locatedM(sequence_nonempty(nestable_block_element)); { - let* children = children in - let span = Loc.delimited startpos children in - return @@ Loc.at span (`Deprecated children.Loc.value) + Writer.bind children ~f:(fun c -> + let span = Loc.delimited startpos c in + return @@ Loc.at span (`Deprecated c.Loc.value)) } | pos = located(DEPRECATED); horizontal_whitespace?; { return @@ { pos with Loc.value = `Deprecated [] } } @@ -300,17 +291,17 @@ let deprecated := let warning = Writer.Warning (Parse_error.unpaired_right_brace @@ errloc.Loc.location) in - Writer.with_warning ({ pos with Loc.value = `Deprecated [] }) warning + Writer.return_warning ({ pos with Loc.value = `Deprecated [] }) warning } let before := | content = Before; horizontal_whitespace; children = sequence_nonempty(nestable_block_element); { - let* children = children in - let Tokens.{ inner; start } = content in - let child_span = Loc.span @@ List.map Loc.location children in - let span = { child_span with start } in - let inner = Loc.at span @@ `Before (inner, children) in - return inner + Writer.bind children ~f:(fun c -> + let Tokens.{ inner; start } = content in + let child_span = Loc.span @@ List.map Loc.location c in + let span = { child_span with start } in + let inner = Loc.at span @@ `Before (inner, c) in + return inner) } | content = located(Before); { let Loc.{ value = Tokens.{ inner; start }; location } = content in @@ -321,10 +312,10 @@ let raise := | content = located(Raise); horizontal_whitespace; children = located(sequence_nonempty(nestable_block_element)); { let Loc.{ value = Tokens.{ inner; start }; location } = content in let location = { location with start } in - let* children = children.Loc.value in - let span = Loc.span @@ location :: List.map Loc.location children in - let inner = Loc.at span @@ `Raise (inner, children) in - return inner + Writer.bind children.Loc.value ~f:(fun c -> + let span = Loc.span @@ location :: List.map Loc.location c in + let inner = Loc.at span @@ `Raise (inner, c) in + return inner) } | content = located(Raise); horizontal_whitespace?; { let Loc.{ value = Tokens.{ inner; start }; location } = content in @@ -336,9 +327,9 @@ let see := | content = located(See); horizontal_whitespace; children = located(sequence_nonempty(nestable_block_element)); { let Loc.{ value = Tokens.{ inner = (kind, href); start }; location } = content in let span = Loc.delimited { content with location = { location with start }} children in - let* children = children.Loc.value in - let inner = Loc.at span @@ `See (Tokens.to_ast_ref kind, href, children) in - return inner + Writer.bind children.Loc.value ~f:(fun c -> + let inner = Loc.at span @@ `See (Tokens.to_ast_ref kind, href, c) in + return inner) } | content = located(See); horizontal_whitespace?; { let Loc.{ value = Tokens.{ inner = (kind, href); start }; location } = content in @@ -348,11 +339,11 @@ let see := let param := | content = located(Param); horizontal_whitespace; children = sequence_nonempty(nestable_block_element); { - let* children = children in - let Loc.{ value = Tokens.{ inner; start }; location } = content in - let span = Loc.span @@ { location with start } :: List.map Loc.location children in - let inner = Loc.at span @@ `Param (inner, children) in - return inner + Writer.bind children ~f:(fun c -> + let Loc.{ value = Tokens.{ inner; start }; location } = content in + let span = Loc.span @@ { location with start } :: List.map Loc.location c in + let inner = Loc.at span @@ `Param (inner, c) in + return inner) } | content = located(Param); horizontal_whitespace?; { let Loc.{ value = Tokens.{ inner; start }; location } = content in @@ -369,7 +360,7 @@ let tag_bare := Writer.Warning (Parse_error.should_not_be_empty ~what span) in Writer.ensure has_content warning (return inner) - |> Writer.map (fun v -> Loc.at location @@ `Version v ) + |> Writer.map ~f:(fun v -> Loc.at location @@ `Version v ) } | content = located(Since); { let Loc.{ value; location } = content in @@ -380,7 +371,7 @@ let tag_bare := Writer.Warning (Parse_error.should_not_be_empty ~what location) in Writer.ensure has_content warning (return inner) - |> Writer.map (fun v -> Loc.at location @@ `Since v ) + |> Writer.map ~f:(fun v -> Loc.at location @@ `Since v ) } | content = located(Canonical); { let Loc.{ value; location } = content in @@ -392,7 +383,7 @@ let tag_bare := in let location = { location with start } in Writer.ensure has_content warning @@ return inner - |> Writer.map (fun value -> Loc.at location @@ `Canonical (Loc.at location value)) + |> Writer.map ~f:(fun value -> Loc.at location @@ `Canonical (Loc.at location value)) } | content = located(Author); { let Loc.{ value; location } = content in @@ -402,7 +393,7 @@ let tag_bare := Writer.Warning (Parse_error.should_not_be_empty ~what location) in Writer.ensure has_content warning @@ return inner - |> Writer.map (fun a -> { Loc.value = `Author a; location = { location with start } }) + |> Writer.map ~f:(fun a -> { Loc.value = `Author a; location = { location with start } }) } | pos = position(OPEN); whitespace?; { let loc = Loc.of_position pos in return @@ Loc.at loc `Open } | pos = position(INLINE); whitespace?; { let loc = Loc.of_position pos in return @@ Loc.at loc `Inline } @@ -450,7 +441,7 @@ let style := Writer.Warning (Parse_error.should_not_be_empty ~what span) in Writer.ensure not_empty warning children - |> Writer.map (fun c -> Loc.at span @@ `Styled (Tokens.to_ast_style style, trim_start c)) + |> Writer.map ~f:(fun c -> Loc.at span @@ `Styled (Tokens.to_ast_style style, trim_start c)) } | style = located(Style); endpos = located(RIGHT_BRACE); { let span = Loc.delimited style endpos in @@ -460,7 +451,7 @@ let style := Writer.Warning (Parse_error.should_not_be_empty ~what span) in let inner = Loc.at span @@ `Styled (Tokens.to_ast_style style, []) in - Writer.with_warning inner warning + Writer.return_warning inner warning } | style = located(Style); endpos = located(RIGHT_CODE_DELIMITER); { let span = Loc.delimited style endpos in @@ -484,10 +475,10 @@ let style := let in_what = Tokens.describe @@ Style style.Loc.value in let (start_pos, end_pos) = errloc in let illegal_section = Loc.extract ~input ~start_pos ~end_pos in - Parse_error.illegal ~in_what illegal_section span - ) in + Parse_error.illegal ~in_what illegal_section span) + in let inner = Loc.at span @@ `Styled (Tokens.to_ast_style style.Loc.value, []) in - Writer.with_warning inner illegal + Writer.return_warning inner illegal } | style = located(Style); endpos = located(END); { let span = Loc.delimited style endpos in @@ -497,7 +488,7 @@ let style := Writer.Warning (Parse_error.end_not_allowed ~in_what span) in let inner = Loc.at span @@ `Styled (Tokens.to_ast_style style, []) in - Writer.with_warning inner warning + Writer.return_warning inner warning } (* LINKS + REFS *) @@ -509,10 +500,10 @@ let reference := return @@ Loc.at span @@ `Reference (`Simple, Loc.at location inner, []) } | ref_body = Ref_with_replacement; children = sequence_nonempty(inline_element(whitespace)); endpos = located(RIGHT_BRACE); { - let* children = children in - let Tokens.{ inner; start } = ref_body in - let span = { endpos.Loc.location with start } in - return @@ Loc.at span @@ `Reference (`With_text, Loc.at span inner, trim_start children) + Writer.bind children ~f:(fun c -> + let Tokens.{ inner; start } = ref_body in + let span = { endpos.Loc.location with start } in + return @@ Loc.at span @@ `Reference (`With_text, Loc.at span inner, trim_start c)) } | ref_body = Ref_with_replacement; endpos = located(RIGHT_BRACE); { let Tokens.{ inner; start } = ref_body in @@ -522,7 +513,7 @@ let reference := let what = Tokens.describe @@ Ref_with_replacement ref_body in Writer.Warning (Parse_error.should_not_be_empty ~what span) in - Writer.with_warning node warning + Writer.return_warning node warning } | ref_body = Ref_with_replacement; children = sequence_nonempty(inline_element(whitespace))?; endpos = located(END); { let Tokens.{ inner; start } = ref_body in @@ -533,7 +524,7 @@ let reference := in let* children = Option.value ~default:(return []) children in let node = Loc.at span @@ `Reference (`With_text, Loc.at span inner, children) in - Writer.with_warning node not_allowed + Writer.return_warning node not_allowed } let link := @@ -547,23 +538,23 @@ let link := let warning = Writer.Warning (Parse_error.should_not_be_empty ~what span) in - Writer.with_warning node warning + Writer.return_warning node warning else return node } | content = Link_with_replacement; children = sequence_nonempty(inline_element(whitespace)); endpos = located(RIGHT_BRACE); { - let* c = children in - let Tokens.{ inner; start } = content in - let span = { endpos.Loc.location with start } in - let node = Loc.at span @@ `Link (inner, c) in - if "" = inner then - let what = Tokens.describe @@ Link_with_replacement content in - let warning = - Writer.Warning (Parse_error.should_not_be_empty ~what span) - in - Writer.with_warning node warning - else - return node + Writer.bind children ~f:(fun c -> + let Tokens.{ inner; start } = content in + let span = { endpos.Loc.location with start } in + let node = Loc.at span @@ `Link (inner, c) in + if "" = inner then + let what = Tokens.describe @@ Link_with_replacement content in + let warning = + Writer.Warning (Parse_error.should_not_be_empty ~what span) + in + Writer.return_warning node warning + else + return node) } | content = Link_with_replacement; endpos = located(RIGHT_BRACE); { let Tokens.{ inner; start } = content in @@ -573,43 +564,36 @@ let link := let warning = Writer.Warning (Parse_error.should_not_be_empty ~what span) in - Writer.with_warning node warning + Writer.return_warning node warning } (* LIST *) -let list_light_item_unordered := - | MINUS; horizontal_whitespace; ~ = nestable_block_element; <> - | horizontal_whitespace; MINUS; item = nestable_block_element; { - let warning = - let span = Loc.of_position $sloc in - Writer.Warning (Parse_error.should_begin_on_its_own_line ~what:(Tokens.describe MINUS) span) - in - Writer.warning warning item - } +let list_light_start := + | MINUS; { Tokens.MINUS } + | PLUS; { Tokens.PLUS } -let list_light_item_ordered := - | PLUS; horizontal_whitespace; ~ = nestable_block_element; <> - | horizontal_whitespace; PLUS; item = nestable_block_element; { - let warning = +let list_light_item := + | start = list_light_start; horizontal_whitespace; item = nestable_block_element; { + light_list_item start <$> item + } + | horizontal_whitespace; start = list_light_start; item = nestable_block_element; { + let should_begin_on_its_own_line = let span = Loc.of_position $sloc in - Writer.Warning (Parse_error.should_begin_on_its_own_line ~what:(Tokens.describe PLUS) span) + Writer.Warning (Parse_error.should_begin_on_its_own_line ~what:(Tokens.describe MINUS) span) in - Writer.warning warning item + (light_list_item start <$> item) + |> Writer.warning should_begin_on_its_own_line } let list_light := - | children = separated_nonempty_sequence(Single_newline, list_light_item_unordered); { - let* children = children in - let span = Loc.span @@ List.map Loc.location children in - let inner = Loc.at span @@ `List (`Unordered, `Light, [ children ]) in - return inner - } - | children = separated_nonempty_sequence(Single_newline, list_light_item_ordered); { - let* children = children in - let span = Loc.span @@ List.map Loc.location children in - let inner = Loc.at span @@ `List (`Ordered, `Light, [ children ]) in - return inner + | children = separated_nonempty_sequence(Single_newline, list_light_item); { + Writer.bind children ~f:(fun c -> + let (list_kind, c) = split_light_list_items c in + let span = Loc.span @@ List.map Loc.location c in + `List (list_kind, `Light, [ c ]) + |> Loc.at span + |> return) } let list_opening := @@ -617,78 +601,80 @@ let list_opening := | DASH; { Tokens.DASH } let item_heavy := - | start_pos = located(list_opening); whitespace; items = sequence(nestable_block_element); RIGHT_BRACE; any_whitespace*; { - let Loc.{ value = token; location } = start_pos in - let warning = - Writer.Warning (Parse_error.should_not_be_empty ~what:(Tokens.describe token) location) - in - Writer.ensure not_empty warning items - } - | startpos = located(list_opening); items = sequence(nestable_block_element); endpos = located(RIGHT_BRACE); any_whitespace*; { + | start_pos = located(list_opening); whitespace; items = sequence(nestable_block_element); RIGHT_BRACE; any_whitespace*; { + let Loc.{ value = token; location } = start_pos in + let warning = + Writer.Warning (Parse_error.should_not_be_empty ~what:(Tokens.describe token) location) + in + Writer.ensure not_empty warning items + } + | startpos = located(list_opening); items = sequence(nestable_block_element); endpos = located(RIGHT_BRACE); any_whitespace*; { + let span = Loc.delimited startpos endpos in + let should_be_followed_by_whitespace = + Writer.Warning (Parse_error.should_be_followed_by_whitespace ~what:(Tokens.describe LI) span) + in + let should_not_be_empty = + Writer.Warning (Parse_error.should_not_be_empty ~what:(Tokens.describe LI) span) + in + Writer.ensure not_empty should_not_be_empty items + |> Writer.warning should_be_followed_by_whitespace + } + | startpos = located(DASH); items = sequence_nonempty(nestable_block_element)?; endpos = located(END); { + let end_not_allowed = + Writer.Warning (Parse_error.end_not_allowed ~in_what:(Tokens.describe DASH) endpos.Loc.location) + in + match items with + | Some writer -> + Writer.warning end_not_allowed writer + | None -> let span = Loc.delimited startpos endpos in - let should_be_followed_by_whitespace = - Writer.Warning (Parse_error.should_be_followed_by_whitespace ~what:(Tokens.describe LI) span) - in let should_not_be_empty = - Writer.Warning (Parse_error.should_not_be_empty ~what:(Tokens.describe LI) span) + Writer.Warning (Parse_error.should_not_be_empty ~what:(Tokens.describe DASH) span) in - Writer.ensure not_empty should_not_be_empty items - |> Writer.warning should_be_followed_by_whitespace - } - | startpos = located(DASH); items = sequence_nonempty(nestable_block_element)?; endpos = located(END); { - let end_not_allowed = - Writer.Warning (Parse_error.end_not_allowed ~in_what:(Tokens.describe DASH) endpos.Loc.location) - in - match items with - | Some writer -> - Writer.warning end_not_allowed writer - | None -> - let span = Loc.delimited startpos endpos in - let should_not_be_empty = - Writer.Warning (Parse_error.should_not_be_empty ~what:(Tokens.describe DASH) span) - in - Writer.with_warning [] should_not_be_empty |> Writer.warning end_not_allowed - } + Writer.return_warning [] should_not_be_empty + |> Writer.warning end_not_allowed + } let list_heavy := - | list_kind = located(List); whitespace?; items = sequence_nonempty(item_heavy); endpos = located(RIGHT_BRACE); { - let span = Loc.delimited list_kind endpos in - let* items : Ast.nestable_block_element Loc.with_location list list = items in - let inner = Loc.at span @@ `List (Tokens.ast_list_kind list_kind.Loc.value, `Heavy, items) in - return inner - } - | list_kind = located(List); endpos = located(RIGHT_BRACE); { - let span = Loc.delimited list_kind endpos in - let warning = - let what = Tokens.describe @@ List list_kind.Loc.value in - Writer.Warning (Parse_error.should_not_be_empty ~what span) - in - let node = Loc.at span @@ `List (Tokens.ast_list_kind list_kind.Loc.value, `Heavy, []) in - Writer.with_warning node warning - } - | list_kind = located(List); whitespace?; items = sequence_nonempty(item_heavy); errloc = position(error); { - let span = Loc.(span [list_kind.location; Loc.of_position errloc]) in - let warning = fun input -> - let (start_pos, end_pos) = errloc in - let illegal_input = Loc.extract ~input ~start_pos ~end_pos in - let in_what = Tokens.describe @@ List list_kind.Loc.value in - Parse_error.illegal ~in_what illegal_input span - in - let* items : Ast.nestable_block_element Loc.with_location list list = Writer.warning (Writer.InputNeeded warning) items in - let inner = Loc.at span @@ `List (Tokens.ast_list_kind list_kind.Loc.value, `Heavy, items) in - return inner - } - | list_kind = located(List); whitespace?; errloc = position(error); { - let span = Loc.(span [list_kind.location; Loc.of_position errloc]) in - let warning = fun input -> - let (start_pos, end_pos) = errloc in - let illegal_input = Loc.extract ~input ~start_pos ~end_pos in - let in_what = Tokens.describe (List list_kind.Loc.value) in - Parse_error.illegal ~in_what illegal_input span - in - let inner = Loc.at span @@ `List (Tokens.ast_list_kind list_kind.Loc.value, `Heavy, []) in - Writer.with_warning inner (Writer.InputNeeded warning) - } + | list_kind = located(List); whitespace?; items = sequence_nonempty(item_heavy); endpos = located(RIGHT_BRACE); { + let span = Loc.delimited list_kind endpos in + Writer.bind items ~f:(fun items -> + `List (Tokens.ast_list_kind list_kind.Loc.value, `Heavy, items) + |> Loc.at span + |> return) + } + | list_kind = located(List); endpos = located(RIGHT_BRACE); { + let span = Loc.delimited list_kind endpos in + let should_not_be_empty = + let what = Tokens.describe @@ List list_kind.Loc.value in + Writer.Warning (Parse_error.should_not_be_empty ~what span) + in + let node = Loc.at span @@ `List (Tokens.ast_list_kind list_kind.Loc.value, `Heavy, []) in + Writer.return_warning node should_not_be_empty + } + | list_kind = located(List); whitespace?; items = sequence_nonempty(item_heavy); errloc = position(error); { + let span = Loc.(span [list_kind.location; Loc.of_position errloc]) in + let illegal = Writer.InputNeeded (fun input -> + let (start_pos, end_pos) = errloc in + let illegal_input = Loc.extract ~input ~start_pos ~end_pos in + let in_what = Tokens.describe @@ List list_kind.Loc.value in + Parse_error.illegal ~in_what illegal_input span) + in + let* items : Ast.nestable_block_element Loc.with_location list list = Writer.warning illegal items in + let inner = Loc.at span @@ `List (Tokens.ast_list_kind list_kind.Loc.value, `Heavy, items) in + return inner + } + | list_kind = located(List); whitespace?; errloc = position(error); { + let span = Loc.(span [list_kind.location; Loc.of_position errloc]) in + let illegal = Writer.InputNeeded (fun input -> + let (start_pos, end_pos) = errloc in + let illegal_input = Loc.extract ~input ~start_pos ~end_pos in + let in_what = Tokens.describe (List list_kind.Loc.value) in + Parse_error.illegal ~in_what illegal_input span) + in + let inner = Loc.at span @@ `List (Tokens.ast_list_kind list_kind.Loc.value, `Heavy, []) in + Writer.return_warning inner illegal + } let odoc_list := | ~ = list_light; <> @@ -698,62 +684,59 @@ let odoc_list := let cell_heavy := | cell_kind = Table_cell; whitespace*; children = sequence_nonempty(nestable_block_element); RIGHT_BRACE; whitespace*; - { Writer.map (fun c -> (c, cell_kind)) children } + { Writer.map ~f:(fun c -> (c, cell_kind)) children } | cell_kind = Table_cell; RIGHT_BRACE; whitespace*; { return ([], cell_kind) } - | cell_kind = Table_cell; children = sequence_nonempty(nestable_block_element)?; errloc = position(error); - { - let warning = fun input -> - let (start_pos, end_pos) as loc = errloc in - let illegal_input = Loc.extract ~input ~start_pos ~end_pos in - let span = Loc.of_position loc in - let in_what = Tokens.describe @@ Table_cell cell_kind in - Parse_error.illegal ~in_what illegal_input span - in - Option.value ~default:(return []) children - |> Writer.map (fun children -> (children, cell_kind)) - |> Writer.warning (Writer.InputNeeded warning) - } + | cell_kind = Table_cell; children = sequence_nonempty(nestable_block_element)?; errloc = position(error); { + let illegal = Writer.InputNeeded (fun input -> + let (start_pos, end_pos) as loc = errloc in + let illegal_input = Loc.extract ~input ~start_pos ~end_pos in + let span = Loc.of_position loc in + let in_what = Tokens.describe @@ Table_cell cell_kind in + Parse_error.illegal ~in_what illegal_input span) + in + Option.value ~default:(return []) children + |> Writer.map ~f:(fun c -> (c, cell_kind)) + |> Writer.warning illegal + } let row_heavy := | TABLE_ROW; whitespace*; ~ = sequence_nonempty(cell_heavy); RIGHT_BRACE; whitespace*; <> | TABLE_ROW; whitespace*; RIGHT_BRACE; whitespace*; { return [] } - | TABLE_ROW; children = sequence_nonempty(cell_heavy)?; errloc = position(error); - { - let warning = fun input -> - let (start_pos, end_pos) as loc = errloc in - let illegal_input = Loc.extract ~input ~start_pos ~end_pos in - let span = Loc.of_position loc in - let in_what = Tokens.describe TABLE_ROW in - Parse_error.illegal ~in_what illegal_input span - in - Option.value ~default:(return []) children - |> Writer.warning (Writer.InputNeeded warning) - } + | TABLE_ROW; children = sequence_nonempty(cell_heavy)?; errloc = position(error); { + let illegal = Writer.InputNeeded (fun input -> + let (start_pos, end_pos) as loc = errloc in + let illegal_input = Loc.extract ~input ~start_pos ~end_pos in + let span = Loc.of_position loc in + let in_what = Tokens.describe TABLE_ROW in + Parse_error.illegal ~in_what illegal_input span) + in + Option.value ~default:(return []) children + |> Writer.warning illegal + } let table_heavy := | grid = delimited_location(TABLE_HEAVY, whitespace*; sequence_nonempty(row_heavy), RIGHT_BRACE); { - Writer.map (Loc.map (fun grid -> `Table ((grid, None), `Heavy))) (Writer.sequence_loc grid) + Loc.map (fun grid -> `Table ((grid, None), `Heavy)) <$> (Writer.sequence_loc grid) } | startpos = located(TABLE_HEAVY); endpos = located(RIGHT_BRACE); { let span = Loc.(span [startpos.location; endpos.location]) in let inner = Loc.at span @@ `Table (([], None), `Heavy) in return inner } - | startpos = located(TABLE_HEAVY); whitespace*; grid = sequence_nonempty(row_heavy)?; errloc = position(error); - { - let warning = fun input -> - let (start_pos, end_pos) as loc = errloc in - let illegal_input = Loc.extract ~input ~start_pos ~end_pos in - let span = Loc.of_position loc in - let in_what = Tokens.describe TABLE_HEAVY in - Parse_error.illegal ~in_what illegal_input span - in - let span = Loc.(span [startpos.location; (Loc.of_position errloc)]) in - Option.value ~default:(return []) grid - |> Writer.map (fun grid -> Loc.at span @@ `Table ((grid, None), `Heavy)) - |> Writer.warning (Writer.InputNeeded warning) - } + | startpos = located(TABLE_HEAVY); whitespace*; grid = sequence_nonempty(row_heavy)?; errloc = position(error); { + let illegal = Writer.InputNeeded (fun input -> + let (start_pos, end_pos) as loc = errloc in + let illegal_input = Loc.extract ~input ~start_pos ~end_pos in + let span = Loc.of_position loc in + let in_what = Tokens.describe TABLE_HEAVY in + Parse_error.illegal ~in_what illegal_input span) + in + let span = Loc.(span [startpos.location; (Loc.of_position errloc)]) in + Option.value ~default:(return []) grid + |> Writer.map ~f:(fun grid -> Loc.at span @@ `Table ((grid, None), `Heavy)) + |> Writer.warning illegal + } (* LIGHT TABLE *) @@ -765,15 +748,16 @@ let cell_inner := let text_span = Loc.extract ~start_pos ~end_pos ~input in Parse_error.illegal ~in_what:(Tokens.describe TABLE_LIGHT) text_span span) in - - (* NOTE: this is the best we can do right now, accepting a `nestable_block_element` - for example, causes a reduce/reduce conflict. So we have to lose some - information via the `error` keyword and return an empty word. + (* NOTE: (@FayCarsons) + This is the best we can do right now. Accepting a `nestable_block_element`, + for example, causes a reduce/reduce conflict. + So we have to lose some information(what the invalid element was) via the + `error` keyword and return an empty word. Maybe if we refactored the way that block elements/tags/sections handle whitespace we could remove the conflict while still being to match on those elements *) - Writer.with_warning (Loc.at span @@ `Word "") illegal + Writer.return_warning (Loc.at span @@ `Word "") illegal } let cell_content_light := ~ = sequence_nonempty(cell_inner); <> @@ -796,34 +780,33 @@ let rows_light := ~ = sequence_nonempty(row_light); <> let table_start_light := startpos = located(TABLE_LIGHT); any_whitespace*; { startpos } let table_light := - | startpos = table_start_light; data = rows_light; endpos = located(RIGHT_BRACE); { - let span = Loc.delimited startpos endpos in - Writer.map (construct_table ~span) data - } - | startpos = table_start_light; endpos = located(RIGHT_BRACE); { - let span = Loc.delimited startpos endpos in - let inner = Loc.at span @@ `Table (([[]], None), `Light) in - return inner - } - | startpos = table_start_light; data = rows_light; endpos = located(END); { - let in_what = Tokens.describe TABLE_LIGHT in - let warning = - let span = Loc.of_position $sloc in - Writer.Warning (Parse_error.end_not_allowed ~in_what span) - in - let span = Loc.delimited startpos endpos in - unclosed_table ~span ~data warning - } - | startpos = located(TABLE_LIGHT); any_whitespace?; endpos = located(END); - { - let in_what = Tokens.describe TABLE_LIGHT in - let warning = - let span = Loc.of_position $sloc in - Writer.Warning (Parse_error.end_not_allowed ~in_what span) - in - let span = Loc.delimited startpos endpos in - unclosed_table ~span warning - } + | startpos = table_start_light; data = rows_light; endpos = located(RIGHT_BRACE); { + let span = Loc.delimited startpos endpos in + construct_table ~span <$> data + } + | startpos = table_start_light; endpos = located(RIGHT_BRACE); { + let span = Loc.delimited startpos endpos in + let inner = Loc.at span @@ `Table (([[]], None), `Light) in + return inner + } + | startpos = table_start_light; data = rows_light; endpos = located(END); { + let in_what = Tokens.describe TABLE_LIGHT in + let warning = + let span = Loc.of_position $sloc in + Writer.Warning (Parse_error.end_not_allowed ~in_what span) + in + let span = Loc.delimited startpos endpos in + unclosed_table ~span ~data warning + } + | startpos = located(TABLE_LIGHT); any_whitespace?; endpos = located(END); { + let in_what = Tokens.describe TABLE_LIGHT in + let warning = + let span = Loc.of_position $sloc in + Writer.Warning (Parse_error.end_not_allowed ~in_what span) + in + let span = Loc.delimited startpos endpos in + unclosed_table ~span warning + } let table := | ~ = table_heavy; <> @@ -873,8 +856,10 @@ let paragraph_style := content = located(Paragraph_style); ws = paragraph; endpo in let start = content.Loc.value.Tokens.start in let endloc = endpos.Loc.location in - Writer.bind ws (fun ws -> - Writer.with_warning { ws with Loc.location = { endloc with start }} warning) + Writer.bind ws ~f:(fun ws -> + Writer.return_warning + { ws with Loc.location = { endloc with start }} + warning) } @@ -888,11 +873,11 @@ let verbatim := verbatim = located(Verbatim); { in let verbatim = Loc.at span @@ `Verbatim inner in Writer.ensure has_content warning (return inner) - |> Writer.map (Fun.const verbatim) + *> return verbatim } -let paragraph := items = sequence_nonempty(inline_element(horizontal_whitespace)); Single_newline?; { - Writer.map paragraph items +let paragraph := items = sequence_nonempty(inline_element(horizontal_whitespace)); { + paragraph <$> items } let code_block := @@ -904,7 +889,7 @@ let code_block := return @@ Loc.at { location with start } node } | content = located(Code_block_with_output); output = sequence_nonempty(nestable_block_element); RIGHT_CODE_DELIMITER; { - let* output = Writer.map Option.some output in + let* output = Option.some <$> output in let Loc.{ value = Tokens.{ inner; start }; location } = content in let Tokens.{metadata; delimiter; content} = inner in let meta = Option.map (fun Tokens.{language_tag; tags} -> Ast.{ language = language_tag; tags }) metadata in @@ -921,64 +906,59 @@ let math_block := inner = located(Math_block); { Writer.Warning (Parse_error.should_not_be_empty ~what span) in Writer.ensure has_content warning (return inner) - |> Writer.map (fun m -> Loc.at span @@ `Math_block m) + |> Writer.map ~f:(fun m -> Loc.at span @@ `Math_block m) } let modules := startpos = located(MODULES); modules = sequence(inline_element(whitespace)); endpos = located(RIGHT_BRACE); { let in_what = Tokens.describe MODULES in - let* modules = modules in - let not_allowed = - let span = Loc.span @@ List.map Loc.location modules in - let first_offending = - List.find_opt - (function - | `Word _ | `Space _ -> false - | _ -> true) - (List.map Loc.value modules : Ast.inline_element list) + Writer.bind modules ~f:(fun m -> + let not_allowed = + let span = Loc.span @@ List.map Loc.location m in + let first_offending = + List.find_opt + (function + | `Word _ | `Space _ -> false + | _ -> true) + (List.map Loc.value m : Ast.inline_element list) + in + let what = + Option.map Tokens.describe_inline first_offending + |> Option.value ~default:String.empty + in + Writer.Warning (Parse_error.not_allowed ~what ~in_what span) in - let what = Option.map Tokens.describe_inline first_offending |> Option.value ~default:String.empty in - Writer.Warning (Parse_error.not_allowed ~what ~in_what span) - in - let is_empty = - let span = Loc.span @@ List.map Loc.location modules in - let what = Tokens.describe MODULES in - Writer.Warning (Parse_error.should_not_be_empty ~what span) - in - let span = Loc.(span [startpos.location; endpos.location]) in - let inner = Loc.at span @@ `Modules (List.map (Loc.map inline_element_inner) modules) in - (* Test the content for errors *) - let* _ = List.fold_left - (fun writer (f, w) -> Writer.ensure f w writer) - (return modules) - [ - (* predicate + error pairs *) - (not_empty, is_empty); - (legal_module_list, not_allowed) - ] - in - return inner + let is_empty = + let span = Loc.span @@ List.map Loc.location m in + let what = Tokens.describe MODULES in + Writer.Warning (Parse_error.should_not_be_empty ~what span) + in + let span = Loc.(span [startpos.location; endpos.location]) in + let inner = Loc.at span @@ `Modules (List.map (Loc.map inline_element_inner) m) in + (* Test the content for errors, throwing away the value afterwards with `*>` *) + (Writer.ensure not_empty is_empty (return m) + |> Writer.ensure legal_module_list not_allowed) + *> return inner) } | startpos = located(MODULES); modules = sequence(inline_element(whitespace)); endpos = located(END); { let in_what = Tokens.describe MODULES in - let* modules = modules in - let span = Loc.span @@ List.map Loc.location modules in - let not_allowed = - let first_offending = - List.find_opt - (function - | `Word _ | `Space _ -> false - | _ -> true) - (List.map Loc.value modules : Ast.inline_element list) + Writer.bind modules ~f:(fun m -> + let span = Loc.span @@ List.map Loc.location m in + let not_allowed = + let first_offending = + List.find_opt + (function + | `Word _ | `Space _ -> false + | _ -> true) + (List.map Loc.value m : Ast.inline_element list) + in + let what = Option.map Tokens.describe_inline first_offending |> Option.value ~default:String.empty in + Writer.Warning (Parse_error.not_allowed ~what ~in_what span) in - let what = Option.map Tokens.describe_inline first_offending |> Option.value ~default:String.empty in - Writer.Warning (Parse_error.not_allowed ~what ~in_what span) - in - let unexpected_end = - Writer.Warning (Parse_error.end_not_allowed ~in_what:(Tokens.describe MODULES) span) - in - let span = Loc.(span [startpos.location; endpos.location]) in - let inner = Loc.at span @@ `Modules (List.map (Loc.map inline_element_inner) modules) in - return inner - |> Writer.warning not_allowed - |> Writer.warning unexpected_end + let unexpected_end = + Writer.Warning (Parse_error.end_not_allowed ~in_what:(Tokens.describe MODULES) span) + in + let span = Loc.(span [startpos.location; endpos.location]) in + let inner = Loc.at span @@ `Modules (List.map (Loc.map inline_element_inner) m) in + Writer.return_warning inner not_allowed + |> Writer.warning unexpected_end) } diff --git a/src/parser/parser_aux.ml b/src/parser/parser_aux.ml index e1cdd09ead..374b6f24ab 100644 --- a/src/parser/parser_aux.ml +++ b/src/parser/parser_aux.ml @@ -126,7 +126,7 @@ let unclosed_table match data with | Some data -> Writer.map - (fun data -> + ~f:(fun data -> Loc.at span @@ `Table ((List.map as_data data, None), `Light)) data | None -> @@ -169,3 +169,20 @@ let legal_module_list : Ast.inline_element Loc.with_location list -> bool = not_empty xs && List.for_all (function `Word _ | `Space _ -> true | _ -> false) @@ List.map Loc.value xs + +let light_list_item start item = + let open Tokens in + match start with + | MINUS -> `Unordered item + | PLUS -> `Ordered item + | _ -> assert false (* unreachable *) + +let or_insert = function None -> Option.some | o -> Fun.const o + +let split_light_list_items items = + let rec go acc list_kind = function + | `Ordered x :: xs -> go (x :: acc) (or_insert list_kind `Ordered) xs + | `Unordered x :: xs -> go (x :: acc) (or_insert list_kind `Unordered) xs + | [] -> (Option.get list_kind, List.rev acc) + in + go [] None items diff --git a/src/parser/writer.ml b/src/parser/writer.ml index 9e6aaaded5..70f0909204 100644 --- a/src/parser/writer.ml +++ b/src/parser/writer.ml @@ -1,65 +1,57 @@ (** An implementation of the Writer monad for parser error reporting *) type +'a t = Writer of ('a * warning list) -and warning = InputNeeded of (string -> Warning.t) | Warning of Warning.t -let run_warning : input:string -> warning -> Warning.t = - fun ~input warning -> - match warning with InputNeeded f -> f input | Warning w -> w +(** A warning can either be totally self-contained, or a function requiring + input. This is so that we can pass the input string in later in order + to extract spans where errors occur *) +and warning = InputNeeded of (string -> Warning.t) | Warning of Warning.t let return : 'a -> 'a t = fun x -> Writer (x, []) - -let bind : 'a t -> ('a -> 'b t) -> 'b t = - fun (Writer (node, warnings)) f -> +let bind : 'a t -> f:('a -> 'b t) -> 'b t = + fun (Writer (node, warnings)) ~f -> let (Writer (next, next_warnings)) = f node in Writer (next, warnings @ next_warnings) -let map : ('a -> 'b) -> 'a t -> 'b t = fun f w -> bind w (fun x -> return (f x)) +let map : f:('a -> 'b) -> 'a t -> 'b t = + fun ~f (Writer (x, ws)) -> Writer (f x, ws) -let seq_right : 'a t -> 'b t -> 'b t = - fun (Writer (_, ws)) (Writer (x, ws2)) -> Writer (x, ws @ ws2) +let ( <$> ) f w = map ~f w -let seq_left : 'a t -> 'b t -> 'a t = - fun (Writer (x, ws)) (Writer (_, ws2)) -> Writer (x, ws @ ws2) +let seq : 'a t -> 'b t -> 'b t = + fun (Writer (_, ws)) (Writer (x, ws2)) -> Writer (x, ws @ ws2) +let ( *> ) = seq +(** Useful functions for working with Writer.t *) module Prelude = struct let return = return - let ( >>= ) = bind - let ( let* ) = bind - let ( and* ) = bind - let ( let+ ) w f = map f w - let ( <$> ) = map - let ( *> ) = seq_right - let ( <* ) = seq_left + let ( let* ) w f = bind w ~f + let ( <$> ) = ( <$> ) + let ( *> ) = ( *> ) end -let warning warning (Writer (n, ws)) = Writer (n, warning :: ws) - let sequence : 'a t list -> 'a list t = fun xs -> - let go (ns, ws) (Writer (n, w)) = (n :: ns, w @ ws) in - let xs, ws = List.fold_left go ([], []) xs in - Writer (List.rev xs, ws) + let xs, ws = List.map (fun (Writer (x, ws)) -> (x, ws)) xs |> List.split in + Writer (xs, List.flatten ws) let sequence_loc : 'a t Loc.with_location -> 'a Loc.with_location t = - fun { value; location } -> map (Loc.at location) value + fun { value; location } -> Loc.at location <$> value -let map2 : ('a -> 'b -> 'c) -> 'a t -> 'b t -> 'c t = - fun f (Writer (a, ws)) (Writer (b, wsb)) -> Writer (f a b, wsb @ ws) - -let traverse : ('a -> 'b t) -> 'a list -> 'b list t = - fun f xs -> sequence (List.map f xs) +(** [warning Warning.t Writer.t] is equivalent to Haskell's [tell] *) +let warning warning (Writer (n, ws)) = Writer (n, warning :: ws) -let with_warning node warning = Writer (node, [ warning ]) +let return_warning node warning = Writer (node, [ warning ]) +(** [ensure pred warning Writer] Logs a warning if the predicate returns true + for the contained value *) let ensure : ('a -> bool) -> warning -> 'a t -> 'a t = fun pred warning (Writer (x, ws) as self) -> if pred x then self else Writer (x, warning :: ws) -let run : input:string -> Ast.t t -> Ast.t * Warning.t list = +let run : input:string -> 'a t -> 'a * Warning.t list = fun ~input (Writer (tree, warnings)) -> - (tree, List.map (run_warning ~input) warnings) + let go input = function InputNeeded f -> f input | Warning w -> w in + (tree, List.map (go input) warnings) -let unwrap : 'a t -> 'a = fun (Writer (x, _)) -> x -let unwrap_located : 'a Loc.with_location t -> 'a = - fun (Writer (Loc.{ value; _ }, _)) -> value +let get : 'a t -> 'a = fun (Writer (x, _)) -> x From 6d51e8e1f6efbcc3beff6d8a2cbd9ae9f4b7bd8c Mon Sep 17 00:00:00 2001 From: faycarsons Date: Fri, 10 Jan 2025 20:09:32 -0500 Subject: [PATCH 127/150] Improve light list parsing --- src/parser/loc.ml | 3 ++ src/parser/loc.mli | 3 ++ src/parser/odoc_parser.ml | 5 +-- src/parser/parser.mly | 64 +++++++++++++-------------------------- 4 files changed, 30 insertions(+), 45 deletions(-) diff --git a/src/parser/loc.ml b/src/parser/loc.ml index e12e8d3107..938a8f024b 100644 --- a/src/parser/loc.ml +++ b/src/parser/loc.ml @@ -41,6 +41,9 @@ let fmt { file; start; end_ } = "file: %s\nstart: { line : %d, col : %d }\nend: { line: %d; col: %d }" file sline scol eline ecol +let with_start_location : span -> 'a with_location -> 'a with_location = + fun { start; _ } self -> { self with location = { self.location with start } } + let span spans = match spans with | [] -> diff --git a/src/parser/loc.mli b/src/parser/loc.mli index aa14e9ed97..e9d8284f45 100644 --- a/src/parser/loc.mli +++ b/src/parser/loc.mli @@ -37,6 +37,9 @@ val fmt : span -> string val delimited : 'a with_location -> 'b with_location -> span (** Returns the span which contains both arguments *) +val with_start_location : span -> 'a with_location -> 'a with_location +(** [with_start_location start t] return t with it's starting point = start *) + val nudge_map_start : int -> 'a with_location -> 'a with_location val nudge_map_end : int -> 'a with_location -> 'a with_location diff --git a/src/parser/odoc_parser.ml b/src/parser/odoc_parser.ml index 7626e26310..1bb5419db4 100644 --- a/src/parser/odoc_parser.ml +++ b/src/parser/odoc_parser.ml @@ -119,8 +119,9 @@ let parse_comment : location:Lexing.position -> text:string -> t = let lexbuf = Lexing.from_string text in (* We cannot directly pass parameters to Menhir without converting our parser to a module functor. So we pass our current filename to the lexbuf here *) - Lexing.(set_filename lexbuf location.pos_fname); - Lexing.(set_position lexbuf location); + Lexing.( + set_filename lexbuf location.pos_fname; + set_position lexbuf location); let lexer_state = Lexer. { diff --git a/src/parser/parser.mly b/src/parser/parser.mly index a3c07c7dcc..54636c2e11 100644 --- a/src/parser/parser.mly +++ b/src/parser/parser.mly @@ -126,26 +126,6 @@ let toplevel := | ~ = toplevel_error; <> let toplevel_error := - | errloc = position(PLUS); whitespace?; { - let span = Loc.of_position errloc in - let warning = - let what = Tokens.describe PLUS in - Writer.Warning (Parse_error.bad_markup what span) - in - let as_text = Loc.at span @@ `Word "+" in - let node = (Loc.same as_text @@ `Paragraph [ as_text ]) in - Writer.return_warning node warning - } - | errloc = position(MINUS); whitespace?; { - let span = Loc.of_position errloc in - let warning = - let what = Tokens.describe MINUS in - Writer.Warning (Parse_error.bad_markup what span) - in - let as_text = Loc.at span @@ `Word "-" in - let node = (Loc.same as_text @@ `Paragraph [ as_text ]) in - Writer.return_warning node warning - } | err = located(list_opening); horizontal_whitespace; children = sequence_nonempty(inline_element(horizontal_whitespace)); endpos = located(RIGHT_BRACE)?; { let default = Writer.get children @@ -207,20 +187,6 @@ let toplevel_error := let node = Loc.same as_text @@ `Paragraph [ as_text ] in Writer.return_warning node warning } - | list = list_light; { - Writer.bind list ~f:(fun list -> - let warning = - let Loc.{ value; location } = list in - let what = Tokens.describe @@ - match value with - | `List (`Ordered, `Light, _) -> Tokens.PLUS - | `List (`Unordered, `Light, _) -> Tokens.MINUS - | _ -> assert false (* Unreachable *) - in - Parse_error.should_begin_on_its_own_line ~what location - in - Writer.return_warning (list :> Ast.block_element Loc.with_location) (Writer.Warning warning)) - } (* SECTION HEADING *) @@ -427,6 +393,14 @@ let inline_element_without_whitespace := let Loc.{ value = Tokens.{ start; inner }; location } = m in return @@ Loc.at { location with start } (`Math_span inner) } + (* + | minus_or_plus = located( list_light_start ); { + return @@ + Loc.map + Tokens.(function MINUS -> `Word "-" | PLUS -> `Word "+" | _ -> assert false) + minus_or_plus + } + *) (* More complex/recursive inline elements should have their own rule *) | ~ = style; <> | ~ = reference; <> @@ -569,25 +543,29 @@ let link := (* LIST *) +(*TODO: For some reason this is always matching the `should_begin it's own line` branch?? *) let list_light_start := | MINUS; { Tokens.MINUS } | PLUS; { Tokens.PLUS } let list_light_item := - | start = list_light_start; horizontal_whitespace; item = nestable_block_element; { - light_list_item start <$> item + | start = located(list_light_start); item = nestable_block_element; { + let Loc.{ value; location } = start in + light_list_item value <$> (Loc.with_start_location location <$> item) } - | horizontal_whitespace; start = list_light_start; item = nestable_block_element; { + | horizontal_whitespace; start = located(list_light_start); item = nestable_block_element; { let should_begin_on_its_own_line = let span = Loc.of_position $sloc in Writer.Warning (Parse_error.should_begin_on_its_own_line ~what:(Tokens.describe MINUS) span) in - (light_list_item start <$> item) + let Loc.{ value; location } = start in + Writer.map ~f:(Loc.with_start_location location) item + |> Writer.map ~f:(light_list_item value) |> Writer.warning should_begin_on_its_own_line } let list_light := - | children = separated_nonempty_sequence(Single_newline, list_light_item); { + | children = separated_nonempty_sequence(whitespace*, list_light_item); { Writer.bind children ~f:(fun c -> let (list_kind, c) = split_light_list_items c in let span = Loc.span @@ List.map Loc.location c in @@ -883,16 +861,16 @@ let paragraph := items = sequence_nonempty(inline_element(horizontal_whitespace) let code_block := | content = located(Code_block); { let Loc.{ value = Tokens.{ inner; start }; location } = content in - let Tokens.{metadata; delimiter; content} = inner in - let meta = Option.map (fun Tokens.{language_tag; tags} -> Ast.{ language = language_tag; tags }) metadata in + let Tokens.{ metadata; delimiter; content } = inner in + let meta = Option.map (fun Tokens.{ language_tag; tags } -> Ast.{ language = language_tag; tags }) metadata in let node = `Code_block Ast.{ meta; delimiter; content; output = None } in return @@ Loc.at { location with start } node } | content = located(Code_block_with_output); output = sequence_nonempty(nestable_block_element); RIGHT_CODE_DELIMITER; { let* output = Option.some <$> output in let Loc.{ value = Tokens.{ inner; start }; location } = content in - let Tokens.{metadata; delimiter; content} = inner in - let meta = Option.map (fun Tokens.{language_tag; tags} -> Ast.{ language = language_tag; tags }) metadata in + let Tokens.{ metadata; delimiter; content } = inner in + let meta = Option.map (fun Tokens.{ language_tag; tags } -> Ast.{ language = language_tag; tags }) metadata in let node = `Code_block Ast.{ meta; delimiter; content; output } in return @@ Loc.at { location with start } node } From ebe4c0f4a099f7f8fd9add2dbc14b5ff41f8f8bb Mon Sep 17 00:00:00 2001 From: faycarsons Date: Sat, 11 Jan 2025 10:45:28 -0500 Subject: [PATCH 128/150] Add Menhir tokens for new tags --- dune-project | 2 -- src/parser/lexer.mll | 8 ++++---- src/parser/parser.mly | 5 ++++- src/parser/test/serialize.ml | 1 + src/parser/tokens.ml | 17 +++++++++++++++-- 5 files changed, 24 insertions(+), 9 deletions(-) diff --git a/dune-project b/dune-project index 19c9d4f887..968f25cd5c 100644 --- a/dune-project +++ b/dune-project @@ -4,8 +4,6 @@ (documentation "https://ocaml.org/p/odoc") -(using menhir 2.1) - (source (github ocaml/odoc)) diff --git a/src/parser/lexer.mll b/src/parser/lexer.mll index b4bf6660b4..810be355da 100644 --- a/src/parser/lexer.mll +++ b/src/parser/lexer.mll @@ -617,16 +617,16 @@ and token input = parse { RETURN } | ("@children_order") - { emit input (`Tag `Children_order) } + { CHILDREN_ORDER } | ("@toc_status") - { emit input (`Tag `Toc_status) } + { TOC_STATUS } | ("@order_category") - { emit input (`Tag `Order_category) } + { ORDER_CATEGORY } | ("@short_title") - { emit input (`Tag `Short_title) } + { SHORT_TITLE } | "@see" horizontal_space* '<' ([^ '>']* as url) '>' { let start = input.offset_to_location @@ Lexing.lexeme_start lexbuf in diff --git a/src/parser/parser.mly b/src/parser/parser.mly index 54636c2e11..ec7c8fbadb 100644 --- a/src/parser/parser.mly +++ b/src/parser/parser.mly @@ -60,7 +60,10 @@ %token OPEN "@open" %token CLOSED "@closed" %token HIDDEN "@hidden" - +%token CHILDREN_ORDER "@children_order" +%token TOC_STATUS "@toc_status" +%token ORDER_CATEGORY "@order_category" +%token SHORT_TITLE "@short_title" %token Simple_ref "{!" %token Ref_with_replacement "{{!" %token Simple_link "{:" diff --git a/src/parser/test/serialize.ml b/src/parser/test/serialize.ml index 8dcf0e6f3b..7449d1545c 100644 --- a/src/parser/test/serialize.ml +++ b/src/parser/test/serialize.ml @@ -164,6 +164,7 @@ module Ast_to_sexp = struct | `Open -> Atom "@open" | `Closed -> Atom "@closed" | `Hidden -> Atom "@hidden" + | _ -> failwith "TODO" let block_element at : Ast.block_element -> sexp = function | #Ast.nestable_block_element as e -> nestable_block_element at e diff --git a/src/parser/tokens.ml b/src/parser/tokens.ml index d0633244b5..016966ab22 100644 --- a/src/parser/tokens.ml +++ b/src/parser/tokens.ml @@ -79,6 +79,10 @@ type token = | OPEN | CLOSED | HIDDEN + | CHILDREN_ORDER + | TOC_STATUS + | ORDER_CATEGORY + | SHORT_TITLE | Raw_markup of (string option * string) with_start_pos | END @@ -151,7 +155,11 @@ let print : token -> string = function | INLINE -> "'@inline'" | OPEN -> "'@open'" | CLOSED -> "'@closed'" - | HIDDEN -> "'@hidden" + | HIDDEN -> "'@hidden'" + | CHILDREN_ORDER -> "'@children_order" + | TOC_STATUS -> "'@toc_status'" + | ORDER_CATEGORY -> "'@order_category'" + | SHORT_TITLE -> "@'short_title'" | Raw_markup { inner = None, _; _ } -> "'{%...%}'" | Raw_markup { inner = Some target, _; _ } -> "'{%" ^ target ^ ":...%}'" | END -> "EOI" @@ -219,7 +227,11 @@ let describe : token -> string = function | INLINE -> "'@inline'" | OPEN -> "'@open'" | CLOSED -> "'@closed'" - | HIDDEN -> "'@hidden" + | HIDDEN -> "'@hidden'" + | CHILDREN_ORDER -> "'@children_order" + | TOC_STATUS -> "'@toc_status'" + | ORDER_CATEGORY -> "'@order_category'" + | SHORT_TITLE -> "@'short_title'" let empty_code_block = { @@ -320,3 +332,4 @@ let describe_tag : Ast.tag -> string = function describe @@ Canonical { inner; start = Loc.dummy_pos } | `Hidden -> describe HIDDEN | `Inline -> describe INLINE + | _ -> assert false (* New tags currently unreachable *) From a9daa8cade137fa0b2f04ef3e1a68d6ba4b834d6 Mon Sep 17 00:00:00 2001 From: faycarsons Date: Mon, 13 Jan 2025 11:36:39 -0500 Subject: [PATCH 129/150] Remove old files, add parsing rules for new tags --- src/parser/lexer.mll | 5 - src/parser/parser.mly | 126 ++-- src/parser/syntax.ml | 1504 ----------------------------------------- src/parser/token.ml | 256 ------- src/parser/tokens.ml | 58 +- 5 files changed, 115 insertions(+), 1834 deletions(-) delete mode 100644 src/parser/syntax.ml delete mode 100644 src/parser/token.ml diff --git a/src/parser/lexer.mll b/src/parser/lexer.mll index 810be355da..edbd4f3e77 100644 --- a/src/parser/lexer.mll +++ b/src/parser/lexer.mll @@ -640,11 +640,6 @@ and token input = parse { let start = input.offset_to_location @@ Lexing.lexeme_start lexbuf in See { inner = (Document, trim_horizontal_start name); start } } - (* NOTE: These tags will match the whitespace preceding the content and pass - that to the token. I've tried to match on the whitespace as a separate - thing from the token body but that seems to cause problems. - This is (maybe?) an issue because the tests expect the token body to have - no leading whitespace. What do we do here? *) | "@since" horizontal_space+ (([^ '\r' '\n']+) as inner) { let start = input.offset_to_location @@ Lexing.lexeme_start lexbuf in Since { inner; start } } diff --git a/src/parser/parser.mly b/src/parser/parser.mly index ec7c8fbadb..916a4a7fdc 100644 --- a/src/parser/parser.mly +++ b/src/parser/parser.mly @@ -18,18 +18,18 @@ %token Style "{i" (* or '{b' etc *) (* or '{C' or '{R', but this syntax has been deprecated and is only kept around so legacy codebases don't break :p *) -%token Paragraph_style "{L" +%token Paragraph_style "{L" %token MODULES "{!modules:" -%token Math_span "{m" -%token Math_block "{math" +%token Math_span "{m" +%token Math_block "{math" -%token <(string option * string) Tokens.with_start_pos> Raw_markup "{%%}" +%token <(string option * string) Tokens.with_start_point> Raw_markup "{%%}" -%token Code_block "{[]}" (* Self-contained code-block *) -%token Code_block_with_output "{[][" (* Code block that expects some block elements *) -%token Code_span "[]" +%token Code_block "{[]}" (* Self-contained code-block *) +%token Code_block_with_output "{[][" (* Code block that expects some block elements *) +%token Code_span "[]" %token List "{ol" (* or '{ul' *) %token LI "{li" @@ -43,19 +43,19 @@ %token Table_cell "{td" (* or '{th' for header *) (* Where N is an integer *) -%token <(int * string option) Tokens.with_start_pos> Section_heading "{N:" +%token <(int * string option) Tokens.with_start_point> Section_heading "{N:" (* Tags *) -%token Author "@author" +%token Author "@author" %token DEPRECATED "@deprecated" -%token Param "@param" -%token Raise "@raise(s)" +%token Param "@param" +%token Raise "@raise(s)" %token RETURN "@return" -%token <(Tokens.internal_reference * string) Tokens.with_start_pos> See "@see" -%token Since "@since" -%token Before "@before" -%token Version "@version" -%token Canonical "@canonical" +%token <(Tokens.internal_reference * string) Tokens.with_start_point> See "@see" +%token Since "@since" +%token Before "@before" +%token Version "@version" +%token Canonical "@canonical" %token INLINE "@inline" %token OPEN "@open" %token CLOSED "@closed" @@ -64,13 +64,13 @@ %token TOC_STATUS "@toc_status" %token ORDER_CATEGORY "@order_category" %token SHORT_TITLE "@short_title" -%token Simple_ref "{!" -%token Ref_with_replacement "{{!" -%token Simple_link "{:" -%token Link_with_replacement "{{:" -%token <(Tokens.media * Tokens.media_target) Tokens.with_start_pos> Media "{(format)!" -%token <(Tokens.media * Tokens.media_target * string) Tokens.with_start_pos> Media_with_replacement "{(format):" (* where 'format' is audio, video, image *) -%token Verbatim "{v" +%token Simple_ref "{!" +%token Ref_with_replacement "{{!" +%token Simple_link "{:" +%token Link_with_replacement "{{:" +%token <(Tokens.media * Tokens.media_target) Tokens.with_start_point> Media "{(format)!" +%token <(Tokens.media * Tokens.media_target * string) Tokens.with_start_point> Media_with_replacement "{(format):" (* where 'format' is audio, video, image *) +%token Verbatim "{v" %token END @@ -87,13 +87,22 @@ (* Utilities which wraps the return value of a rule in `Loc.with_location` *) let locatedM(rule) == inner = rule; { wrap_location $sloc <$> inner } let located(rule) == inner = rule; { wrap_location $sloc inner } -let with_position(rule) == inner = rule; { (inner, $sloc) } -let position(rule) == _ = rule; { $sloc } let delimited_location(opening, rule, closing) := startpos = located(opening); inner = rule; endpos = located(closing); { let span = Loc.delimited startpos endpos in Loc.at span inner } +(* + When we have to handle errors with Menhir's `error` token, we need + `Lexing.position` as opposed to `Loc.with_location` so that we can cleanly + take a slice of the input text with those positions, which would be + difficult using `Loc.with_location` because of the way the lexbuf tracks + position +*) +let with_position(rule) == inner = rule; { (inner, $sloc) } +let position(rule) == _ = rule; { $sloc } + + (* Utilities for working inside the Writer.t monad *) let sequence(rule) == xs = list(rule); { Writer.sequence xs } let sequence_nonempty(rule) == xs = nonempty_list(rule); { Writer.sequence xs } @@ -101,7 +110,6 @@ let separated_nonempty_sequence(sep, rule) := xs = separated_nonempty_list(sep, let separated_sequence(sep, rule) := xs = separated_list(sep, rule); { Writer.sequence xs } (* WHITESPACE *) - let horizontal_whitespace := ~ = Space; <`Space> let whitespace := @@ -129,6 +137,7 @@ let toplevel := | ~ = toplevel_error; <> let toplevel_error := + (* Stray heavy list items, `{li` or `{-` *) | err = located(list_opening); horizontal_whitespace; children = sequence_nonempty(inline_element(horizontal_whitespace)); endpos = located(RIGHT_BRACE)?; { let default = Writer.get children @@ -160,6 +169,9 @@ let toplevel_error := else Writer.warning unclosed m) } + (* TODO: These(bar, +, -) need to be handled in paragraphs, where they should + be turned into text without emitting a warning + *) | errloc = position(BAR); whitespace?; { let span = Loc.of_position errloc in let warning = @@ -238,6 +250,10 @@ let tag_with_content := | ~ = param; <> | ~ = deprecated; <> | ~ = return; <> + | ~ = children_order; <> + | ~ = toc_status; <> + | ~ = order_category; <> + | ~ = short_title; <> let return := | startpos = located(RETURN); horizontal_whitespace; children = located(sequence_nonempty(nestable_block_element)); { @@ -319,6 +335,48 @@ let param := return @@ Loc.at { location with start } (`Param (inner, [])) } + +let children_order := + | start = located(CHILDREN_ORDER); horizontal_whitespace; children = sequence(nestable_block_element); { + Writer.bind children ~f:(function + | _ :: _ as children -> + let span = Loc.span @@ start.Loc.location :: List.map Loc.location children in + return @@ Loc.at span (`Children_order children) + | [] -> + return @@ Loc.map (Fun.const @@ `Children_order []) start) + } + + +let toc_status := + | start = located(TOC_STATUS); horizontal_whitespace; children = sequence(nestable_block_element); { + Writer.bind children ~f:(function + | _ :: _ as children -> + let span = Loc.span @@ start.Loc.location :: List.map Loc.location children in + return @@ Loc.at span (`Toc_status children) + | [] -> + return @@ Loc.map (Fun.const @@ `Toc_status []) start) + } + +let order_category := + | start = located(ORDER_CATEGORY); horizontal_whitespace; children = sequence(nestable_block_element); { + Writer.bind children ~f:(function + | _ :: _ as children -> + let span = Loc.span @@ start.Loc.location :: List.map Loc.location children in + return @@ Loc.at span (`Order_category children) + | [] -> + return @@ Loc.map (Fun.const (`Order_category [])) start) + } + +let short_title := + | start = located(SHORT_TITLE); horizontal_whitespace; children = sequence(nestable_block_element); { + Writer.bind children ~f:(function + | _ :: _ as children -> + let span = Loc.span @@ start.Loc.location :: List.map Loc.location children in + return @@ Loc.at span (`Short_title children) + | [] -> + return @@ Loc.map (Fun.const (`Short_title [])) start) + } + let tag_bare := | content = located(Version); horizontal_whitespace?; { let Loc.{ value; location } = content in @@ -369,14 +427,6 @@ let tag_bare := | pos = position(CLOSED); whitespace?; { let loc = Loc.of_position pos in return @@ Loc.at loc `Closed } | pos = position(HIDDEN); whitespace?; { let loc = Loc.of_position pos in return @@ Loc.at loc `Hidden } -(* INLINE ELEMENTS *) - -(* - If we're calling `inline_element` from another rule higher up the parse tree, - it should only accept horizontal whitespace. - If we're calling it recursively from an inline element with some delimiters, - we can allow any whitespace -*) let inline_element(ws) := | ~ = inline_element_without_whitespace; <> | s = located(ws); { return s } @@ -396,14 +446,6 @@ let inline_element_without_whitespace := let Loc.{ value = Tokens.{ start; inner }; location } = m in return @@ Loc.at { location with start } (`Math_span inner) } - (* - | minus_or_plus = located( list_light_start ); { - return @@ - Loc.map - Tokens.(function MINUS -> `Word "-" | PLUS -> `Word "+" | _ -> assert false) - minus_or_plus - } - *) (* More complex/recursive inline elements should have their own rule *) | ~ = style; <> | ~ = reference; <> diff --git a/src/parser/syntax.ml b/src/parser/syntax.ml deleted file mode 100644 index 74ecf88a34..0000000000 --- a/src/parser/syntax.ml +++ /dev/null @@ -1,1504 +0,0 @@ -(* This module is a recursive descent parser for the ocamldoc syntax. The parser - consumes a token stream of type [Token.t Stream.t], provided by the lexer, - and produces a comment AST of the type defined in [Parser_.Ast]. - - The AST has two main levels: inline elements, which can appear inside - paragraphs, and are spaced horizontally when presented, and block elements, - such as paragraphs and lists, which are spaced vertically when presented. - Block elements contain inline elements, but not vice versa. - - Corresponding to this, the parser has three "main" functions: - - - [delimited_inline_element_list] parses a run of inline elements that is - delimited by curly brace markup ([{...}]). - - [paragraph] parses a run of inline elements that make up a paragraph, and - is not explicitly delimited with curly braces. - - [block_element_list] parses a sequence of block elements. A comment is a - sequence of block elements, so [block_element_list] is the top-level - parser. It is also used for list item and tag content. *) - -open! Compat - -type 'a with_location = 'a Loc.with_location - -(* {2 Input} *) - -type input = { - tokens : Token.t Loc.with_location Stream.t; - warnings : Warning.t list ref; -} - -(* {2 Output} *) - -let add_warning input warning = input.warnings := warning :: !(input.warnings) -let junk input = Stream.junk input.tokens - -let peek input = - match Stream.peek input.tokens with - | Some token -> token - | None -> assert false - -module Table = struct - module Light_syntax = struct - let valid_align = function - | [ { Loc.value = `Word w; _ } ] -> ( - match String.length w with - | 0 -> `Valid None - | 1 -> ( - match w with - | "-" -> `Valid None - | ":" -> `Valid (Some `Center) - | _ -> `Invalid) - | len -> - if String.for_all (Char.equal '-') (String.sub w 1 (len - 2)) then - match (String.get w 0, String.get w (len - 1)) with - | ':', ':' -> `Valid (Some `Center) - | ':', '-' -> `Valid (Some `Left) - | '-', ':' -> `Valid (Some `Right) - | '-', '-' -> `Valid None - | _ -> `Invalid - else `Invalid) - | _ -> `Invalid - - let valid_align_row lx = - let rec loop acc = function - | [] -> Some (List.rev acc) - | x :: q -> ( - match valid_align x with - | `Invalid -> None - | `Valid alignment -> loop (alignment :: acc) q) - in - loop [] lx - - let create ~grid ~align : Ast.table = - let cell_to_block (x, k) = - let whole_loc = Loc.span (List.map (fun x -> x.Loc.location) x) in - match x with - | [] -> ([], k) - | _ -> ([ Loc.at whole_loc (`Paragraph x) ], k) - in - let row_to_block = List.map cell_to_block in - let grid_to_block = List.map row_to_block in - ((grid_to_block grid, align), `Light) - - let with_kind kind : 'a with_location list list -> 'a Ast.row = - List.map (fun c -> (c, kind)) - - let from_raw_data grid : Ast.table = - match grid with - | [] -> create ~grid:[] ~align:None - | row1 :: rows2_N -> ( - match valid_align_row row1 with - (* If the first line is the align row, everything else is data. *) - | Some _ as align -> - create ~grid:(List.map (with_kind `Data) rows2_N) ~align - | None -> ( - match rows2_N with - (* Only 1 line, if this is not the align row this is data. *) - | [] -> create ~grid:[ with_kind `Data row1 ] ~align:None - | row2 :: rows3_N -> ( - match valid_align_row row2 with - (* If the second line is the align row, the first one is the - header and the rest is data. *) - | Some _ as align -> - let header = with_kind `Header row1 in - let data = List.map (with_kind `Data) rows3_N in - create ~grid:(header :: data) ~align - (* No align row in the first 2 lines, everything is considered - data. *) - | None -> - create ~grid:(List.map (with_kind `Data) grid) ~align:None - ))) - end - - module Heavy_syntax = struct - let create ~grid : Ast.table = ((grid, None), `Heavy) - let from_grid grid : Ast.table = create ~grid - end -end - -module Reader = struct - let until_rbrace_or_eof input acc = - let rec consume () = - let next_token = peek input in - match next_token.value with - | `Right_brace -> - junk input; - `End (acc, next_token.location) - | `End -> - Parse_error.end_not_allowed next_token.location ~in_what:"table" - |> add_warning input; - junk input; - `End (acc, next_token.location) - | `Space _ | `Single_newline _ | `Blank_line _ -> - junk input; - consume () - | _ -> `Token next_token - in - consume () - - module Infix = struct - let ( >>> ) consume if_token = - match consume with - | `End (ret, loc) -> (ret, loc) - | `Token t -> if_token t - end -end - -open Reader.Infix - -(* The last token in the stream is always [`End], and it is never consumed by - the parser, so the [None] case is impossible. *) - -let npeek n input = Stream.npeek n input.tokens - -(* {2 Non-link inline elements} *) -type style = [ `Bold | `Italic | `Emphasis | `Superscript | `Subscript ] - -(* Convenient abbreviation for use in patterns. *) -type token_that_always_begins_an_inline_element = - [ `Word of string - | `Code_span of string - | `Raw_markup of string option * string - | `Begin_style of style - | `Simple_reference of string - | `Begin_reference_with_replacement_text of string - | `Simple_link of string - | `Begin_link_with_replacement_text of string - | `Math_span of string ] - -(* Check that the token constructors above actually are all in [Token.t]. *) -let _check_subset : token_that_always_begins_an_inline_element -> Token.t = - fun t -> (t :> Token.t) - -(* Consumes tokens that make up a single non-link inline element: - - - a horizontal space ([`Space], significant in inline elements), - - a word (see [word]), - - a code span ([...], [`Code_span _]), or - - styled text ({e ...}). - - The latter requires a recursive call to [delimited_inline_element_list], - defined below. - - This should be part of [delimited_inline_element_list]; however, it is also - called by function [paragraph]. As a result, it is factored out, and made - mutually-recursive with [delimited_inline_element_list]. - - This is called only when it is known that the first token in the list is the - beginning of an inline element. In the case of [`Minus] and [`Plus], that - means the caller has determined that they are not a list bullet (i.e., not - the first non-whitespace tokens on their line). - - This function consumes exactly the tokens that make up the element. *) -let rec inline_element : - input -> Loc.span -> _ -> Ast.inline_element with_location = - fun input location next_token -> - match next_token with - | `Space _ as token -> - junk input; - Loc.at location token - | `Word _ as token -> - junk input; - Loc.at location token - (* This is actually the same memory representation as the token, complete - with location, and is probably the most common case. Perhaps the token - can be reused somehow. The same is true of [`Space], [`Code_span]. *) - | `Minus -> - junk input; - Loc.at location (`Word "-") - | `Plus -> - junk input; - Loc.at location (`Word "+") - | `Bar -> - junk input; - Loc.at location (`Word "|") - | (`Code_span _ | `Math_span _ | `Raw_markup _) as token -> - junk input; - Loc.at location token - | `Begin_style s as parent_markup -> - junk input; - - let requires_leading_whitespace = - match s with - | `Bold | `Italic | `Emphasis -> true - | `Superscript | `Subscript -> false - in - let content, brace_location = - delimited_inline_element_list ~parent_markup - ~parent_markup_location:location ~requires_leading_whitespace input - in - - let location = Loc.span [ location; brace_location ] in - - if content = [] then - Parse_error.should_not_be_empty - ~what:(Token.describe parent_markup) - location - |> add_warning input; - - Loc.at location (`Styled (s, content)) - | `Simple_reference r -> - junk input; - - let r_location = Loc.nudge_start (String.length "{!") location in - let r = Loc.at r_location r in - - Loc.at location (`Reference (`Simple, r, [])) - | `Begin_reference_with_replacement_text r as parent_markup -> - junk input; - - let r_location = Loc.nudge_start (String.length "{{!") location in - let r = Loc.at r_location r in - - let content, brace_location = - delimited_inline_element_list ~parent_markup - ~parent_markup_location:location ~requires_leading_whitespace:false - input - in - - let location = Loc.span [ location; brace_location ] in - - if content = [] then - Parse_error.should_not_be_empty - ~what:(Token.describe parent_markup) - location - |> add_warning input; - - Loc.at location (`Reference (`With_text, r, content)) - | `Simple_link u -> - junk input; - - let u = String.trim u in - - if u = "" then - Parse_error.should_not_be_empty - ~what:(Token.describe next_token) - location - |> add_warning input; - - Loc.at location (`Link (u, [])) - | `Begin_link_with_replacement_text u as parent_markup -> - junk input; - - let u = String.trim u in - - if u = "" then - Parse_error.should_not_be_empty - ~what:(Token.describe parent_markup) - location - |> add_warning input; - - let content, brace_location = - delimited_inline_element_list ~parent_markup - ~parent_markup_location:location ~requires_leading_whitespace:false - input - in - - `Link (u, content) |> Loc.at (Loc.span [ location; brace_location ]) - -(* Consumes tokens that make up a sequence of inline elements that is ended by - a '}', a [`Right_brace] token. The brace token is also consumed. - - The sequences are also preceded by some markup like '{b'. Some of these - markup tokens require whitespace immediately after the token, and others not. - The caller indicates which way that is through the - [~requires_leading_whitespace] argument. - - Whitespace is significant in inline element lists. In particular, "foo [bar]" - is represented as [`Word "foo"; `Space; `Code_span "bar"], while "foo[bar]" - is [`Word "foo"; `Code_span "bar"]. It doesn't matter how much whitespace is - there, just whether it is present or not. Single newlines and horizontal - space in any amount are allowed. Blank lines are not, as these are separators - for {e block} elements. - - In correct input, the first and last elements emitted will not be [`Space], - i.e. [`Space] appears only between other non-link inline elements. In - incorrect input, there might be [`Space] followed immediately by something - like an @author tag. - - The [~parent_markup] and [~parent_markup_location] arguments are used for - generating error messages. *) -and delimited_inline_element_list : - parent_markup:[< Token.t ] -> - parent_markup_location:Loc.span -> - requires_leading_whitespace:bool -> - input -> - Ast.inline_element with_location list * Loc.span = - fun ~parent_markup ~parent_markup_location ~requires_leading_whitespace input -> - (* [~at_start_of_line] is used to interpret [`Minus] and [`Plus]. These are - word tokens if not the first non-whitespace tokens on their line. Then, - they are allowed in a non-link element list. *) - let rec consume_elements : - at_start_of_line:bool -> - Ast.inline_element with_location list -> - Ast.inline_element with_location list * Loc.span = - fun ~at_start_of_line acc -> - let next_token = peek input in - match next_token.value with - | `Right_brace -> - junk input; - (List.rev acc, next_token.location) - (* The [`Space] token is not space at the beginning or end of line, because - that is combined into [`Single_newline] or [`Blank_line] tokens. It is - also not at the beginning of markup (after e.g. '{b'), because that is - handled separately before calling - [consume_non_link_inline_elements], and not immediately before '}', - because that is combined into the [`Right_brace] token by the lexer. So, - it is an internal space, and we want to add it to the non-link inline - element list. *) - | (`Space _ | #token_that_always_begins_an_inline_element) as token -> - let acc = inline_element input next_token.location token :: acc in - consume_elements ~at_start_of_line:false acc - | `Single_newline ws -> - junk input; - let element = Loc.same next_token (`Space ws) in - consume_elements ~at_start_of_line:true (element :: acc) - | `Blank_line ws as blank -> - Parse_error.not_allowed ~what:(Token.describe blank) - ~in_what:(Token.describe parent_markup) - next_token.location - |> add_warning input; - - junk input; - let element = Loc.same next_token (`Space ws) in - consume_elements ~at_start_of_line:true (element :: acc) - | `Bar as token -> - let acc = inline_element input next_token.location token :: acc in - consume_elements ~at_start_of_line:false acc - | (`Minus | `Plus) as bullet -> - (if at_start_of_line then - let suggestion = - Printf.sprintf "move %s so it isn't the first thing on the line." - (Token.print bullet) - in - Parse_error.not_allowed ~what:(Token.describe bullet) - ~in_what:(Token.describe parent_markup) - ~suggestion next_token.location - |> add_warning input); - - let acc = inline_element input next_token.location bullet :: acc in - consume_elements ~at_start_of_line:false acc - | other_token -> - Parse_error.not_allowed - ~what:(Token.describe other_token) - ~in_what:(Token.describe parent_markup) - next_token.location - |> add_warning input; - - let last_location = - match acc with - | last_token :: _ -> last_token.location - | [] -> parent_markup_location - in - - (List.rev acc, last_location) - in - - let first_token = peek input in - match first_token.value with - | `Space _ -> - junk input; - consume_elements ~at_start_of_line:false [] - (* [~at_start_of_line] is [false] here because the preceding token was some - some markup like '{b', and we didn't move to the next line, so the next - token will not be the first non-whitespace token on its line. *) - | `Single_newline _ -> - junk input; - consume_elements ~at_start_of_line:true [] - | `Blank_line _ as blank -> - (* In case the markup is immediately followed by a blank line, the error - message printed by the catch-all case below can be confusing, as it will - suggest that the markup must be followed by a newline (which it is). It - just must not be followed by two newlines. To explain that clearly, - handle that case specifically. *) - Parse_error.not_allowed ~what:(Token.describe blank) - ~in_what:(Token.describe parent_markup) - first_token.location - |> add_warning input; - - junk input; - consume_elements ~at_start_of_line:true [] - | `Right_brace -> - junk input; - ([], first_token.location) - | _ -> - if requires_leading_whitespace then - Parse_error.should_be_followed_by_whitespace - ~what:(Token.print parent_markup) - parent_markup_location - |> add_warning input; - consume_elements ~at_start_of_line:false [] - -(* {2 Paragraphs} *) - -(* Consumes tokens that make up a paragraph. - - A paragraph is a sequence of inline elements that ends on a blank line, or - explicit block markup such as a verbatim block on a new line. - - Because of the significance of newlines, paragraphs are parsed line-by-line. - The function [paragraph] is called only when the current token is the first - non-whitespace token on its line, and begins an inline element. [paragraph] - then parses a line of inline elements. Afterwards, it looks ahead to the next - line. If that line also begins with an inline element, it parses that line, - and so on. *) -let paragraph : input -> Ast.nestable_block_element with_location = - fun input -> - (* Parses a single line of a paragraph, consisting of inline elements. The - only valid ways to end a paragraph line are with [`End], [`Single_newline], - [`Blank_line], and [`Right_brace]. Everything else either belongs in the - paragraph, or signifies an attempt to begin a block element inside a - paragraph line, which is an error. These errors are caught elsewhere; the - paragraph parser just stops. *) - let rec paragraph_line : - Ast.inline_element with_location list -> - Ast.inline_element with_location list = - fun acc -> - let next_token = peek input in - match next_token.value with - | ( `Space _ | `Minus | `Plus | `Bar - | #token_that_always_begins_an_inline_element ) as token -> - let element = inline_element input next_token.location token in - paragraph_line (element :: acc) - | _ -> acc - in - - (* After each line is parsed, decides whether to parse more lines. *) - let rec additional_lines : - Ast.inline_element with_location list -> - Ast.inline_element with_location list = - fun acc -> - match npeek 2 input with - | { value = `Single_newline ws; location } - :: { value = #token_that_always_begins_an_inline_element | `Bar; _ } - :: _ -> - junk input; - let acc = Loc.at location (`Space ws) :: acc in - let acc = paragraph_line acc in - additional_lines acc - | _ -> List.rev acc - in - - let elements = paragraph_line [] |> additional_lines in - `Paragraph elements |> Loc.at (Loc.span (List.map Loc.location elements)) - -(* {2 Block elements} *) - -(* {3 Helper types} *) - -(* The interpretation of tokens in the block parser depends on where on a line - each token appears. The six possible "locations" are: - - - [`At_start_of_line], when only whitespace has been read on the current - line. - - [`After_tag], when a valid tag token, such as [@deprecated], has been read, - and only whitespace has been read since. - - [`After_shorthand_bullet], when a valid shorthand list item bullet, such as - [-], has been read, and only whitespace has been read since. - - [`After_explicit_list_bullet], when a valid explicit bullet, such as [{li], - has been read, and only whitespace has been read since. - - [`After_table_cell], when a table cell opening markup ('{th' or '{td') has been read. - - [`After_text], when any other valid non-whitespace token has already been - read on the current line. - - Here are some examples of how this affects the interpretation of tokens: - - - A paragraph can start anywhere except [`After_text] (two paragraphs cannot - be on the same line, but paragraphs can be nested in just about anything). - - [`Minus] is interpreted as a list item bullet [`At_start_of_line], - [`After_tag], and [`After_explicit_list_bullet]. - - Tags are only allowed [`At_start_of_line]. - - To track the location accurately, the functions that make up the block parser - pass explicit [where_in_line] values around and return them. - - In a few cases, [where_in_line] can be inferred from what helper was called. - For example, the [paragraph] parser always stops on the same line as the last - significant token that is in the paragraph it consumed, so the location must - be [`After_text]. *) -type where_in_line = - [ `At_start_of_line - | `After_tag - | `After_shorthand_bullet - | `After_explicit_list_bullet - | `After_table_cell - | `After_text ] - -(* The block parsing loop, function [block_element_list], stops when it - encounters certain tokens. - - When it is called for the whole comment, or for in explicit list item - ([{li foo}]), it can only stop on end of input or a right brace. - - When it is called inside a shorthand list item ([- foo]), it stops on end of - input, right brace, a blank line (indicating end of shorthand list), plus or - minus (indicating the start of the next list item), or a section heading or - tag, which cannot be nested in list markup. - - The block parser [block_element_list] explicitly returns the token that - stopped it, with a type more precise than [Token.t stream_head]: if it was - called for the whole comment or an explicit list item, the stop token will - have type [stops_at_delimiters stream_head], and if it was called for a - shorthand list item, the stop token will have type - [implicit_stop stream_head]. This allows the calling parsers to write precise - cases for exactly the tokens that might be at the front of the stream after - the block parser returns. *) -type stops_at_delimiters = [ `End | `Right_brace ] -type code_stop = [ `End | `Right_code_delimiter ] - -type stopped_implicitly = - [ `End - | `Blank_line of string - | `Right_brace - | `Minus - | `Plus - | Token.section_heading - | Token.media_markup - | Token.tag ] - -(* Ensure that the above two types are really subsets of [Token.t]. *) -let _check_subset : stops_at_delimiters -> Token.t = fun t -> (t :> Token.t) -let _check_subset : stopped_implicitly -> Token.t = fun t -> (t :> Token.t) - -(* The different contexts in which the block parser [block_element_list] can be - called. The block parser's behavior depends somewhat on the context. For - example, while paragraphs are allowed anywhere, shorthand lists are not - allowed immediately inside other shorthand lists, while tags are not allowed - anywhere except at the comment top level. - - Besides telling the block parser how to behave, each context also carries two - types, which determine the return type of the block parser: - - - The type of blocks the parser returns. Note that [nestable_block_element] - is included in [block_element]. However, the extra block kinds in - [block_element] are only allowed at the comment top level. - - The type of token that the block parser stops at. See discussion above. *) -type ('block, 'stops_at_which_tokens) context = - | Top_level : (Ast.block_element, stops_at_delimiters) context - | In_implicitly_ended : - [ `Tag | `Shorthand_list ] - -> (Ast.nestable_block_element, stopped_implicitly) context - | In_explicit_list : (Ast.nestable_block_element, stops_at_delimiters) context - | In_table_cell : (Ast.nestable_block_element, stops_at_delimiters) context - | In_code_results : (Ast.nestable_block_element, code_stop) context - -(* This is a no-op. It is needed to prove to the type system that nestable block - elements are acceptable block elements in all contexts. *) -let accepted_in_all_contexts : - type block stops_at_which_tokens. - (block, stops_at_which_tokens) context -> - Ast.nestable_block_element -> - block = - fun context block -> - match context with - | Top_level -> (block :> Ast.block_element) - | In_implicitly_ended (`Tag | `Shorthand_list) -> block - | In_explicit_list -> block - | In_table_cell -> block - | In_code_results -> block - -(* Converts a tag to a series of words. This is used in error recovery, when a - tag cannot be generated. *) -let tag_to_words = function - | `Author s -> [ `Word "@author"; `Space " "; `Word s ] - | `Before s -> [ `Word "@before"; `Space " "; `Word s ] - | `Canonical s -> [ `Word "@canonical"; `Space " "; `Word s ] - | `Deprecated -> [ `Word "@deprecated" ] - | `Inline -> [ `Word "@inline" ] - | `Open -> [ `Word "@open" ] - | `Closed -> [ `Word "@closed" ] - | `Hidden -> [ `Word "@hidden" ] - | `Param s -> [ `Word "@param"; `Space " "; `Word s ] - | `Raise s -> [ `Word "@raise"; `Space " "; `Word s ] - | `Return -> [ `Word "@return" ] - | `See (`Document, s) -> [ `Word "@see"; `Space " "; `Word ("\"" ^ s ^ "\"") ] - | `See (`File, s) -> [ `Word "@see"; `Space " "; `Word ("'" ^ s ^ "'") ] - | `See (`Url, s) -> [ `Word "@see"; `Space " "; `Word ("<" ^ s ^ ">") ] - | `Since s -> [ `Word "@since"; `Space " "; `Word s ] - | `Version s -> [ `Word "@version"; `Space " "; `Word s ] - | `Children_order -> [ `Word "@children_order" ] - | `Toc_status -> [ `Word "@toc_status" ] - | `Order_category -> [ `Word "@order_category" ] - | `Short_title -> [ `Word "@short_title" ] - -(* {3 Block element lists} *) - -(* Consumes tokens making up a sequence of block elements. These are: - - - paragraphs, - - code blocks, - - verbatim text blocks, - - tables, - - lists, and - - section headings. *) -let rec block_element_list : - type block stops_at_which_tokens. - (block, stops_at_which_tokens) context -> - parent_markup:[< Token.t | `Comment ] -> - input -> - block with_location list - * stops_at_which_tokens with_location - * where_in_line = - fun context ~parent_markup input -> - let rec consume_block_elements : - where_in_line -> - block with_location list -> - block with_location list - * stops_at_which_tokens with_location - * where_in_line = - fun where_in_line acc -> - let describe token = - match token with - | #token_that_always_begins_an_inline_element -> "paragraph" - | _ -> Token.describe token - in - - let warn_if_after_text { Loc.location; value = token } = - if where_in_line = `After_text then - Parse_error.should_begin_on_its_own_line ~what:(describe token) location - |> add_warning input - in - - let warn_because_not_at_top_level { Loc.location; value = token } = - let suggestion = - Printf.sprintf "move %s outside of any other markup." - (Token.print token) - in - Parse_error.not_allowed ~what:(Token.describe token) - ~in_what:(Token.describe parent_markup) - ~suggestion location - |> add_warning input - in - - match peek input with - (* Terminators: the two tokens that terminate anything. *) - | { value = `End; _ } as next_token -> ( - match context with - | Top_level -> (List.rev acc, next_token, where_in_line) - | In_implicitly_ended (`Tag | `Shorthand_list) -> - (List.rev acc, next_token, where_in_line) - | In_explicit_list -> (List.rev acc, next_token, where_in_line) - | In_table_cell -> (List.rev acc, next_token, where_in_line) - | In_code_results -> (List.rev acc, next_token, where_in_line)) - | { value = `Right_brace; _ } as next_token -> ( - (* This little absurdity is needed to satisfy the type system. Without it, - OCaml is unable to prove that [stream_head] has the right type for all - possible values of [context]. *) - match context with - | Top_level -> (List.rev acc, next_token, where_in_line) - | In_implicitly_ended (`Tag | `Shorthand_list) -> - (List.rev acc, next_token, where_in_line) - | In_explicit_list -> (List.rev acc, next_token, where_in_line) - | In_table_cell -> (List.rev acc, next_token, where_in_line) - | In_code_results -> - junk input; - consume_block_elements where_in_line acc) - | { value = `Right_code_delimiter; _ } as next_token -> ( - match context with - | In_code_results -> (List.rev acc, next_token, where_in_line) - | _ -> - junk input; - consume_block_elements where_in_line acc) - (* Whitespace. This can terminate some kinds of block elements. It is also - necessary to track it to interpret [`Minus] and [`Plus] correctly, as - well as to ensure that all block elements begin on their own line. *) - | { value = `Space _; _ } -> - junk input; - consume_block_elements where_in_line acc - | { value = `Single_newline _; _ } -> - junk input; - consume_block_elements `At_start_of_line acc - | { value = `Blank_line _; _ } as next_token -> ( - match context with - (* Blank lines terminate shorthand lists ([- foo]) and tags. They also - terminate paragraphs, but the paragraph parser is aware of that - internally. *) - | In_implicitly_ended (`Tag | `Shorthand_list) -> - (List.rev acc, next_token, where_in_line) - (* Otherwise, blank lines are pretty much like single newlines. *) - | _ -> - junk input; - consume_block_elements `At_start_of_line acc) - (* Explicit list items ([{li ...}] and [{- ...}]) can never appear directly - in block content. They can only appear inside [{ul ...}] and [{ol ...}]. - So, catch those. *) - | { value = `Begin_list_item _ as token; location } -> - let suggestion = - Printf.sprintf "move %s into %s, or use %s." (Token.print token) - (Token.describe (`Begin_list `Unordered)) - (Token.describe `Minus) - in - Parse_error.not_allowed ~what:(Token.describe token) - ~in_what:(Token.describe parent_markup) - ~suggestion location - |> add_warning input; - - junk input; - consume_block_elements where_in_line acc - (* Table rows ([{tr ...}]) can never appear directly - in block content. They can only appear inside [{table ...}]. *) - | { value = `Begin_table_row as token; location } -> - let suggestion = - Printf.sprintf "move %s into %s." (Token.print token) - (Token.describe `Begin_table_heavy) - in - Parse_error.not_allowed ~what:(Token.describe token) - ~in_what:(Token.describe parent_markup) - ~suggestion location - |> add_warning input; - junk input; - consume_block_elements where_in_line acc - (* Table cells ([{th ...}] and [{td ...}]) can never appear directly - in block content. They can only appear inside [{tr ...}]. *) - | { value = `Begin_table_cell _ as token; location } -> - let suggestion = - Printf.sprintf "move %s into %s." (Token.print token) - (Token.describe `Begin_table_row) - in - Parse_error.not_allowed ~what:(Token.describe token) - ~in_what:(Token.describe parent_markup) - ~suggestion location - |> add_warning input; - junk input; - consume_block_elements where_in_line acc - (* Tags. These can appear at the top level only. *) - | { value = `Tag tag as token; location } as next_token -> ( - let recover_when_not_at_top_level context = - warn_because_not_at_top_level next_token; - junk input; - let words = List.map (Loc.at location) (tag_to_words tag) in - let paragraph = - `Paragraph words - |> accepted_in_all_contexts context - |> Loc.at location - in - consume_block_elements `At_start_of_line (paragraph :: acc) - in - - match context with - (* Tags cannot make sense in an explicit list ([{ul {li ...}}]). *) - | In_explicit_list -> recover_when_not_at_top_level context - (* If a tag starts at the beginning of a line, it terminates the preceding - tag and/or the current shorthand list. In this case, return to the - caller, and let the caller decide how to interpret the tag token. *) - | In_implicitly_ended (`Tag | `Shorthand_list) -> - if where_in_line = `At_start_of_line then - (List.rev acc, next_token, where_in_line) - else recover_when_not_at_top_level context - | In_table_cell -> recover_when_not_at_top_level context - | In_code_results -> recover_when_not_at_top_level context - (* If this is the top-level call to [block_element_list], parse the - tag. *) - | Top_level -> ( - if where_in_line <> `At_start_of_line then - Parse_error.should_begin_on_its_own_line - ~what:(Token.describe token) location - |> add_warning input; - - junk input; - - match tag with - | (`Author s | `Since s | `Version s | `Canonical s) as tag -> - let s = String.trim s in - if s = "" then - Parse_error.should_not_be_empty ~what:(Token.describe token) - location - |> add_warning input; - let tag = - match tag with - | `Author _ -> `Author s - | `Since _ -> `Since s - | `Version _ -> `Version s - | `Canonical _ -> - (* TODO The location is only approximate, as we need lexer - cooperation to get the real location. *) - let r_location = - Loc.nudge_start (String.length "@canonical ") location - in - `Canonical (Loc.at r_location s) - in - - let tag = Loc.at location (`Tag tag) in - consume_block_elements `After_text (tag :: acc) - | ( `Deprecated | `Return | `Children_order | `Short_title - | `Toc_status | `Order_category ) as tag -> - let content, _stream_head, where_in_line = - block_element_list (In_implicitly_ended `Tag) - ~parent_markup:token input - in - let tag = - match tag with - | `Deprecated -> `Deprecated content - | `Toc_status -> `Toc_status content - | `Return -> `Return content - | `Children_order -> `Children_order content - | `Short_title -> `Short_title content - | `Order_category -> `Order_category content - in - let location = - location :: List.map Loc.location content |> Loc.span - in - let tag = Loc.at location (`Tag tag) in - consume_block_elements where_in_line (tag :: acc) - | (`Param _ | `Raise _ | `Before _) as tag -> - let content, _stream_head, where_in_line = - block_element_list (In_implicitly_ended `Tag) - ~parent_markup:token input - in - let tag = - match tag with - | `Param s -> `Param (s, content) - | `Raise s -> `Raise (s, content) - | `Before s -> `Before (s, content) - in - let location = - location :: List.map Loc.location content |> Loc.span - in - let tag = Loc.at location (`Tag tag) in - consume_block_elements where_in_line (tag :: acc) - | `See (kind, target) -> - let content, _next_token, where_in_line = - block_element_list (In_implicitly_ended `Tag) - ~parent_markup:token input - in - let location = - location :: List.map Loc.location content |> Loc.span - in - let tag = `Tag (`See (kind, target, content)) in - let tag = Loc.at location tag in - consume_block_elements where_in_line (tag :: acc) - | (`Inline | `Open | `Closed | `Hidden) as tag -> - let tag = Loc.at location (`Tag tag) in - consume_block_elements `After_text (tag :: acc))) - | ( { value = #token_that_always_begins_an_inline_element; _ } - | { value = `Bar; _ } ) as next_token -> - warn_if_after_text next_token; - - let block = paragraph input in - let block = Loc.map (accepted_in_all_contexts context) block in - let acc = block :: acc in - consume_block_elements `After_text acc - | { value = `Verbatim s as token; location } as next_token -> - warn_if_after_text next_token; - if s = "" then - Parse_error.should_not_be_empty ~what:(Token.describe token) location - |> add_warning input; - - junk input; - let block = accepted_in_all_contexts context token in - let block = Loc.at location block in - let acc = block :: acc in - consume_block_elements `After_text acc - | { value = `Math_block s as token; location } as next_token -> - warn_if_after_text next_token; - if s = "" then - Parse_error.should_not_be_empty ~what:(Token.describe token) location - |> add_warning input; - - junk input; - let block = accepted_in_all_contexts context token in - let block = Loc.at location block in - let acc = block :: acc in - consume_block_elements `After_text acc - | { - value = - `Code_block (meta, delim, { value = s; location = v_loc }, has_outputs) - as token; - location; - } as next_token -> - warn_if_after_text next_token; - junk input; - let delimiter = if delim = "" then None else Some delim in - let output, location = - if not has_outputs then (None, location) - else - let content, next_token, _where_in_line = - block_element_list In_code_results ~parent_markup:token input - in - junk input; - let locations = - location :: List.map (fun content -> content.Loc.location) content - in - let location = Loc.span locations in - let location = { location with end_ = next_token.location.end_ } in - (Some content, location) - in - - if s = "" then - Parse_error.should_not_be_empty ~what:(Token.describe token) location - |> add_warning input; - - let meta = - match meta with - | None -> None - | Some (language, tags) -> Some { Ast.language; tags } - in - let block = - accepted_in_all_contexts context - (`Code_block - { - Ast.meta; - delimiter; - content = { value = s; location = v_loc }; - output; - }) - in - let block = Loc.at location block in - let acc = block :: acc in - consume_block_elements `After_text acc - | { value = `Modules s as token; location } as next_token -> - warn_if_after_text next_token; - - junk input; - - (* TODO Use some library for a splitting function, or move this out into a - Util module. *) - let split_string delimiters s = - let rec scan_delimiters acc index = - if index >= String.length s then List.rev acc - else if String.contains delimiters s.[index] then - scan_delimiters acc (index + 1) - else scan_word acc index (index + 1) - and scan_word acc start_index index = - if index >= String.length s then - let word = String.sub s start_index (index - start_index) in - List.rev (word :: acc) - else if String.contains delimiters s.[index] then - let word = String.sub s start_index (index - start_index) in - scan_delimiters (word :: acc) (index + 1) - else scan_word acc start_index (index + 1) - in - - scan_delimiters [] 0 - in - - (* TODO Correct locations await a full implementation of {!modules} - parsing. *) - let modules = - split_string " \t\r\n" s |> List.map (fun r -> Loc.at location r) - in - - if modules = [] then - Parse_error.should_not_be_empty ~what:(Token.describe token) location - |> add_warning input; - - let block = accepted_in_all_contexts context (`Modules modules) in - let block = Loc.at location block in - let acc = block :: acc in - consume_block_elements `After_text acc - | { value = `Begin_list kind as token; location } as next_token -> - warn_if_after_text next_token; - - junk input; - - let items, brace_location = - explicit_list_items ~parent_markup:token input - in - if items = [] then - Parse_error.should_not_be_empty ~what:(Token.describe token) location - |> add_warning input; - - let location = Loc.span [ location; brace_location ] in - let block = `List (kind, `Heavy, items) in - let block = accepted_in_all_contexts context block in - let block = Loc.at location block in - let acc = block :: acc in - consume_block_elements `After_text acc - | { value = (`Begin_table_light | `Begin_table_heavy) as token; location } - as next_token -> - warn_if_after_text next_token; - junk input; - let block, brace_location = - let parent_markup = token in - let parent_markup_location = location in - match token with - | `Begin_table_light -> - light_table input ~parent_markup ~parent_markup_location - | `Begin_table_heavy -> - heavy_table input ~parent_markup ~parent_markup_location - in - let location = Loc.span [ location; brace_location ] in - let block = accepted_in_all_contexts context (`Table block) in - let block = Loc.at location block in - let acc = block :: acc in - consume_block_elements `After_text acc - | { value = (`Minus | `Plus) as token; location } as next_token -> ( - (match where_in_line with - | `After_text | `After_shorthand_bullet -> - Parse_error.should_begin_on_its_own_line - ~what:(Token.describe token) location - |> add_warning input - | _ -> ()); - - match context with - | In_implicitly_ended `Shorthand_list -> - (List.rev acc, next_token, where_in_line) - | _ -> - let items, where_in_line = - shorthand_list_items next_token where_in_line input - in - let kind = - match token with `Minus -> `Unordered | `Plus -> `Ordered - in - let location = - location :: List.map Loc.location (List.flatten items) |> Loc.span - in - let block = `List (kind, `Light, items) in - let block = accepted_in_all_contexts context block in - let block = Loc.at location block in - let acc = block :: acc in - consume_block_elements where_in_line acc) - | { value = `Begin_section_heading (level, label) as token; location } as - next_token -> ( - let recover_when_not_at_top_level context = - warn_because_not_at_top_level next_token; - junk input; - let content, brace_location = - delimited_inline_element_list ~parent_markup:token - ~parent_markup_location:location ~requires_leading_whitespace:true - input - in - let location = Loc.span [ location; brace_location ] in - let paragraph = - `Paragraph content - |> accepted_in_all_contexts context - |> Loc.at location - in - consume_block_elements `At_start_of_line (paragraph :: acc) - in - - match context with - | In_implicitly_ended (`Tag | `Shorthand_list) -> - if where_in_line = `At_start_of_line then - (List.rev acc, next_token, where_in_line) - else recover_when_not_at_top_level context - | In_explicit_list -> recover_when_not_at_top_level context - | In_table_cell -> recover_when_not_at_top_level context - | In_code_results -> recover_when_not_at_top_level context - | Top_level -> - if where_in_line <> `At_start_of_line then - Parse_error.should_begin_on_its_own_line - ~what:(Token.describe token) location - |> add_warning input; - - let label = - match label with - | Some "" -> - Parse_error.should_not_be_empty ~what:"heading label" location - |> add_warning input; - None - | _ -> label - in - - junk input; - - let content, brace_location = - delimited_inline_element_list ~parent_markup:token - ~parent_markup_location:location - ~requires_leading_whitespace:true input - in - if content = [] then - Parse_error.should_not_be_empty ~what:(Token.describe token) - location - |> add_warning input; - - let location = Loc.span [ location; brace_location ] in - let heading = `Heading (level, label, content) in - let heading = Loc.at location heading in - let acc = heading :: acc in - consume_block_elements `After_text acc) - | { value = `Begin_paragraph_style _ as token; location } -> - junk input; - let content, brace_location = - delimited_inline_element_list ~parent_markup:token - ~parent_markup_location:location ~requires_leading_whitespace:true - input - in - let location = Loc.span [ location; brace_location ] in - - Parse_error.markup_should_not_be_used ~what:(Token.describe token) - location - |> add_warning input; - - let paragraph = - `Paragraph content - |> accepted_in_all_contexts context - |> Loc.at location - in - consume_block_elements `At_start_of_line (paragraph :: acc) - | { - location; - value = `Media_with_replacement_text (href, media, content) as token; - } -> - junk input; - - let r_location = - Loc.nudge_start - (String.length @@ Token.s_of_media `Replaced media) - location - |> Loc.nudge_end (String.length content + 1) - (* +1 for closing character *) - in - let c_location = - Loc.nudge_start - (String.length (Token.s_of_media `Replaced media) - + String.length (match href with `Reference s | `Link s -> s)) - location - |> Loc.nudge_end 1 - in - let content = String.trim content in - let href = href |> Loc.at r_location in - - if content = "" then - Parse_error.should_not_be_empty ~what:(Token.describe token) - c_location - |> add_warning input; - - let block = `Media (`Simple, href, content, media) in - let block = accepted_in_all_contexts context block in - let block = Loc.at location block in - let acc = block :: acc in - consume_block_elements `After_text acc - | { location; value = `Simple_media (href, media) } -> - junk input; - - let r_location = - Loc.nudge_start - (String.length @@ Token.s_of_media `Simple media) - location - |> Loc.nudge_end 1 - in - let href = href |> Loc.at r_location in - let block = `Media (`Simple, href, "", media) in - let block = accepted_in_all_contexts context block in - let block = Loc.at location block in - let acc = block :: acc in - consume_block_elements `After_text acc - in - - let where_in_line = - match context with - | Top_level -> `At_start_of_line - | In_implicitly_ended `Shorthand_list -> `After_shorthand_bullet - | In_explicit_list -> `After_explicit_list_bullet - | In_table_cell -> `After_table_cell - | In_code_results -> `After_tag - | In_implicitly_ended `Tag -> `After_tag - in - - consume_block_elements where_in_line [] - -(* {3 Lists} *) - -(* Consumes a sequence of implicit list items. Each one consists of a [`Minus] - or [`Plus] token, followed by block elements until: - - - a blank line, or - - a list bullet of the opposite kind (e.g. [`Plus] for a [`Minus] list). - - This function is called when the next token is known to be [`Minus] or - [`Plus]. It consumes that token, and calls the block element parser (see - above). That parser returns to [implicit_list_items] only on [`Blank_line], - [`End], [`Minus] or [`Plus] at the start of a line, or [`Right_brace]. *) -and shorthand_list_items : - [ `Minus | `Plus ] with_location -> - where_in_line -> - input -> - Ast.nestable_block_element with_location list list * where_in_line = - fun first_token where_in_line input -> - let bullet_token = first_token.value in - - let rec consume_list_items : - [> ] with_location -> - where_in_line -> - Ast.nestable_block_element with_location list list -> - Ast.nestable_block_element with_location list list * where_in_line = - fun next_token where_in_line acc -> - match next_token.value with - | `End | `Right_brace | `Blank_line _ | `Tag _ | `Begin_section_heading _ - | `Simple_media _ | `Media_with_replacement_text _ -> - (List.rev acc, where_in_line) - | (`Minus | `Plus) as bullet -> - if bullet = bullet_token then ( - junk input; - - let content, stream_head, where_in_line = - block_element_list (In_implicitly_ended `Shorthand_list) - ~parent_markup:bullet input - in - if content = [] then - Parse_error.should_not_be_empty ~what:(Token.describe bullet) - next_token.location - |> add_warning input; - - let acc = content :: acc in - consume_list_items stream_head where_in_line acc) - else (List.rev acc, where_in_line) - in - - consume_list_items - (first_token :> stopped_implicitly with_location) - where_in_line [] - -(* Consumes a sequence of explicit list items (starting with '{li ...}' and - '{-...}', which are represented by [`Begin_list_item _] tokens). - - This function is called immediately after '{ul' or '{ol' ([`Begin_list _]) is - read. The only "valid" way to exit is by reading a [`Right_brace] token, - which is consumed. - - Whitespace inside the list, but outside list items, is not significant – this - parsing function consumes all of it. Otherwise, only list item start tokens - are accepted. Everything else is an error. *) -and explicit_list_items : - parent_markup:[< Token.t ] -> - input -> - Ast.nestable_block_element with_location list list * Loc.span = - fun ~parent_markup input -> - let rec consume_list_items : - Ast.nestable_block_element with_location list list -> - Ast.nestable_block_element with_location list list * Loc.span = - fun acc -> - let next_token = peek input in - match next_token.value with - | `End -> - Parse_error.end_not_allowed next_token.location - ~in_what:(Token.describe parent_markup) - |> add_warning input; - (List.rev acc, next_token.location) - | `Right_brace -> - junk input; - (List.rev acc, next_token.location) - | `Space _ | `Single_newline _ | `Blank_line _ -> - junk input; - consume_list_items acc - | `Begin_list_item kind as token -> - junk input; - - (* '{li', represented by [`Begin_list_item `Li], must be followed by - whitespace. *) - (if kind = `Li then - match (peek input).value with - | `Space _ | `Single_newline _ | `Blank_line _ | `Right_brace -> - () - (* The presence of [`Right_brace] above requires some explanation: - - - It is better to be silent about missing whitespace if the next - token is [`Right_brace], because the error about an empty list - item will be generated below, and that error is more important to - the user. - - The [`Right_brace] token also happens to include all whitespace - before it, as a convenience for the rest of the parser. As a - result, not ignoring it could be wrong: there could in fact be - whitespace in the concrete syntax immediately after '{li', just - it is not represented as [`Space], [`Single_newline], or - [`Blank_line]. *) - | _ -> - Parse_error.should_be_followed_by_whitespace next_token.location - ~what:(Token.print token) - |> add_warning input); - - let content, token_after_list_item, _where_in_line = - block_element_list In_explicit_list ~parent_markup:token input - in - - if content = [] then - Parse_error.should_not_be_empty next_token.location - ~what:(Token.describe token) - |> add_warning input; - - (match token_after_list_item.value with - | `Right_brace -> junk input - | `End -> - Parse_error.end_not_allowed token_after_list_item.location - ~in_what:(Token.describe token) - |> add_warning input); - - let acc = content :: acc in - consume_list_items acc - | token -> - let suggestion = - match token with - | `Begin_section_heading _ | `Tag _ -> - Printf.sprintf "move %s outside the list." (Token.describe token) - | _ -> - Printf.sprintf "move %s into a list item, %s or %s." - (Token.describe token) - (Token.print (`Begin_list_item `Li)) - (Token.print (`Begin_list_item `Dash)) - in - Parse_error.not_allowed next_token.location ~what:(Token.describe token) - ~in_what:(Token.describe parent_markup) - ~suggestion - |> add_warning input; - - junk input; - consume_list_items acc - in - - consume_list_items [] - -(* Consumes a sequence of table rows that might start with [`Bar]. - - This function is called immediately after '{t' ([`Begin_table `Light]) is - read. The only "valid" way to exit is by reading a [`Right_brace] token, - which is consumed. *) -and light_table ~parent_markup ~parent_markup_location input = - let rec consume_rows acc ~last_loc = - Reader.until_rbrace_or_eof input acc >>> fun next_token -> - match next_token.Loc.value with - | `Bar | #token_that_always_begins_an_inline_element -> ( - let next, row, last_loc = - light_table_row ~parent_markup ~last_loc input - in - match next with - | `Continue -> consume_rows (row :: acc) ~last_loc - | `Stop -> (row :: acc, last_loc)) - | other_token -> - Parse_error.not_allowed next_token.location - ~what:(Token.describe other_token) - ~in_what:(Token.describe parent_markup) - |> add_warning input; - junk input; - consume_rows acc ~last_loc - in - let rows, brace_location = consume_rows [] ~last_loc:parent_markup_location in - let grid = List.rev rows in - (Table.Light_syntax.from_raw_data grid, brace_location) - -(* Consumes a table row that might start with [`Bar]. *) -and light_table_row ~parent_markup ~last_loc input = - let rec consume_row acc_row acc_cell acc_space ~new_line ~last_loc = - let push_cells row cell = - match cell with [] -> row | _ -> List.rev cell :: row - in - let return row cell = List.rev (push_cells row cell) in - let next_token = peek input in - match next_token.value with - | `End -> - Parse_error.end_not_allowed next_token.location ~in_what:"table" - |> add_warning input; - junk input; - (`Stop, return acc_row acc_cell, next_token.location) - | `Right_brace -> - junk input; - (`Stop, return acc_row acc_cell, next_token.location) - | `Space _ as token -> - junk input; - let i = Loc.at next_token.location token in - consume_row acc_row acc_cell (i :: acc_space) ~new_line ~last_loc - | `Single_newline _ | `Blank_line _ -> - junk input; - (`Continue, return acc_row acc_cell, last_loc) - | `Bar -> - junk input; - let acc_row = if new_line then [] else List.rev acc_cell :: acc_row in - consume_row acc_row [] [] ~new_line:false ~last_loc - | #token_that_always_begins_an_inline_element as token -> - let i = inline_element input next_token.location token in - if Loc.spans_multiple_lines i then - Parse_error.not_allowed - ~what:(Token.describe (`Single_newline "")) - ~in_what:(Token.describe `Begin_table_light) - i.location - |> add_warning input; - let acc_cell = - if acc_cell = [] then [ i ] else (i :: acc_space) @ acc_cell - in - consume_row acc_row acc_cell [] ~new_line:false - ~last_loc:next_token.location - | other_token -> - Parse_error.not_allowed next_token.location - ~what:(Token.describe other_token) - ~in_what:(Token.describe parent_markup) - |> add_warning input; - junk input; - consume_row acc_row acc_cell acc_space ~new_line ~last_loc - in - consume_row [] [] [] ~new_line:true ~last_loc - -(* Consumes a sequence of table rows (starting with '{tr ...}', which are - represented by [`Begin_table_row] tokens). - - This function is called immediately after '{table' ([`Begin_table `Heavy]) is - read. The only "valid" way to exit is by reading a [`Right_brace] token, - which is consumed. *) -and heavy_table ~parent_markup ~parent_markup_location input = - let rec consume_rows acc ~last_loc = - Reader.until_rbrace_or_eof input acc >>> fun next_token -> - match next_token.Loc.value with - | `Begin_table_row as token -> - junk input; - let items, last_loc = heavy_table_row ~parent_markup:token input in - consume_rows (List.rev items :: acc) ~last_loc - | token -> - Parse_error.not_allowed next_token.location ~what:(Token.describe token) - ~in_what:(Token.describe parent_markup) - ~suggestion:"Move outside of {table ...}, or inside {tr ...}" - |> add_warning input; - junk input; - consume_rows acc ~last_loc - in - let rows, brace_location = consume_rows [] ~last_loc:parent_markup_location in - let grid = List.rev rows in - (Table.Heavy_syntax.from_grid grid, brace_location) - -(* Consumes a sequence of table cells (starting with '{th ...}' or '{td ... }', - which are represented by [`Begin_table_cell] tokens). - - This function is called immediately after '{tr' ([`Begin_table_row]) is - read. The only "valid" way to exit is by reading a [`Right_brace] token, - which is consumed. *) -and heavy_table_row ~parent_markup input = - let rec consume_cell_items acc = - Reader.until_rbrace_or_eof input acc >>> fun next_token -> - match next_token.Loc.value with - | `Begin_table_cell kind as token -> - junk input; - let content, token_after_list_item, _where_in_line = - block_element_list In_table_cell ~parent_markup:token input - in - (match token_after_list_item.value with - | `Right_brace -> junk input - | `End -> - Parse_error.not_allowed token_after_list_item.location - ~what:(Token.describe `End) ~in_what:(Token.describe token) - |> add_warning input); - consume_cell_items ((content, kind) :: acc) - | token -> - Parse_error.not_allowed next_token.location ~what:(Token.describe token) - ~in_what:(Token.describe parent_markup) - ~suggestion: - "Move outside of {table ...}, or inside {td ...} or {th ...}" - |> add_warning input; - junk input; - consume_cell_items acc - in - consume_cell_items [] - -(* {2 Entry point} *) - -let parse warnings tokens = - let input : input = { tokens; warnings } in - - let rec parse_block_elements () = - let elements, last_token, _where_in_line = - block_element_list Top_level ~parent_markup:`Comment input - in - - match last_token.value with - | `End -> elements - | `Right_brace -> - Parse_error.unpaired_right_brace last_token.location - |> add_warning input; - - let block = - Loc.same last_token (`Paragraph [ Loc.same last_token (`Word "}") ]) - in - - junk input; - elements @ (block :: parse_block_elements ()) - in - let ast = parse_block_elements () in - (ast, List.rev !(input.warnings)) diff --git a/src/parser/token.ml b/src/parser/token.ml deleted file mode 100644 index ac23cd76ba..0000000000 --- a/src/parser/token.ml +++ /dev/null @@ -1,256 +0,0 @@ -(* This module contains the token type, emitted by the lexer, and consumed by - the comment syntax parser. It also contains two functions that format tokens - for error messages. *) - -type section_heading = [ `Begin_section_heading of int * string option ] -type style = [ `Bold | `Italic | `Emphasis | `Superscript | `Subscript ] -type paragraph_style = [ `Left | `Center | `Right ] - -type tag = - [ `Tag of - [ `Author of string - | `Deprecated - | `Param of string - | `Raise of string - | `Return - | `See of [ `Url | `File | `Document ] * string - | `Since of string - | `Before of string - | `Version of string - | `Canonical of string - | `Children_order - | `Toc_status - | `Order_category - | `Short_title - | `Inline - | `Open - | `Closed - | `Hidden ] ] - -type media = [ `Audio | `Video | `Image ] -type media_href = [ `Reference of string | `Link of string ] - -type media_markup = - [ `Simple_media of media_href * media - | `Media_with_replacement_text of media_href * media * string ] - -let s_of_media kind media = - match (kind, media) with - | `Simple, `Audio -> "{audio!" - | `Simple, `Video -> "{video!" - | `Simple, `Image -> "{image!" - | `Replaced, `Audio -> "{{audio!" - | `Replaced, `Video -> "{{video!" - | `Replaced, `Image -> "{{image!" - -type t = - [ (* End of input. *) - `End - | (* Runs of whitespace. [Blank_line] is any run of whitespace that contains two - or more newline characters. [Single_newline] is any run of whitespace that - contains exactly one newline character. [Space] is any run of whitespace - that contains no newline characters. - - It is an important invariant in the parser that no adjacent whitespace - tokens are emitted by the lexer. Otherwise, there would be the need for - unbounded lookahead, a (co-?)ambiguity between - [Single_newline Single_newline] and [Blank_line], and other problems. *) - `Space of - string - | `Single_newline of string - | `Blank_line of string - | (* A right curly brace ([}]), i.e. end of markup. *) - `Right_brace - | `Right_code_delimiter - | (* Words are anything that is not whitespace or markup. Markup symbols can be - be part of words if escaped. - - Words can contain plus and minus symbols, but those are emitted as [Plus] - and [Minus] tokens. The parser combines plus and minus into words, except - when they appear first on a line, in which case the tokens are list item - bullets. *) - `Word of - string - | `Code_span of string - | `Raw_markup of string option * string - | `Math_span of string - | `Math_block of string - | `Begin_style of style - | `Begin_paragraph_style of paragraph_style - | (* Other inline element markup. *) - `Simple_reference of string - | `Begin_reference_with_replacement_text of string - | `Simple_link of string - | `Begin_link_with_replacement_text of string - | media_markup - | (* Leaf block element markup. *) - `Code_block of - (string Loc.with_location * string Loc.with_location option) option - * string - * string Loc.with_location - * bool - | `Verbatim of string - | `Modules of string - | (* List markup. *) - `Begin_list of [ `Unordered | `Ordered ] - | `Begin_list_item of [ `Li | `Dash ] - | (* Table markup. *) - `Begin_table_light - | `Begin_table_heavy - | `Begin_table_row - | `Begin_table_cell of [ `Header | `Data ] - | `Minus - | `Plus - | `Bar - | section_heading - | tag ] - -let print : [< t ] -> string = function - | `Begin_paragraph_style `Left -> "'{L'" - | `Begin_paragraph_style `Center -> "'{C'" - | `Begin_paragraph_style `Right -> "'{R'" - | `Begin_style `Bold -> "'{b'" - | `Begin_style `Italic -> "'{i'" - | `Begin_style `Emphasis -> "'{e'" - | `Begin_style `Superscript -> "'{^'" - | `Begin_style `Subscript -> "'{_'" - | `Begin_reference_with_replacement_text _ -> "'{{!'" - | `Begin_link_with_replacement_text _ -> "'{{:'" - | `Begin_list_item `Li -> "'{li ...}'" - | `Begin_list_item `Dash -> "'{- ...}'" - | `Begin_table_light -> "{t" - | `Begin_table_heavy -> "{table" - | `Begin_table_row -> "'{tr'" - | `Begin_table_cell `Header -> "'{th'" - | `Begin_table_cell `Data -> "'{td'" - | `Minus -> "'-'" - | `Plus -> "'+'" - | `Bar -> "'|'" - | `Begin_section_heading (level, label) -> - let label = match label with None -> "" | Some label -> ":" ^ label in - Printf.sprintf "'{%i%s'" level label - | `Tag (`Author _) -> "'@author'" - | `Tag `Deprecated -> "'@deprecated'" - | `Tag (`Param _) -> "'@param'" - | `Tag (`Raise _) -> "'@raise'" - | `Tag `Return -> "'@return'" - | `Tag `Children_order -> "'@children_order'" - | `Tag `Order_category -> "'@order_category'" - | `Tag `Toc_status -> "'@toc_status'" - | `Tag `Short_title -> "'@short_title'" - | `Tag (`See _) -> "'@see'" - | `Tag (`Since _) -> "'@since'" - | `Tag (`Before _) -> "'@before'" - | `Tag (`Version _) -> "'@version'" - | `Tag (`Canonical _) -> "'@canonical'" - | `Tag `Inline -> "'@inline'" - | `Tag `Open -> "'@open'" - | `Tag `Closed -> "'@closed'" - | `Tag `Hidden -> "'@hidden" - | `Raw_markup (None, _) -> "'{%...%}'" - | `Raw_markup (Some target, _) -> "'{%" ^ target ^ ":...%}'" - | `Simple_media (`Reference _, `Image) -> "{image!...}" - | `Simple_media (`Reference _, `Audio) -> "{audio!...}" - | `Simple_media (`Reference _, `Video) -> "{video!...}" - | `Simple_media (`Link _, `Image) -> "{image:...}" - | `Simple_media (`Link _, `Audio) -> "{audio:...}" - | `Simple_media (`Link _, `Video) -> "{video:...}" - | `Media_with_replacement_text (`Reference _, `Image, _) -> - "{{image!...} ...}" - | `Media_with_replacement_text (`Reference _, `Audio, _) -> - "{{audio!...} ...}" - | `Media_with_replacement_text (`Reference _, `Video, _) -> - "{{video!...} ...}" - | `Media_with_replacement_text (`Link _, `Image, _) -> "{{image:...} ...}" - | `Media_with_replacement_text (`Link _, `Audio, _) -> "{{audio:...} ...}" - | `Media_with_replacement_text (`Link _, `Video, _) -> "{{video:...} ...}" - -(* [`Minus] and [`Plus] are interpreted as if they start list items. Therefore, - for error messages based on [Token.describe] to be accurate, formatted - [`Minus] and [`Plus] should always be plausibly list item bullets. *) -let describe : [< t | `Comment ] -> string = function - | `Word w -> Printf.sprintf "'%s'" w - | `Code_span _ -> "'[...]' (code)" - | `Raw_markup _ -> "'{%...%}' (raw markup)" - | `Begin_paragraph_style `Left -> "'{L ...}' (left alignment)" - | `Begin_paragraph_style `Center -> "'{C ...}' (center alignment)" - | `Begin_paragraph_style `Right -> "'{R ...}' (right alignment)" - | `Begin_style `Bold -> "'{b ...}' (boldface text)" - | `Begin_style `Italic -> "'{i ...}' (italic text)" - | `Begin_style `Emphasis -> "'{e ...}' (emphasized text)" - | `Begin_style `Superscript -> "'{^...}' (superscript)" - | `Begin_style `Subscript -> "'{_...}' (subscript)" - | `Math_span _ -> "'{m ...}' (math span)" - | `Math_block _ -> "'{math ...}' (math block)" - | `Simple_reference _ -> "'{!...}' (cross-reference)" - | `Begin_reference_with_replacement_text _ -> - "'{{!...} ...}' (cross-reference)" - | `Simple_media (`Reference _, `Image) -> "'{image!...}' (image-reference)" - | `Simple_media (`Reference _, `Audio) -> "'{audio!...}' (audio-reference)" - | `Simple_media (`Reference _, `Video) -> "'{video!...}' (video-reference)" - | `Simple_media (`Link _, `Image) -> "'{image:...}' (image-link)" - | `Simple_media (`Link _, `Audio) -> "'{audio:...}' (audio-link)" - | `Simple_media (`Link _, `Video) -> "'{video:...}' (video-link)" - | `Media_with_replacement_text (`Reference _, `Image, _) -> - "'{{image!...} ...}' (image-reference)" - | `Media_with_replacement_text (`Reference _, `Audio, _) -> - "'{{audio!...} ...}' (audio-reference)" - | `Media_with_replacement_text (`Reference _, `Video, _) -> - "'{{video!...} ...}' (video-reference)" - | `Media_with_replacement_text (`Link _, `Image, _) -> - "'{{image:...} ...}' (image-link)" - | `Media_with_replacement_text (`Link _, `Audio, _) -> - "'{{audio:...} ...}' (audio-link)" - | `Media_with_replacement_text (`Link _, `Video, _) -> - "'{{video:...} ...}' (video-link)" - | `Simple_link _ -> "'{:...} (external link)'" - | `Begin_link_with_replacement_text _ -> "'{{:...} ...}' (external link)" - | `End -> "end of text" - | `Space _ -> "whitespace" - | `Single_newline _ -> "line break" - | `Blank_line _ -> "blank line" - | `Right_brace -> "'}'" - | `Right_code_delimiter -> "']}'" - | `Code_block _ -> "'{[...]}' (code block)" - | `Verbatim _ -> "'{v ... v}' (verbatim text)" - | `Modules _ -> "'{!modules ...}'" - | `Begin_list `Unordered -> "'{ul ...}' (bulleted list)" - | `Begin_list `Ordered -> "'{ol ...}' (numbered list)" - | `Begin_list_item `Li -> "'{li ...}' (list item)" - | `Begin_list_item `Dash -> "'{- ...}' (list item)" - | `Begin_table_light -> "'{t ...}' (table)" - | `Begin_table_heavy -> "'{table ...}' (table)" - | `Begin_table_row -> "'{tr ...}' (table row)" - | `Begin_table_cell `Header -> "'{th ... }' (table header cell)" - | `Begin_table_cell `Data -> "'{td ... }' (table data cell)" - | `Minus -> "'-' (bulleted list item)" - | `Plus -> "'+' (numbered list item)" - | `Bar -> "'|'" - | `Begin_section_heading (level, _) -> - Printf.sprintf "'{%i ...}' (section heading)" level - | `Tag (`Author _) -> "'@author'" - | `Tag `Deprecated -> "'@deprecated'" - | `Tag (`Param _) -> "'@param'" - | `Tag (`Raise _) -> "'@raise'" - | `Tag `Return -> "'@return'" - | `Tag (`See _) -> "'@see'" - | `Tag (`Since _) -> "'@since'" - | `Tag (`Before _) -> "'@before'" - | `Tag (`Version _) -> "'@version'" - | `Tag (`Canonical _) -> "'@canonical'" - | `Tag `Inline -> "'@inline'" - | `Tag `Open -> "'@open'" - | `Tag `Closed -> "'@closed'" - | `Tag `Hidden -> "'@hidden" - | `Tag `Children_order -> "'@children_order" - | `Tag `Toc_status -> "'@toc_status" - | `Tag `Order_category -> "'@order_category" - | `Tag `Short_title -> "'@short_title" - | `Comment -> "top-level text" - -let describe_element = function - | `Reference (`Simple, _, _) -> describe (`Simple_reference "") - | `Reference (`With_text, _, _) -> - describe (`Begin_reference_with_replacement_text "") - | `Link _ -> describe (`Begin_link_with_replacement_text "") - | `Heading (level, _, _) -> describe (`Begin_section_heading (level, None)) diff --git a/src/parser/tokens.ml b/src/parser/tokens.ml index 016966ab22..3ecd5fc78c 100644 --- a/src/parser/tokens.ml +++ b/src/parser/tokens.ml @@ -17,8 +17,8 @@ let ast_list_kind : list_kind -> Ast.list_kind = function | Ordered -> `Ordered | Unordered -> `Unordered -type 'a with_start_pos = { start : Loc.point; inner : 'a } -let with_start_pos : Lexing.position -> 'a -> 'a with_start_pos = +type 'a with_start_point = { start : Loc.point; inner : 'a } +let with_start_point : Lexing.position -> 'a -> 'a with_start_point = fun start inner -> { start = Loc.make_point start; inner } type code_block = { @@ -31,28 +31,29 @@ and meta = { tags : string Loc.with_location option; } +(* Token names follow Menhir conventions where ALL_CAPS denote a unit variant, + in this case generally representing a delimiter *) type token = | Space of string | Single_newline of string | Blank_line of string - | Simple_ref of string with_start_pos - | Ref_with_replacement of string with_start_pos - | Simple_link of string with_start_pos - | Link_with_replacement of string with_start_pos + | Simple_ref of string with_start_point + | Ref_with_replacement of string with_start_point + | Simple_link of string with_start_point + | Link_with_replacement of string with_start_point | MODULES - | Media of (media * media_target) with_start_pos - | Media_with_replacement of (media * media_target * string) with_start_pos - (* Start location *) - | Math_span of string with_start_pos - | Math_block of string with_start_pos - | Code_span of string with_start_pos - | Code_block of code_block with_start_pos - | Code_block_with_output of code_block with_start_pos + | Media of (media * media_target) with_start_point + | Media_with_replacement of (media * media_target * string) with_start_point + | Math_span of string with_start_point + | Math_block of string with_start_point + | Code_span of string with_start_point + | Code_block of code_block with_start_point + | Code_block_with_output of code_block with_start_point | Word of string - | Verbatim of string with_start_pos + | Verbatim of string with_start_point | RIGHT_CODE_DELIMITER | RIGHT_BRACE - | Paragraph_style of alignment with_start_pos + | Paragraph_style of alignment with_start_point | Style of style | List of list_kind | LI @@ -64,17 +65,17 @@ type token = | MINUS | PLUS | BAR - | Section_heading of (int * string option) with_start_pos - | Author of string with_start_pos + | Section_heading of (int * string option) with_start_point + | Author of string with_start_point | DEPRECATED - | Param of string with_start_pos - | Raise of string with_start_pos + | Param of string with_start_point + | Raise of string with_start_point | RETURN - | See of (internal_reference * string) with_start_pos - | Since of string with_start_pos - | Before of string with_start_pos - | Version of string with_start_pos - | Canonical of string with_start_pos + | See of (internal_reference * string) with_start_point + | Since of string with_start_point + | Before of string with_start_point + | Version of string with_start_point + | Canonical of string with_start_point | INLINE | OPEN | CLOSED @@ -83,7 +84,7 @@ type token = | TOC_STATUS | ORDER_CATEGORY | SHORT_TITLE - | Raw_markup of (string option * string) with_start_pos + | Raw_markup of (string option * string) with_start_point | END let media_description ref_kind media_kind = @@ -332,4 +333,7 @@ let describe_tag : Ast.tag -> string = function describe @@ Canonical { inner; start = Loc.dummy_pos } | `Hidden -> describe HIDDEN | `Inline -> describe INLINE - | _ -> assert false (* New tags currently unreachable *) + | `Children_order _ -> describe CHILDREN_ORDER + | `Toc_status _ -> describe TOC_STATUS + | `Order_category _ -> describe ORDER_CATEGORY + | `Short_title _ -> describe SHORT_TITLE From 94241786a08cb81e689f761b2d00ddd46cb622d2 Mon Sep 17 00:00:00 2001 From: faycarsons Date: Mon, 13 Jan 2025 12:32:29 -0500 Subject: [PATCH 130/150] refactor tags --- src/parser/lexer.mll | 52 +++++----- src/parser/parser.mly | 216 ++++-------------------------------------- src/parser/tokens.ml | 176 +++++++++++++++++++++------------- 3 files changed, 152 insertions(+), 292 deletions(-) diff --git a/src/parser/lexer.mll b/src/parser/lexer.mll index edbd4f3e77..e29506c0fe 100644 --- a/src/parser/lexer.mll +++ b/src/parser/lexer.mll @@ -600,77 +600,77 @@ and token input = parse | "@author" horizontal_space+ (([^ '\r' '\n']*)? as author) { let start = input.offset_to_location @@ Lexing.lexeme_start lexbuf in - Author { inner = (trim_horizontal_start author); start } } + Tag (Author { inner = (trim_horizontal_start author); start }) } | "@deprecated" - { DEPRECATED } + { Tag_with_content DEPRECATED } | "@param" horizontal_space+ ((_ # space_char)+ as inner) { let start = input.offset_to_location @@ Lexing.lexeme_start lexbuf in - Param { inner; start } } + Tag_with_content (Param { inner; start }) } | ("@raise" | "@raises") horizontal_space+ ((_ # space_char)+ as inner) { let start = input.offset_to_location @@ Lexing.lexeme_start lexbuf in - Raise { inner; start } } + Tag_with_content (Raise { inner; start })} | ("@return" | "@returns") - { RETURN } + { Tag_with_content RETURN } | ("@children_order") - { CHILDREN_ORDER } + { Tag_with_content CHILDREN_ORDER } | ("@toc_status") - { TOC_STATUS } + { Tag_with_content TOC_STATUS } | ("@order_category") - { ORDER_CATEGORY } + { Tag_with_content ORDER_CATEGORY } | ("@short_title") - { SHORT_TITLE } + { Tag_with_content SHORT_TITLE } | "@see" horizontal_space* '<' ([^ '>']* as url) '>' { let start = input.offset_to_location @@ Lexing.lexeme_start lexbuf in - See { inner = (URL, trim_horizontal_start url); start } } + Tag_with_content (See { inner = (URL, trim_horizontal_start url); start }) } | "@see" horizontal_space* '\'' ([^ '\'']* as filename) '\'' { let start = input.offset_to_location @@ Lexing.lexeme_start lexbuf in - See { inner = (File, trim_horizontal_start filename); start } } + Tag_with_content (See { inner = (File, trim_horizontal_start filename); start }) } | "@see" horizontal_space* '"' ([^ '"']* as name) '"' { let start = input.offset_to_location @@ Lexing.lexeme_start lexbuf in - See { inner = (Document, trim_horizontal_start name); start } } + Tag_with_content (See { inner = (Document, trim_horizontal_start name); start }) } | "@since" horizontal_space+ (([^ '\r' '\n']+) as inner) { let start = input.offset_to_location @@ Lexing.lexeme_start lexbuf in - Since { inner; start } } + Tag (Since { inner; start }) } | "@since" - { Since { inner = ""; start = input.offset_to_location @@ Lexing.lexeme_start lexbuf } } + { Tag (Since { inner = ""; start = input.offset_to_location @@ Lexing.lexeme_start lexbuf }) } | "@before" horizontal_space+ ((_ # space_char)+ as inner) { let start = input.offset_to_location @@ Lexing.lexeme_start lexbuf in - Before { inner; start } } + Tag_with_content (Before { inner; start }) } | "@version" horizontal_space+ (([^ '\r' '\n']+) as inner) - { Version { inner; start = input.offset_to_location @@ Lexing.lexeme_start lexbuf } } + { Tag (Version { inner; start = input.offset_to_location @@ Lexing.lexeme_start lexbuf }) } | "@version" - { Version { inner = ""; start = input.offset_to_location @@ Lexing.lexeme_start lexbuf } } + { Tag (Version { inner = ""; start = input.offset_to_location @@ Lexing.lexeme_start lexbuf }) } | "@canonical" horizontal_space+ (([^ '\r' '\n']+) as inner) - { Canonical { inner; start = input.offset_to_location @@ Lexing.lexeme_start lexbuf } } + { Tag (Canonical { inner; start = input.offset_to_location @@ Lexing.lexeme_start lexbuf }) } | "@canonical" - { Canonical { inner = ""; start = input.offset_to_location @@ Lexing.lexeme_start lexbuf } } + { Tag (Canonical { inner = ""; start = input.offset_to_location @@ Lexing.lexeme_start lexbuf }) } | "@inline" - { INLINE } + { Tag INLINE } | "@open" - { OPEN } + { Tag OPEN } | "@closed" - { CLOSED } + { Tag CLOSED } | "@hidden" - { HIDDEN } + { Tag HIDDEN } | "]}" { RIGHT_CODE_DELIMITER } @@ -693,15 +693,15 @@ and token input = parse | "@param" { warning lexbuf input Parse_error.truncated_param; - Param { inner = ""; start = input.offset_to_location @@ Lexing.lexeme_start lexbuf } } + Tag_with_content (Param { inner = ""; start = input.offset_to_location @@ Lexing.lexeme_start lexbuf }) } | ("@raise" | "@raises") as tag { warning lexbuf input (Parse_error.truncated_raise tag); - Raise { inner = ""; start = input.offset_to_location @@ Lexing.lexeme_start lexbuf } } + Tag_with_content (Raise { inner = ""; start = input.offset_to_location @@ Lexing.lexeme_start lexbuf }) } | "@before" { warning lexbuf input Parse_error.truncated_before; - Before { inner = ""; start = input.offset_to_location @@ Lexing.lexeme_start lexbuf } } + Tag_with_content (Before { inner = ""; start = input.offset_to_location @@ Lexing.lexeme_start lexbuf }) } | "@see" { warning lexbuf input Parse_error.truncated_see; diff --git a/src/parser/parser.mly b/src/parser/parser.mly index 916a4a7fdc..86afec8dc0 100644 --- a/src/parser/parser.mly +++ b/src/parser/parser.mly @@ -17,7 +17,7 @@ %token Style "{i" (* or '{b' etc *) -(* or '{C' or '{R', but this syntax has been deprecated and is only kept around so legacy codebases don't break :p *) +(* or '{C' or '{R', but this syntax has been deprecated *) %token Paragraph_style "{L" %token MODULES "{!modules:" @@ -25,6 +25,8 @@ %token Math_span "{m" %token Math_block "{math" +%token Verbatim "{v" + %token <(string option * string) Tokens.with_start_point> Raw_markup "{%%}" %token Code_block "{[]}" (* Self-contained code-block *) @@ -46,31 +48,16 @@ %token <(int * string option) Tokens.with_start_point> Section_heading "{N:" (* Tags *) -%token Author "@author" -%token DEPRECATED "@deprecated" -%token Param "@param" -%token Raise "@raise(s)" -%token RETURN "@return" -%token <(Tokens.internal_reference * string) Tokens.with_start_point> See "@see" -%token Since "@since" -%token Before "@before" -%token Version "@version" -%token Canonical "@canonical" -%token INLINE "@inline" -%token OPEN "@open" -%token CLOSED "@closed" -%token HIDDEN "@hidden" -%token CHILDREN_ORDER "@children_order" -%token TOC_STATUS "@toc_status" -%token ORDER_CATEGORY "@order_category" -%token SHORT_TITLE "@short_title" +%token Tag +%token Tag_with_content + +(* Links and references *) %token Simple_ref "{!" %token Ref_with_replacement "{{!" %token Simple_link "{:" %token Link_with_replacement "{{:" %token <(Tokens.media * Tokens.media_target) Tokens.with_start_point> Media "{(format)!" %token <(Tokens.media * Tokens.media_target * string) Tokens.with_start_point> Media_with_replacement "{(format):" (* where 'format' is audio, video, image *) -%token Verbatim "{v" %token END @@ -243,189 +230,20 @@ let tag := | with_content = tag_with_content; line_break?; { with_content } | bare = tag_bare; line_break?; { bare } -let tag_with_content := - | ~ = before; <> - | ~ = raise; <> - | ~ = see; <> - | ~ = param; <> - | ~ = deprecated; <> - | ~ = return; <> - | ~ = children_order; <> - | ~ = toc_status; <> - | ~ = order_category; <> - | ~ = short_title; <> - -let return := - | startpos = located(RETURN); horizontal_whitespace; children = located(sequence_nonempty(nestable_block_element)); { - let span = Loc.delimited startpos children in - Writer.map ~f:(fun c -> Loc.at span @@ `Return c) children.Loc.value - } - | pos = located(RETURN); horizontal_whitespace?; { - return (Loc.same pos @@ `Return []) - } - -let deprecated := - | startpos = located(DEPRECATED); horizontal_whitespace; children = locatedM(sequence_nonempty(nestable_block_element)); { - Writer.bind children ~f:(fun c -> - let span = Loc.delimited startpos c in - return @@ Loc.at span (`Deprecated c.Loc.value)) - } - | pos = located(DEPRECATED); horizontal_whitespace?; - { return @@ { pos with Loc.value = `Deprecated [] } } - | pos = located(DEPRECATED); horizontal_whitespace?; errloc = located(RIGHT_BRACE); { - let warning = - Writer.Warning (Parse_error.unpaired_right_brace @@ errloc.Loc.location) - in - Writer.return_warning ({ pos with Loc.value = `Deprecated [] }) warning - } - -let before := - | content = Before; horizontal_whitespace; children = sequence_nonempty(nestable_block_element); { - Writer.bind children ~f:(fun c -> - let Tokens.{ inner; start } = content in - let child_span = Loc.span @@ List.map Loc.location c in - let span = { child_span with start } in - let inner = Loc.at span @@ `Before (inner, c) in - return inner) - } - | content = located(Before); { - let Loc.{ value = Tokens.{ inner; start }; location } = content in - return { Loc.value = `Before (inner, []); location = { location with start } } - } - -let raise := - | content = located(Raise); horizontal_whitespace; children = located(sequence_nonempty(nestable_block_element)); { - let Loc.{ value = Tokens.{ inner; start }; location } = content in - let location = { location with start } in - Writer.bind children.Loc.value ~f:(fun c -> - let span = Loc.span @@ location :: List.map Loc.location c in - let inner = Loc.at span @@ `Raise (inner, c) in - return inner) - } - | content = located(Raise); horizontal_whitespace?; { - let Loc.{ value = Tokens.{ inner; start }; location } = content in - let location = { location with start } in - return @@ Loc.at location (`Raise ( inner, [])) - } - -let see := - | content = located(See); horizontal_whitespace; children = located(sequence_nonempty(nestable_block_element)); { - let Loc.{ value = Tokens.{ inner = (kind, href); start }; location } = content in - let span = Loc.delimited { content with location = { location with start }} children in - Writer.bind children.Loc.value ~f:(fun c -> - let inner = Loc.at span @@ `See (Tokens.to_ast_ref kind, href, c) in - return inner) - } - | content = located(See); horizontal_whitespace?; { - let Loc.{ value = Tokens.{ inner = (kind, href); start }; location } = content in - let location = { location with start } in - return @@ Loc.at location @@ `See (Tokens.to_ast_ref kind, href, []) - } - -let param := - | content = located(Param); horizontal_whitespace; children = sequence_nonempty(nestable_block_element); { - Writer.bind children ~f:(fun c -> - let Loc.{ value = Tokens.{ inner; start }; location } = content in - let span = Loc.span @@ { location with start } :: List.map Loc.location c in - let inner = Loc.at span @@ `Param (inner, c) in - return inner) - } - | content = located(Param); horizontal_whitespace?; { - let Loc.{ value = Tokens.{ inner; start }; location } = content in - return @@ Loc.at { location with start } (`Param (inner, [])) - } - - -let children_order := - | start = located(CHILDREN_ORDER); horizontal_whitespace; children = sequence(nestable_block_element); { - Writer.bind children ~f:(function - | _ :: _ as children -> - let span = Loc.span @@ start.Loc.location :: List.map Loc.location children in - return @@ Loc.at span (`Children_order children) - | [] -> - return @@ Loc.map (Fun.const @@ `Children_order []) start) - } - - -let toc_status := - | start = located(TOC_STATUS); horizontal_whitespace; children = sequence(nestable_block_element); { - Writer.bind children ~f:(function - | _ :: _ as children -> - let span = Loc.span @@ start.Loc.location :: List.map Loc.location children in - return @@ Loc.at span (`Toc_status children) - | [] -> - return @@ Loc.map (Fun.const @@ `Toc_status []) start) - } - -let order_category := - | start = located(ORDER_CATEGORY); horizontal_whitespace; children = sequence(nestable_block_element); { - Writer.bind children ~f:(function - | _ :: _ as children -> - let span = Loc.span @@ start.Loc.location :: List.map Loc.location children in - return @@ Loc.at span (`Order_category children) - | [] -> - return @@ Loc.map (Fun.const (`Order_category [])) start) +let tag_with_content := tag = located(Tag_with_content); horizontal_whitespace; children = sequence(nestable_block_element); { + Writer.map children ~f:(fun children -> + let Loc.{ value; location } = tag in + let start = Tokens.tag_with_content_start_point value |> Option.map (fun start -> { location with start }) |> Option.value ~default:location in + let span = Loc.span @@ start :: List.map Loc.location children in + Loc.at span @@ Tokens.tag_with_content children value) } - -let short_title := - | start = located(SHORT_TITLE); horizontal_whitespace; children = sequence(nestable_block_element); { - Writer.bind children ~f:(function - | _ :: _ as children -> - let span = Loc.span @@ start.Loc.location :: List.map Loc.location children in - return @@ Loc.at span (`Short_title children) - | [] -> - return @@ Loc.map (Fun.const (`Short_title [])) start) + | tag = located(Tag_with_content); { + return @@ { tag with Loc.value = Tokens.tag_with_content [] tag.Loc.value } } -let tag_bare := - | content = located(Version); horizontal_whitespace?; { - let Loc.{ value; location } = content in - let Tokens.{ inner; start } = value in - let span = { location with start } in - let what = Tokens.describe (Version value) in - let warning = - Writer.Warning (Parse_error.should_not_be_empty ~what span) - in - Writer.ensure has_content warning (return inner) - |> Writer.map ~f:(fun v -> Loc.at location @@ `Version v ) - } - | content = located(Since); { - let Loc.{ value; location } = content in - let Tokens.{ inner; start } = value in - let location = { location with start } in - let what = Tokens.describe (Since value) in - let warning = - Writer.Warning (Parse_error.should_not_be_empty ~what location) - in - Writer.ensure has_content warning (return inner) - |> Writer.map ~f:(fun v -> Loc.at location @@ `Since v ) - } - | content = located(Canonical); { - let Loc.{ value; location } = content in - let Tokens.{ inner; start } = value in - let what = Tokens.describe @@ Canonical value in - let warning = - let span = Loc.of_position $sloc in - Writer.Warning (Parse_error.should_not_be_empty ~what span) - in - let location = { location with start } in - Writer.ensure has_content warning @@ return inner - |> Writer.map ~f:(fun value -> Loc.at location @@ `Canonical (Loc.at location value)) - } - | content = located(Author); { - let Loc.{ value; location } = content in - let Tokens.{ inner; start } = value in - let what = Tokens.describe @@ Author value in - let warning = - Writer.Warning (Parse_error.should_not_be_empty ~what location) - in - Writer.ensure has_content warning @@ return inner - |> Writer.map ~f:(fun a -> { Loc.value = `Author a; location = { location with start } }) +let tag_bare := tag = located(Tag); { + return @@ Loc.map (Fun.const @@ Tokens.tag_bare tag) tag } - | pos = position(OPEN); whitespace?; { let loc = Loc.of_position pos in return @@ Loc.at loc `Open } - | pos = position(INLINE); whitespace?; { let loc = Loc.of_position pos in return @@ Loc.at loc `Inline } - | pos = position(CLOSED); whitespace?; { let loc = Loc.of_position pos in return @@ Loc.at loc `Closed } - | pos = position(HIDDEN); whitespace?; { let loc = Loc.of_position pos in return @@ Loc.at loc `Hidden } let inline_element(ws) := | ~ = inline_element_without_whitespace; <> diff --git a/src/parser/tokens.ml b/src/parser/tokens.ml index 3ecd5fc78c..be2e2b4dae 100644 --- a/src/parser/tokens.ml +++ b/src/parser/tokens.ml @@ -66,26 +66,68 @@ type token = | PLUS | BAR | Section_heading of (int * string option) with_start_point + | Tag of tag + | Tag_with_content of tag_with_content + | Raw_markup of (string option * string) with_start_point + | END +and tag = | Author of string with_start_point - | DEPRECATED - | Param of string with_start_point - | Raise of string with_start_point - | RETURN - | See of (internal_reference * string) with_start_point | Since of string with_start_point - | Before of string with_start_point | Version of string with_start_point | Canonical of string with_start_point | INLINE | OPEN | CLOSED | HIDDEN +and tag_with_content = + | DEPRECATED + | Before of string with_start_point + | Raise of string with_start_point + | Param of string with_start_point + | See of (internal_reference * string) with_start_point + | RETURN | CHILDREN_ORDER | TOC_STATUS | ORDER_CATEGORY | SHORT_TITLE - | Raw_markup of (string option * string) with_start_point - | END + +let to_ref : internal_reference -> [ `Url | `File | `Document ] = function + | URL -> `Url + | File -> `File + | Document -> `Document + +let tag_with_content + (content : Ast.nestable_block_element Loc.with_location list) : + tag_with_content -> Ast.tag = function + | DEPRECATED -> `Deprecated content + | Before { inner; _ } -> `Before (inner, content) + | Raise { inner; _ } -> `Raise (inner, content) + | Param { inner; _ } -> `Param (inner, content) + | See { inner = kind, href; _ } -> `See (to_ref kind, href, content) + | RETURN -> `Return content + | CHILDREN_ORDER -> `Children_order content + | TOC_STATUS -> `Toc_status content + | ORDER_CATEGORY -> `Order_category content + | SHORT_TITLE -> `Short_title content + +let tag_with_content_start_point : tag_with_content -> Loc.point option = + function + | Before { start; _ } + | Raise { start; _ } + | Param { start; _ } + | See { start; _ } -> + Some start + | _ -> None + +let tag_bare : tag Loc.with_location -> Ast.tag = function + | { value = Author s; _ } -> `Author s.inner + | { value = Since s; _ } -> `Since s.inner + | { value = Version s; _ } -> `Version s.inner + | { value = Canonical s; _ } as loc -> `Canonical { loc with value = s.inner } + | { value = INLINE; _ } -> `Inline + | { value = OPEN; _ } -> `Open + | { value = CLOSED; _ } -> `Closed + | { value = HIDDEN; _ } -> `Hidden let media_description ref_kind media_kind = let media_kind = @@ -143,24 +185,24 @@ let print : token -> string = function | Section_heading { inner = level, label; _ } -> let label = match label with None -> "" | Some label -> ":" ^ label in Printf.sprintf "'{%i%s'" level label - | Author _ -> "'@author'" - | DEPRECATED -> "'@deprecated'" - | Param _ -> "'@param'" - | Raise _ -> "'@raise'" - | RETURN -> "'@return'" - | See _ -> "'@see'" - | Since _ -> "'@since'" - | Before _ -> "'@before'" - | Version _ -> "'@version'" - | Canonical _ -> "'@canonical'" - | INLINE -> "'@inline'" - | OPEN -> "'@open'" - | CLOSED -> "'@closed'" - | HIDDEN -> "'@hidden'" - | CHILDREN_ORDER -> "'@children_order" - | TOC_STATUS -> "'@toc_status'" - | ORDER_CATEGORY -> "'@order_category'" - | SHORT_TITLE -> "@'short_title'" + | Tag (Author _) -> "'@author'" + | Tag_with_content DEPRECATED -> "'@deprecated'" + | Tag_with_content (Param _) -> "'@param'" + | Tag_with_content (Raise _) -> "'@raise'" + | Tag_with_content RETURN -> "'@return'" + | Tag_with_content (See _) -> "'@see'" + | Tag_with_content (Before _) -> "'@before'" + | Tag_with_content CHILDREN_ORDER -> "'@children_order" + | Tag_with_content TOC_STATUS -> "'@toc_status'" + | Tag_with_content ORDER_CATEGORY -> "'@order_category'" + | Tag_with_content SHORT_TITLE -> "@'short_title'" + | Tag (Since _) -> "'@since'" + | Tag (Version _) -> "'@version'" + | Tag (Canonical _) -> "'@canonical'" + | Tag INLINE -> "'@inline'" + | Tag OPEN -> "'@open'" + | Tag CLOSED -> "'@closed'" + | Tag HIDDEN -> "'@hidden'" | Raw_markup { inner = None, _; _ } -> "'{%...%}'" | Raw_markup { inner = Some target, _; _ } -> "'{%" ^ target ^ ":...%}'" | END -> "EOI" @@ -215,24 +257,24 @@ let describe : token -> string = function | BAR -> "'|'" | Section_heading { inner = level, _; _ } -> Printf.sprintf "'{%i ...}' (section heading)" level - | Author _ -> "'@author'" - | DEPRECATED -> "'@deprecated'" - | Param _ -> "'@param'" - | Raise _ -> "'@raise'" - | RETURN -> "'@return'" - | See _ -> "'@see'" - | Since _ -> "'@since'" - | Before _ -> "'@before'" - | Version _ -> "'@version'" - | Canonical _ -> "'@canonical'" - | INLINE -> "'@inline'" - | OPEN -> "'@open'" - | CLOSED -> "'@closed'" - | HIDDEN -> "'@hidden'" - | CHILDREN_ORDER -> "'@children_order" - | TOC_STATUS -> "'@toc_status'" - | ORDER_CATEGORY -> "'@order_category'" - | SHORT_TITLE -> "@'short_title'" + | Tag (Author _) -> "'@author'" + | Tag_with_content DEPRECATED -> "'@deprecated'" + | Tag_with_content (Param _) -> "'@param'" + | Tag_with_content (Raise _) -> "'@raise'" + | Tag_with_content RETURN -> "'@return'" + | Tag_with_content (See _) -> "'@see'" + | Tag_with_content (Before _) -> "'@before'" + | Tag_with_content CHILDREN_ORDER -> "'@children_order" + | Tag_with_content TOC_STATUS -> "'@toc_status'" + | Tag_with_content ORDER_CATEGORY -> "'@order_category'" + | Tag_with_content SHORT_TITLE -> "@'short_title'" + | Tag (Since _) -> "'@since'" + | Tag (Version _) -> "'@version'" + | Tag (Canonical _) -> "'@canonical'" + | Tag INLINE -> "'@inline'" + | Tag OPEN -> "'@open'" + | Tag CLOSED -> "'@closed'" + | Tag HIDDEN -> "'@hidden'" let empty_code_block = { @@ -311,29 +353,29 @@ let of_ast_ref : [ `Document | `File | `Url ] -> internal_reference = function | `File -> File | `Url -> URL -let to_ast_ref : internal_reference -> [ `Url | `File | `Document ] = function - | URL -> `Url - | File -> `File - | Document -> `Document - let describe_tag : Ast.tag -> string = function | `See (kind, _, _) -> - describe @@ See { inner = (of_ast_ref kind, ""); start = Loc.dummy_pos } - | `Author inner -> describe @@ Author { inner; start = Loc.dummy_pos } - | `Deprecated _ -> describe DEPRECATED - | `Param (inner, _) -> describe @@ Param { inner; start = Loc.dummy_pos } - | `Raise (inner, _) -> describe @@ Raise { inner; start = Loc.dummy_pos } - | `Return _ -> describe RETURN - | `Since inner -> describe @@ Since { inner; start = Loc.dummy_pos } - | `Before (inner, _) -> describe @@ Before { inner; start = Loc.dummy_pos } - | `Version inner -> describe @@ Version { inner; start = Loc.dummy_pos } - | `Closed -> describe CLOSED - | `Open -> describe OPEN + describe + @@ Tag_with_content + (See { inner = (of_ast_ref kind, ""); start = Loc.dummy_pos }) + | `Author inner -> describe @@ Tag (Author { inner; start = Loc.dummy_pos }) + | `Deprecated _ -> describe @@ Tag_with_content DEPRECATED + | `Param (inner, _) -> + describe @@ Tag_with_content (Param { inner; start = Loc.dummy_pos }) + | `Raise (inner, _) -> + describe @@ Tag_with_content (Raise { inner; start = Loc.dummy_pos }) + | `Return _ -> describe @@ Tag_with_content RETURN + | `Since inner -> describe @@ Tag (Since { inner; start = Loc.dummy_pos }) + | `Before (inner, _) -> + describe @@ Tag_with_content (Before { inner; start = Loc.dummy_pos }) + | `Version inner -> describe @@ Tag (Version { inner; start = Loc.dummy_pos }) + | `Closed -> describe @@ Tag CLOSED + | `Open -> describe @@ Tag OPEN | `Canonical Loc.{ value = inner; _ } -> - describe @@ Canonical { inner; start = Loc.dummy_pos } - | `Hidden -> describe HIDDEN - | `Inline -> describe INLINE - | `Children_order _ -> describe CHILDREN_ORDER - | `Toc_status _ -> describe TOC_STATUS - | `Order_category _ -> describe ORDER_CATEGORY - | `Short_title _ -> describe SHORT_TITLE + describe @@ Tag (Canonical { inner; start = Loc.dummy_pos }) + | `Hidden -> describe @@ Tag HIDDEN + | `Inline -> describe @@ Tag INLINE + | `Children_order _ -> describe @@ Tag_with_content CHILDREN_ORDER + | `Toc_status _ -> describe @@ Tag_with_content TOC_STATUS + | `Order_category _ -> describe @@ Tag_with_content ORDER_CATEGORY + | `Short_title _ -> describe @@ Tag_with_content SHORT_TITLE From 5fc1d05b4ecf5900e8ee8076fb9e18279892d858 Mon Sep 17 00:00:00 2001 From: faycarsons Date: Mon, 13 Jan 2025 14:57:05 -0500 Subject: [PATCH 131/150] Make paragraph parsing more robust, lex horizontal space folowing newlines --- src/parser/lexer.mll | 62 +++++++++++++++++++++---------------------- src/parser/parser.mly | 35 +++++++++++++++++------- src/parser/writer.ml | 3 +++ 3 files changed, 58 insertions(+), 42 deletions(-) diff --git a/src/parser/lexer.mll b/src/parser/lexer.mll index e29506c0fe..0cf6fe0515 100644 --- a/src/parser/lexer.mll +++ b/src/parser/lexer.mll @@ -150,7 +150,6 @@ type input = { mutable warnings : Warning.t list; } - type math_kind = Inline | Block @@ -200,7 +199,7 @@ let with_location_adjustments : k lexbuf input location value let warning = - with_location_adjustments @@ fun _lexbuf input location error -> + with_location_adjustments @@ fun _ input location error -> input.warnings <- error location :: input.warnings let reference_token lexbuf input media ~opening_delimiter ~start_offset ~inner = @@ -239,10 +238,11 @@ let trim_leading_space_or_accept_whitespace lexbuf input start_offset text = | '\t' | '\r' | '\n' -> text | exception Invalid_argument _ -> "" | _ -> - warning lexbuf - input - ~start_offset - ~end_offset:(start_offset + 2) + warning + lexbuf + input + ~start_offset + ~end_offset:(start_offset + 2) Parse_error.no_leading_whitespace_in_verbatim; text @@ -272,27 +272,24 @@ let emit_verbatim lexbuf input start_offset buffer = let emit_code_block lexbuf input ~start_offset ~content_offset ~metadata ~delimiter ~terminator ~content has_output = let content = Buffer.contents content |> trim_trailing_blank_lines in let content_location = input.offset_to_location content_offset in - let content = + let content = with_location_adjustments - (fun _ _location _ c -> + (fun _ _ _ c -> let first_line_offset = content_location.column in trim_leading_whitespace ~first_line_offset c) lexbuf input content + |> trim_leading_blank_lines + |> with_location_adjustments + ~adjust_end_by:terminator + ~start_offset:content_offset + (fun _ _ -> Loc.at) + lexbuf + input in - let content = trim_leading_blank_lines content in - let content = - with_location_adjustments - ~adjust_end_by:terminator - ~start_offset:content_offset - (fun _ _ -> Loc.at) - lexbuf - input - content - in - let inner = { metadata; delimiter; content } - and start = input.offset_to_location start_offset in + let inner = { metadata; delimiter; content } in + let start = input.offset_to_location start_offset in if has_output then Code_block_with_output { inner; start } else @@ -308,7 +305,7 @@ let heading_level lexbuf input level = let buffer_add_lexeme buffer lexbuf = Buffer.add_string buffer (Lexing.lexeme lexbuf) -let trim_horizontal_start : string -> string = fun s -> +let trim_start_horizontal_whitespace : string -> string = fun s -> let rec go idx = let c = s.[idx] in if Char.equal c ' ' @@ -415,14 +412,14 @@ and token input = parse { END } | ((horizontal_space* newline as prefix) - horizontal_space* ((newline)+ as suffix) as ws) + horizontal_space* (((newline)+ as suffix) as ws) horizontal_space*) { (* Account for the first newline we got *) update_content_newlines ~content:("\n" ^ prefix ^ suffix) lexbuf; Blank_line ws } - | (horizontal_space* newline as ws) + | (horizontal_space* (newline as ws) horizontal_space*) { Lexing.new_line lexbuf; Single_newline ws @@ -600,7 +597,7 @@ and token input = parse | "@author" horizontal_space+ (([^ '\r' '\n']*)? as author) { let start = input.offset_to_location @@ Lexing.lexeme_start lexbuf in - Tag (Author { inner = (trim_horizontal_start author); start }) } + Tag (Author { inner = (trim_start_horizontal_whitespace author); start }) } | "@deprecated" { Tag_with_content DEPRECATED } @@ -630,15 +627,15 @@ and token input = parse | "@see" horizontal_space* '<' ([^ '>']* as url) '>' { let start = input.offset_to_location @@ Lexing.lexeme_start lexbuf in - Tag_with_content (See { inner = (URL, trim_horizontal_start url); start }) } + Tag_with_content (See { inner = (URL, trim_start_horizontal_whitespace url); start }) } | "@see" horizontal_space* '\'' ([^ '\'']* as filename) '\'' { let start = input.offset_to_location @@ Lexing.lexeme_start lexbuf in - Tag_with_content (See { inner = (File, trim_horizontal_start filename); start }) } + Tag_with_content (See { inner = (File, trim_start_horizontal_whitespace filename); start }) } | "@see" horizontal_space* '"' ([^ '"']* as name) '"' { let start = input.offset_to_location @@ Lexing.lexeme_start lexbuf in - Tag_with_content (See { inner = (Document, trim_horizontal_start name); start }) } + Tag_with_content (See { inner = (Document, trim_start_horizontal_whitespace name); start }) } | "@since" horizontal_space+ (([^ '\r' '\n']+) as inner) { let start = input.offset_to_location @@ Lexing.lexeme_start lexbuf in @@ -861,12 +858,13 @@ and verbatim buffer last_false_terminator start_offset input = parse | eof { begin match last_false_terminator with | None -> - warning - lexbuf - input - (Parse_error.not_allowed + let not_allowed = + Parse_error.not_allowed ~what:(Tokens.describe END) - ~in_what:(Tokens.describe (Verbatim {inner = ""; start = Loc.dummy_pos}))) + ~in_what:(Tokens.describe (Verbatim {inner = ""; start = Loc.dummy_pos})) + in + warning lexbuf input not_allowed + | Some location -> warning lexbuf diff --git a/src/parser/parser.mly b/src/parser/parser.mly index 86afec8dc0..6fb0d09f92 100644 --- a/src/parser/parser.mly +++ b/src/parser/parser.mly @@ -81,16 +81,16 @@ let delimited_location(opening, rule, closing) := startpos = located(opening); i (* When we have to handle errors with Menhir's `error` token, we need - `Lexing.position` as opposed to `Loc.with_location` so that we can cleanly + `Lexing.position` as opposed to `Loc.with_location`. Because we can cleanly take a slice of the input text with those positions, which would be - difficult using `Loc.with_location` because of the way the lexbuf tracks - position + difficult using `Loc.with_location` *) let with_position(rule) == inner = rule; { (inner, $sloc) } let position(rule) == _ = rule; { $sloc } -(* Utilities for working inside the Writer.t monad *) +(* Wrappers around Menhir's built-in utilities that make working inside the + Writer.t monad easier *) let sequence(rule) == xs = list(rule); { Writer.sequence xs } let sequence_nonempty(rule) == xs = nonempty_list(rule); { Writer.sequence xs } let separated_nonempty_sequence(sep, rule) := xs = separated_nonempty_list(sep, rule); { Writer.sequence xs } @@ -123,6 +123,7 @@ let toplevel := | ~ = section_heading; <> | ~ = toplevel_error; <> +(* Tokens which cannot begin any block element *) let toplevel_error := (* Stray heavy list items, `{li` or `{-` *) | err = located(list_opening); horizontal_whitespace; children = sequence_nonempty(inline_element(horizontal_whitespace)); endpos = located(RIGHT_BRACE)?; { @@ -237,14 +238,16 @@ let tag_with_content := tag = located(Tag_with_content); horizontal_whitespace; let span = Loc.span @@ start :: List.map Loc.location children in Loc.at span @@ Tokens.tag_with_content children value) } - | tag = located(Tag_with_content); { + | tag = located(Tag_with_content); horizontal_whitespace?; { return @@ { tag with Loc.value = Tokens.tag_with_content [] tag.Loc.value } } -let tag_bare := tag = located(Tag); { + let tag_bare := tag = located(Tag); horizontal_whitespace?; { return @@ Loc.map (Fun.const @@ Tokens.tag_bare tag) tag } +(* INLINE ELEMENTS *) + let inline_element(ws) := | ~ = inline_element_without_whitespace; <> | s = located(ws); { return s } @@ -406,13 +409,12 @@ let link := (* LIST *) -(*TODO: For some reason this is always matching the `should_begin it's own line` branch?? *) let list_light_start := | MINUS; { Tokens.MINUS } | PLUS; { Tokens.PLUS } let list_light_item := - | start = located(list_light_start); item = nestable_block_element; { + | start = located(list_light_start); horizontal_whitespace*; item = nestable_block_element; { let Loc.{ value; location } = start in light_list_item value <$> (Loc.with_start_location location <$> item) } @@ -583,6 +585,9 @@ let table_heavy := let cell_inner := | ~ = inline_element(horizontal_whitespace); <> + | s = located(symbols_without_bar); { + return @@ Loc.map (fun w -> `Word w) s + } | (start_pos, end_pos) = position(error); { let span = Loc.of_position (start_pos, end_pos) in let illegal = Writer.InputNeeded (fun input -> @@ -717,8 +722,18 @@ let verbatim := verbatim = located(Verbatim); { *> return verbatim } -let paragraph := items = sequence_nonempty(inline_element(horizontal_whitespace)); { - paragraph <$> items +let symbols_without_bar := + | PLUS; { "+" } + | MINUS; { "-" } +let symbols := + | BAR; { "|" } + | ~ = symbols_without_bar; <> +let paragraph_middle_element := + | ~ = inline_element(horizontal_whitespace); <> + | s = located(symbols); { return @@ Loc.map (fun w -> `Word w) s } + +let paragraph := x = inline_element_without_whitespace; xs = sequence(paragraph_middle_element); { + paragraph <$> Writer.map2 ~f:List.cons x xs } let code_block := diff --git a/src/parser/writer.ml b/src/parser/writer.ml index 70f0909204..847ba565ab 100644 --- a/src/parser/writer.ml +++ b/src/parser/writer.ml @@ -16,6 +16,9 @@ let bind : 'a t -> f:('a -> 'b t) -> 'b t = let map : f:('a -> 'b) -> 'a t -> 'b t = fun ~f (Writer (x, ws)) -> Writer (f x, ws) +let map2 : f:('a -> 'b -> 'c) -> 'a t -> 'b t -> 'c t = + fun ~f (Writer (a, aws)) (Writer (b, bws)) -> Writer (f a b, aws @ bws) + let ( <$> ) f w = map ~f w let seq : 'a t -> 'b t -> 'b t = From bd16e11420597ac5350335d7aa65bf00551cb436 Mon Sep 17 00:00:00 2001 From: faycarsons Date: Mon, 13 Jan 2025 15:59:11 -0500 Subject: [PATCH 132/150] Fix whitespace in lexer, consume all leading whitespace in comment --- src/parser/lexer.mll | 8 ++- src/parser/parser.mly | 18 +++--- src/parser/test_driver/tester.ml | 96 +++++++++++++++++++++++++++++++- 3 files changed, 108 insertions(+), 14 deletions(-) diff --git a/src/parser/lexer.mll b/src/parser/lexer.mll index 0cf6fe0515..1fb161c24f 100644 --- a/src/parser/lexer.mll +++ b/src/parser/lexer.mll @@ -202,6 +202,10 @@ let warning = with_location_adjustments @@ fun _ input location error -> input.warnings <- error location :: input.warnings +(* TODO: Opening delimiter position and parameter position need to be tweaked here + I think we probably need inner to be `string Loc.with_location` since + a `Reference takes one as a parameter. +*) let reference_token lexbuf input media ~opening_delimiter ~start_offset ~inner = let start = input.offset_to_location start_offset in match opening_delimiter with @@ -412,14 +416,14 @@ and token input = parse { END } | ((horizontal_space* newline as prefix) - horizontal_space* (((newline)+ as suffix) as ws) horizontal_space*) + horizontal_space* (((newline)+ as suffix) as ws)) { (* Account for the first newline we got *) update_content_newlines ~content:("\n" ^ prefix ^ suffix) lexbuf; Blank_line ws } - | (horizontal_space* (newline as ws) horizontal_space*) + | (horizontal_space* (newline as ws)) { Lexing.new_line lexbuf; Single_newline ws diff --git a/src/parser/parser.mly b/src/parser/parser.mly index 6fb0d09f92..8560a2146c 100644 --- a/src/parser/parser.mly +++ b/src/parser/parser.mly @@ -114,8 +114,8 @@ let line_break := (* ENTRY *) let main := - | any_whitespace?; ~ = sequence_nonempty(toplevel); END; <> - | any_whitespace?; END; { return [] } + | any_whitespace*; ~ = sequence_nonempty(toplevel); END; <> + | any_whitespace*; END; { return [] } let toplevel := | block = nestable_block_element; { (block :> Ast.block_element Loc.with_location Writer.t) } @@ -126,7 +126,7 @@ let toplevel := (* Tokens which cannot begin any block element *) let toplevel_error := (* Stray heavy list items, `{li` or `{-` *) - | err = located(list_opening); horizontal_whitespace; children = sequence_nonempty(inline_element(horizontal_whitespace)); endpos = located(RIGHT_BRACE)?; { + | err = located(list_opening); children = sequence_nonempty(inline_element(horizontal_whitespace)); endpos = located(RIGHT_BRACE)?; { let default = Writer.get children |> List.rev @@ -231,7 +231,7 @@ let tag := | with_content = tag_with_content; line_break?; { with_content } | bare = tag_bare; line_break?; { bare } -let tag_with_content := tag = located(Tag_with_content); horizontal_whitespace; children = sequence(nestable_block_element); { +let tag_with_content := tag = located(Tag_with_content); children = sequence(nestable_block_element); { Writer.map children ~f:(fun children -> let Loc.{ value; location } = tag in let start = Tokens.tag_with_content_start_point value |> Option.map (fun start -> { location with start }) |> Option.value ~default:location in @@ -414,7 +414,7 @@ let list_light_start := | PLUS; { Tokens.PLUS } let list_light_item := - | start = located(list_light_start); horizontal_whitespace*; item = nestable_block_element; { + | start = located(list_light_start); horizontal_whitespace?; item = nestable_block_element; { let Loc.{ value; location } = start in light_list_item value <$> (Loc.with_start_location location <$> item) } @@ -444,7 +444,7 @@ let list_opening := | DASH; { Tokens.DASH } let item_heavy := - | start_pos = located(list_opening); whitespace; items = sequence(nestable_block_element); RIGHT_BRACE; any_whitespace*; { + | start_pos = located(list_opening); items = sequence(nestable_block_element); RIGHT_BRACE; any_whitespace*; { let Loc.{ value = token; location } = start_pos in let warning = Writer.Warning (Parse_error.should_not_be_empty ~what:(Tokens.describe token) location) @@ -462,7 +462,7 @@ let item_heavy := Writer.ensure not_empty should_not_be_empty items |> Writer.warning should_be_followed_by_whitespace } - | startpos = located(DASH); items = sequence_nonempty(nestable_block_element)?; endpos = located(END); { + | startpos = located(list_opening); items = sequence_nonempty(nestable_block_element)?; endpos = located(END); { let end_not_allowed = Writer.Warning (Parse_error.end_not_allowed ~in_what:(Tokens.describe DASH) endpos.Loc.location) in @@ -526,7 +526,7 @@ let odoc_list := (* TABLES *) let cell_heavy := - | cell_kind = Table_cell; whitespace*; children = sequence_nonempty(nestable_block_element); RIGHT_BRACE; whitespace*; + | cell_kind = Table_cell; children = sequence_nonempty(nestable_block_element); RIGHT_BRACE; whitespace*; { Writer.map ~f:(fun c -> (c, cell_kind)) children } | cell_kind = Table_cell; RIGHT_BRACE; whitespace*; { return ([], cell_kind) } @@ -732,7 +732,7 @@ let paragraph_middle_element := | ~ = inline_element(horizontal_whitespace); <> | s = located(symbols); { return @@ Loc.map (fun w -> `Word w) s } -let paragraph := x = inline_element_without_whitespace; xs = sequence(paragraph_middle_element); { +let paragraph := horizontal_whitespace?; x = inline_element_without_whitespace; xs = sequence(paragraph_middle_element); { paragraph <$> Writer.map2 ~f:List.cons x xs } diff --git a/src/parser/test_driver/tester.ml b/src/parser/test_driver/tester.ml index 99c1b51c07..d457c6560e 100644 --- a/src/parser/test_driver/tester.ml +++ b/src/parser/test_driver/tester.ml @@ -192,7 +192,95 @@ let dashes = ("option", "--option"); ] -let isolated = [ ("empty line", " \n @foo") ] +let light_table_tests = + [ + ("empty_table_light", "{t }"); + ("unclosed_table", "{t "); + ("simple", {| + {t + | a | + } + |}); + ("stars", "{t\n |a| *b*|\n |*c| d* |\n}"); + ("backquotes", "{t\n | `a |`\n}"); + ("no_header", "{t\n |---|---|\n | x | y |\n}"); + ("no_align", "{t\n | x | y |\n | x | y |\n}"); + ("only_align", "{t\n |--|--|\n}"); + ("no_data", "{t\n | x | y |\n |---|---|\n}"); + ("alignment", "{t\n | a | b | c | d |\n |---|:--|--:|:-:|\n}"); + ("no_bars", "{t\n a | b | c | d\n ---|:--|--:|:-:\n a | b | c | d\n}"); + ( "light_table_new_lines", + "{t\n\n\ + \ | a | b | c | d |\n\n\ + \ |---|---|---|---|\n\n\ + \ | a | b | c | d |\n\n\ + }" ); + ( "light_table_markup", + "{t\n\ + \ | {i a} {:google.com} \\t | | {m b} {e c} {% xyz %} | {b d} [foo] |\n\ + \ |---|---|---|---|\n\ + }" ); + ( "light_table_markup_with_newlines", + "{t | h1 | h2 |\n\ + \ |--------------|-------------|\n\ + \ | {e with\n\ + \ newlines} | {b d} [foo] |\n\ + }" ); + ("no_space", "{t\n | a | b |c| d |\n |---|--:|:--|:-:|\n}"); + ( "multiple_headers", + "{t\n||a|b|\n|:-|---:|\n|c|d|\n|cc|dd|\n|-:|:-:|\n|e|f|\n|g|h||\n}" ); + ("block_element_in_cell", "{t\n| {[ a ]} | b |\n|---|---|\n}"); + ("block_element_in_row", "{t\n{[ a ]}\n| a | b |\n|---|---|\n}"); + ("more_cells_later", "{t\n | x | y |\n |---|---|\n | x | y | z |\n}"); + ("less_cells_later", "{t\n | x | y |\n |---|---|\n x \n}"); + ( "multiple_word", + "{t\n\ + | Header and other word |\n\ + |-----------------------|\n\ + | cell and other words |\n\ + }" ); + ( "multiple_word_header", + "{t\n\ + | Header other word |\n\ + |-------------------|\n\ + | Header other word |\n\ + }" ); + ] + +let explicit_list_tests = + [ + ("basic", "{ul {li foo}}"); + ("ordered", "{ol {li foo}}"); + ("two_items", "{ul {li foo} {li bar}}"); + ("items_on_separate_lines", "{ul {li foo}\n{li bar}}"); + ("blank_line", "{ul {li foo}\n\n{li bar}}"); + ("blank_line_in_item", "{ul {li foo\n\nbar}}"); + ("junk", "{ul foo}"); + ("junk_with_no_whitespace", "{ulfoo}"); + ("empty", "{ul}"); + ("unterminated_list", "{ul"); + ("no_whitespace", "{ul{li foo}}"); + ("whitespace_at_end_of_item", "{ul {li foo\n\n\n}}"); + ("unterminated_li_syntax", "{ul {li foo"); + ("unterminated_left_curly_brace", "{ul {- foo"); + ("empty_li_styntax", "{ul {li }}"); + ("empty_left_curly_brace", "{ul {- }}"); + ("li_syntax_without_whitespace", "{ul {lifoo}}"); + ("li_syntax_followed_by_newline", "{ul {li\nfoo}}"); + ("li_syntax_followed_by_cr_lf", "{ul {li\r\nfoo}}"); + ("li_syntax_followed_by_blank_line", "{ul {li\n\nfoo}}"); + ("left_curly_brace_without_whitespace", "{ul {-foo}}"); + ("mixed_list_items", "{ul {li foo} {- bar}}"); + ("nested", "{ul {li {ul {li foo}}}}"); + ("shorthand_in_explicit", "{ul {li - foo\n- bar}}"); + ("explicit_in_shorthand", "- {ul {li foo}}"); + ("bare_li_syntax", "{li foo}"); + ("bare_left_curly_brace", "{- foo"); + ("after_code_block", "{[foo]} {ul {li bar}}"); + ] + +let isolated = + [ ("light list horizontal offset", "- foo bar baz\n - ba ba ba") ] (* Cases (mostly) taken from the 'odoc for library authors' document *) let documentation_cases = @@ -236,7 +324,7 @@ let documentation_cases = let all_tests = code_cases @ error_recovery @ open_t @ tags @ utf8 @ bad_markup - @ documentation_cases + @ documentation_cases @ explicit_list_tests open Test.Serialize @@ -301,7 +389,7 @@ let run_test (label, case) = Right (mkfailure label exns (Option.get !offending_token) - !failure_index (List.rev !tokens)) + !failure_index !tokens) let sep = String.init 80 @@ Fun.const '-' @@ -340,8 +428,10 @@ let () = | "utf8" | "u" -> utf8 | "isolated" | "i" -> isolated | "tags" | "t" -> tags + | "light" | "lt" -> light_table_tests | "all" -> all_tests | "dashes" | "ds" -> dashes + | "list" | "l" -> explicit_list_tests | _ -> print_endline "unrecognized argument - running documentation_cases"; documentation_cases) From 9a1d5a66123e4919be7d95a0e2d07237709d2d36 Mon Sep 17 00:00:00 2001 From: faycarsons Date: Tue, 14 Jan 2025 10:53:29 -0500 Subject: [PATCH 133/150] Fix heavy list ambiguity --- src/parser/parser.mly | 33 ++++++--------------------------- 1 file changed, 6 insertions(+), 27 deletions(-) diff --git a/src/parser/parser.mly b/src/parser/parser.mly index 8560a2146c..81403cc2a6 100644 --- a/src/parser/parser.mly +++ b/src/parser/parser.mly @@ -79,12 +79,10 @@ let delimited_location(opening, rule, closing) := startpos = located(opening); i Loc.at span inner } -(* - When we have to handle errors with Menhir's `error` token, we need +(* When we have to handle errors with Menhir's `error` token, we need `Lexing.position` as opposed to `Loc.with_location`. Because we can cleanly take a slice of the input text with those positions, which would be - difficult using `Loc.with_location` -*) + difficult using `Loc.with_location` *) let with_position(rule) == inner = rule; { (inner, $sloc) } let position(rule) == _ = rule; { $sloc } @@ -93,8 +91,8 @@ let position(rule) == _ = rule; { $sloc } Writer.t monad easier *) let sequence(rule) == xs = list(rule); { Writer.sequence xs } let sequence_nonempty(rule) == xs = nonempty_list(rule); { Writer.sequence xs } -let separated_nonempty_sequence(sep, rule) := xs = separated_nonempty_list(sep, rule); { Writer.sequence xs } -let separated_sequence(sep, rule) := xs = separated_list(sep, rule); { Writer.sequence xs } +let sequence_separated_nonempty(sep, rule) := xs = separated_nonempty_list(sep, rule); { Writer.sequence xs } +let sequence_separated(sep, rule) := xs = separated_list(sep, rule); { Writer.sequence xs } (* WHITESPACE *) let horizontal_whitespace := ~ = Space; <`Space> @@ -113,6 +111,7 @@ let line_break := (* ENTRY *) +(* Consume any leading whitespace *) let main := | any_whitespace*; ~ = sequence_nonempty(toplevel); END; <> | any_whitespace*; END; { return [] } @@ -157,19 +156,6 @@ let toplevel_error := else Writer.warning unclosed m) } - (* TODO: These(bar, +, -) need to be handled in paragraphs, where they should - be turned into text without emitting a warning - *) - | errloc = position(BAR); whitespace?; { - let span = Loc.of_position errloc in - let warning = - let what = Tokens.describe BAR in - Writer.Warning (Parse_error.bad_markup what span) - in - let as_text = Loc.at span @@ `Word "|" in - let node = (Loc.same as_text @@ `Paragraph [ as_text ]) in - Writer.return_warning node warning - } | errloc = position(RIGHT_BRACE); whitespace?; { let span = Loc.of_position errloc in let warning = @@ -430,7 +416,7 @@ let list_light_item := } let list_light := - | children = separated_nonempty_sequence(whitespace*, list_light_item); { + | children = sequence_separated_nonempty(whitespace*, list_light_item); { Writer.bind children ~f:(fun c -> let (list_kind, c) = split_light_list_items c in let span = Loc.span @@ List.map Loc.location c in @@ -444,13 +430,6 @@ let list_opening := | DASH; { Tokens.DASH } let item_heavy := - | start_pos = located(list_opening); items = sequence(nestable_block_element); RIGHT_BRACE; any_whitespace*; { - let Loc.{ value = token; location } = start_pos in - let warning = - Writer.Warning (Parse_error.should_not_be_empty ~what:(Tokens.describe token) location) - in - Writer.ensure not_empty warning items - } | startpos = located(list_opening); items = sequence(nestable_block_element); endpos = located(RIGHT_BRACE); any_whitespace*; { let span = Loc.delimited startpos endpos in let should_be_followed_by_whitespace = From 116987b24d26d60d13cbed97c0efa9f45f585d54 Mon Sep 17 00:00:00 2001 From: faycarsons Date: Tue, 14 Jan 2025 14:24:14 -0500 Subject: [PATCH 134/150] Refactor references/media to pass location --- src/parser/lexer.mll | 160 +++++++++++++++++++++++++++------------ src/parser/parser.mly | 57 +++++++++----- src/parser/parser_aux.ml | 2 +- src/parser/tokens.ml | 51 +++++++++---- 4 files changed, 191 insertions(+), 79 deletions(-) diff --git a/src/parser/lexer.mll b/src/parser/lexer.mll index 1fb161c24f..24eabe9744 100644 --- a/src/parser/lexer.mll +++ b/src/parser/lexer.mll @@ -23,8 +23,9 @@ let unescape_word : string -> string = fun s -> let c, increment = match c with | '\\' -> - if index + 1 < String.length s then - match s.[index + 1] with + let next = succ index in + if next < String.length s then + match s.[next] with | '{' | '}' | '[' | ']' | '@' as c -> c, 2 | _ -> c, 1 else c, 1 @@ -53,8 +54,8 @@ let trim_leading_blank_lines : string -> string = fun s -> String.length s else match s.[index] with - | ' ' | '\t' | '\r' -> scan_for_last_newline (index + 1) trim_until - | '\n' -> scan_for_last_newline (index + 1) (index + 1) + | ' ' | '\t' | '\r' -> scan_for_last_newline (succ index) trim_until + | '\n' -> let next = succ index in scan_for_last_newline next next | _ -> trim_until in let trim_until = scan_for_last_newline 0 0 in @@ -206,30 +207,38 @@ let warning = I think we probably need inner to be `string Loc.with_location` since a `Reference takes one as a parameter. *) -let reference_token lexbuf input media ~opening_delimiter ~start_offset ~inner = +let reference_token : + Lexing.lexbuf + -> input + -> 'a + -> opening_delimiter:string + -> start_offset:int + -> content:string Loc.with_location + -> token + = fun lexbuf input media ~opening_delimiter ~start_offset ~content -> let start = input.offset_to_location start_offset in match opening_delimiter with - | "{!" -> Simple_ref { inner; start } - | "{{!" -> Ref_with_replacement { inner; start } - | "{:" -> Simple_link { inner; start } - | "{{:" -> Link_with_replacement { inner; start } - - | "{image!" -> Media { inner = (Reference inner, Image); start } - | "{image:" -> Media { inner = (Link inner, Image); start } - | "{audio!" -> Media { inner = (Reference inner, Audio); start } - | "{audio:" -> Media { inner = (Link inner, Audio); start } - | "{video!" -> Media { inner = (Reference inner, Video); start } - | "{video:" -> Media { inner = (Link inner, Video); start } + | "{!" -> Simple_ref { inner = content; start } + | "{{!" -> Ref_with_replacement { inner = content; start } + | "{:" -> Simple_link { inner = content.Loc.value; start } + | "{{:" -> Link_with_replacement { inner = content.Loc.value; start } + + | "{image!" -> Media { inner = (Reference content, Image); start } + | "{image:" -> Media { inner = (Link content, Image); start } + | "{audio!" -> Media { inner = (Reference content, Audio); start } + | "{audio:" -> Media { inner = (Link content, Audio); start } + | "{video!" -> Media { inner = (Reference content, Video); start } + | "{video:" -> Media { inner = (Link content, Video); start } | _ -> let target, kind = match opening_delimiter with - | "{{image!" -> Reference inner, Image - | "{{image:" -> Link inner, Image - | "{{audio!" -> Reference inner, Audio - | "{{audio:" -> Link inner, Audio - | "{{video!" -> Reference inner, Video - | "{{video:" -> Link inner, Video + | "{{image!" -> Reference content, Image + | "{{image:" -> Link content, Image + | "{{audio!" -> Reference content, Audio + | "{{audio:" -> Link content, Audio + | "{{video!" -> Reference content, Video + | "{{video:" -> Link content, Video | _ -> assert false in let token_descr = Tokens.describe (Media_with_replacement { inner = (target, kind, ""); start }) in @@ -318,6 +327,14 @@ let trim_start_horizontal_whitespace : string -> string = fun s -> in go 0 +let or_insert_lazy default = + function + | None -> Some (default ()) + | x -> x + +let ensure_lexeme_start lexbuf = + or_insert_lazy (Fun.const @@ Lexing.lexeme_start lexbuf) + } let markup_char = @@ -353,63 +370,112 @@ let language_tag_char = let delim_char = ['a'-'z' 'A'-'Z' '0'-'9' '_' ] -rule reference_paren_content input start ref_offset start_offset depth_paren buffer = +rule reference_paren_content input opening_delimiter start_offset content_offset depth_paren buffer = parse | '(' { buffer_add_lexeme buffer lexbuf ; - reference_paren_content input start ref_offset start_offset - (depth_paren + 1) buffer lexbuf } + reference_paren_content + input + opening_delimiter + start_offset + content_offset + (succ depth_paren) + buffer + lexbuf } | ')' { buffer_add_lexeme buffer lexbuf ; if depth_paren = 0 then - reference_content input start ref_offset buffer lexbuf + reference_content + input + opening_delimiter + start_offset + content_offset + buffer + lexbuf else - reference_paren_content input start ref_offset start_offset - (depth_paren - 1) buffer lexbuf } + reference_paren_content + input + opening_delimiter + start_offset + content_offset + (pred depth_paren) + buffer + lexbuf } | eof { let unclosed_bracket = Parse_error.unclosed_bracket ~bracket:"(" in warning lexbuf input ~start_offset unclosed_bracket; - Buffer.contents buffer + let start = input.offset_to_location @@ Option.value ~default:(Lexing.lexeme_start lexbuf) content_offset + and end_ = input.offset_to_location @@ Lexing.lexeme_end lexbuf in + Loc.{ value = Buffer.contents buffer; location = { start; end_; file = input.file }} } | _ { buffer_add_lexeme buffer lexbuf ; - reference_paren_content input start ref_offset start_offset depth_paren - buffer lexbuf } + reference_paren_content + input + opening_delimiter + start_offset + content_offset + depth_paren + buffer + lexbuf } -and reference_content input start start_offset buffer = parse +and reference_content input opening_delimiter start_offset content_offset buffer = + parse | '}' { - Buffer.contents buffer + let start = input.offset_to_location @@ Option.value ~default:(Lexing.lexeme_start lexbuf) content_offset + and end_ = input.offset_to_location @@ Lexing.lexeme_end lexbuf in + Loc.{ value = Buffer.contents buffer; location = { start; end_; file = input.file }} } | '(' { - buffer_add_lexeme buffer lexbuf ; - reference_paren_content input start start_offset - (Lexing.lexeme_start lexbuf) 0 buffer lexbuf + buffer_add_lexeme buffer lexbuf; + reference_paren_content + input + opening_delimiter + start_offset + (ensure_lexeme_start lexbuf content_offset) (* get the content offset if we haven't *) + 0 + buffer + lexbuf } | '"' [^ '"']* '"' { buffer_add_lexeme buffer lexbuf ; - reference_content input start start_offset buffer lexbuf + reference_content + input + opening_delimiter + start_offset + (ensure_lexeme_start lexbuf content_offset) + buffer + lexbuf } | eof { let unclosed_bracket = - Parse_error.unclosed_bracket ~bracket:start + Parse_error.unclosed_bracket ~bracket:opening_delimiter in warning lexbuf input ~start_offset unclosed_bracket; - Buffer.contents buffer + let start = input.offset_to_location @@ Option.value ~default:(Lexing.lexeme_start lexbuf) content_offset + and end_ = input.offset_to_location @@ Lexing.lexeme_end lexbuf in + Loc.{ value = Buffer.contents buffer; location = { start; end_; file = input.file }} } | _ { - buffer_add_lexeme buffer lexbuf ; - reference_content input start start_offset buffer lexbuf } + buffer_add_lexeme buffer lexbuf; + reference_content + input + opening_delimiter + start_offset + (ensure_lexeme_start lexbuf content_offset) + buffer + lexbuf } and token input = parse | horizontal_space* eof @@ -491,10 +557,10 @@ and token input = parse | (reference as opening_delimiter) { let start_offset = Lexing.lexeme_start lexbuf in - let inner = - reference_content input opening_delimiter start_offset (Buffer.create 16) lexbuf + let content = + reference_content input opening_delimiter start_offset None (Buffer.create 16) lexbuf in - reference_token lexbuf input media ~start_offset ~opening_delimiter ~inner + reference_token lexbuf input media ~start_offset ~opening_delimiter ~content } | "{[" @@ -733,7 +799,7 @@ and code_span buffer nesting_level start_offset input = parse | '[' { Buffer.add_char buffer '['; - code_span buffer (nesting_level + 1) start_offset input lexbuf } + code_span buffer (succ nesting_level) start_offset input lexbuf } | '\\' ('[' | ']' as c) { Buffer.add_char buffer c; @@ -825,12 +891,12 @@ and media tok_descr buffer nesting_level start_offset input = parse Buffer.contents buffer else begin Buffer.add_char buffer '}'; - media tok_descr buffer (nesting_level - 1) start_offset input lexbuf + media tok_descr buffer (pred nesting_level) start_offset input lexbuf end } | '{' { Buffer.add_char buffer '{'; - media tok_descr buffer (nesting_level + 1) start_offset input lexbuf } + media tok_descr buffer (succ nesting_level) start_offset input lexbuf } | ("\\{" | "\\}") as s { Buffer.add_string buffer s; media tok_descr buffer nesting_level start_offset input lexbuf } diff --git a/src/parser/parser.mly b/src/parser/parser.mly index 81403cc2a6..08dad200bb 100644 --- a/src/parser/parser.mly +++ b/src/parser/parser.mly @@ -52,8 +52,8 @@ %token Tag_with_content (* Links and references *) -%token Simple_ref "{!" -%token Ref_with_replacement "{{!" +%token Simple_ref "{!" +%token Ref_with_replacement "{{!" %token Simple_link "{:" %token Link_with_replacement "{{:" %token <(Tokens.media * Tokens.media_target) Tokens.with_start_point> Media "{(format)!" @@ -319,22 +319,23 @@ let style := (* LINKS + REFS *) +(* TODO: See comment @ lexer.mll:205 *) let reference := | ref_body = located(Simple_ref); { let Loc.{ value = Tokens.{ inner; start }; location } = ref_body in let span = { location with start } in - return @@ Loc.at span @@ `Reference (`Simple, Loc.at location inner, []) + return @@ Loc.at span @@ `Reference (`Simple, inner, []) } | ref_body = Ref_with_replacement; children = sequence_nonempty(inline_element(whitespace)); endpos = located(RIGHT_BRACE); { Writer.bind children ~f:(fun c -> let Tokens.{ inner; start } = ref_body in let span = { endpos.Loc.location with start } in - return @@ Loc.at span @@ `Reference (`With_text, Loc.at span inner, trim_start c)) + return @@ Loc.at span @@ `Reference (`With_text, inner, trim_start c)) } | ref_body = Ref_with_replacement; endpos = located(RIGHT_BRACE); { let Tokens.{ inner; start } = ref_body in let span = { endpos.Loc.location with start } in - let node = Loc.at span @@ `Reference (`With_text, Loc.at span inner, []) in + let node = Loc.at span @@ `Reference (`With_text, inner, []) in let warning = let what = Tokens.describe @@ Ref_with_replacement ref_body in Writer.Warning (Parse_error.should_not_be_empty ~what span) @@ -349,7 +350,7 @@ let reference := Writer.Warning (Parse_error.end_not_allowed ~in_what span) in let* children = Option.value ~default:(return []) children in - let node = Loc.at span @@ `Reference (`With_text, Loc.at span inner, children) in + let node = Loc.at span @@ `Reference (`With_text, inner, children) in Writer.return_warning node not_allowed } @@ -641,22 +642,42 @@ let table := let media := | content = located(Media); whitespace*; { - let Loc.{ value = Tokens.{ inner; start }; location } = content in + let Loc.{ value = Tokens.{ inner = ref_kind, media_kind; start }; location } = content in let span = { location with start } in - let (located_media_kind, media_href) = split_simple_media @@ Loc.at span inner in - let wrapped_located_kind = Loc.map href_of_media located_media_kind in - let kind = media_kind_of_target media_href in - let inner = Loc.at span @@ `Media (`Simple, wrapped_located_kind, "", kind) in - return inner + let ref_kind = + let open Tokens in + match ref_kind with + | Reference refr -> Loc.map (fun r -> `Reference r) refr + | Link link -> Loc.map (fun l -> `Link l) link + in + let media_kind = + let open Tokens in + match media_kind with + | Audio -> `Audio + | Image -> `Image + | Video -> `Video + in + let inner = Loc.at span @@ `Media (`Simple, ref_kind, "", media_kind) in + return inner } | content = located(Media_with_replacement); whitespace*; { - let Loc.{ value = Tokens.{ inner; start }; location } = content in + let Loc.{ value = Tokens.{ inner = (ref_kind, media_kind, content); start }; location } = content in let span = { location with start } in - let (located_media_kind, media_href, content) = split_replacement_media @@ Loc.at span inner in - let wrapped_located_kind = Loc.map href_of_media located_media_kind in - let kind = media_kind_of_target media_href in - let inner = Loc.at span @@ `Media (`With_text, wrapped_located_kind, content, kind) in - return inner + let ref_kind = + let open Tokens in + match ref_kind with + | Reference refr -> Loc.map (fun r -> `Reference r) refr + | Link link -> Loc.map (fun l -> `Link l) link + in + let media_kind = + let open Tokens in + match media_kind with + | Audio -> `Audio + | Image -> `Image + | Video -> `Video + in + let inner = Loc.at span @@ `Media (`With_text, ref_kind, content, media_kind) in + return inner } (* TOP-LEVEL ELEMENTS *) diff --git a/src/parser/parser_aux.ml b/src/parser/parser_aux.ml index 374b6f24ab..9f42128620 100644 --- a/src/parser/parser_aux.ml +++ b/src/parser/parser_aux.ml @@ -135,7 +135,7 @@ let unclosed_table in Writer.warning warning node -let media_kind_of_target = +let media_kind = let open Tokens in function Audio -> `Audio | Video -> `Video | Image -> `Image diff --git a/src/parser/tokens.ml b/src/parser/tokens.ml index be2e2b4dae..be327b1117 100644 --- a/src/parser/tokens.ml +++ b/src/parser/tokens.ml @@ -1,8 +1,5 @@ type ref_kind = Simple | With_replacement -type media = Reference of string | Link of string -type media_target = Audio | Video | Image - type alignment = Left | Center | Right type style = Bold | Italic | Emphasis | Superscript | Subscript @@ -37,8 +34,8 @@ type token = | Space of string | Single_newline of string | Blank_line of string - | Simple_ref of string with_start_point - | Ref_with_replacement of string with_start_point + | Simple_ref of string Loc.with_location with_start_point + | Ref_with_replacement of string Loc.with_location with_start_point | Simple_link of string with_start_point | Link_with_replacement of string with_start_point | MODULES @@ -90,6 +87,10 @@ and tag_with_content = | TOC_STATUS | ORDER_CATEGORY | SHORT_TITLE +and media = + | Reference of string Loc.with_location + | Link of string Loc.with_location +and media_target = Audio | Video | Image let to_ref : internal_reference -> [ `Url | `File | `Document ] = function | URL -> `Url @@ -304,6 +305,17 @@ let to_ast_style : style -> Ast.style = function | Superscript -> `Superscript | Subscript -> `Subscript +let dummy_ref inner = + { + inner = + Loc. + { + value = inner; + location = { start = Loc.dummy_pos; end_ = Loc.dummy_pos; file = "" }; + }; + start = Loc.dummy_pos; + } + let describe_inline : Ast.inline_element -> string = function | `Word w -> describe @@ Word w | `Space _ -> describe @@ Space "" @@ -315,22 +327,35 @@ let describe_inline : Ast.inline_element -> string = function describe @@ Simple_link { inner; start = Loc.dummy_pos } | `Link (inner, _ :: _) -> describe @@ Link_with_replacement { inner; start = Loc.dummy_pos } - | `Reference (`Simple, { value = inner; _ }, _) -> - describe @@ Simple_ref { inner; start = Loc.dummy_pos } - | `Reference (`With_text, { value = inner; _ }, _) -> - describe @@ Ref_with_replacement { inner; start = Loc.dummy_pos } + | `Reference (`Simple, { value; _ }, _) -> + describe @@ Simple_ref (dummy_ref value) + | `Reference (`With_text, { value; _ }, _) -> + describe @@ Ref_with_replacement (dummy_ref value) -let of_href = function `Reference s -> Reference s | `Link s -> Link s +let of_href = + let open Loc in + function + | { value = `Reference value; location } -> Reference { value; location } + | { value = `Link value; location } -> Link { value; location } let of_media_kind = function | `Audio -> Audio | `Image -> Image | `Video -> Video -let of_media = function - | `Media (_, Loc.{ value; _ }, _, kind) -> +let of_media : + [> `Media of + Ast.reference_kind + * Ast.media_href Loc.with_location + * string + * Ast.media ] -> + token = function + | `Media (_, href, _, media_kind) -> Media - { inner = (of_href value, of_media_kind kind); start = Loc.dummy_pos } + { + inner = (of_href href, of_media_kind media_kind); + start = Loc.dummy_pos; + } (* NOTE: Fix list *) let describe_nestable_block : Ast.nestable_block_element -> string = function From 0fff83f05aad47a7a786655862956195031e4ee6 Mon Sep 17 00:00:00 2001 From: faycarsons Date: Tue, 14 Jan 2025 15:44:31 -0500 Subject: [PATCH 135/150] Fix paragraph splitting --- src/parser/lexer.mll | 4 ++++ src/parser/parser.mly | 2 +- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/src/parser/lexer.mll b/src/parser/lexer.mll index 24eabe9744..4edbb95026 100644 --- a/src/parser/lexer.mll +++ b/src/parser/lexer.mll @@ -332,6 +332,10 @@ let or_insert_lazy default = | None -> Some (default ()) | x -> x +(* This is an attempt to prevent having to compute the lexeme start for every + character in `reference_content`/`reference_paren_content` + Unsure if it's actually useful or worth keeping +*) let ensure_lexeme_start lexbuf = or_insert_lazy (Fun.const @@ Lexing.lexeme_start lexbuf) diff --git a/src/parser/parser.mly b/src/parser/parser.mly index 08dad200bb..585253ed20 100644 --- a/src/parser/parser.mly +++ b/src/parser/parser.mly @@ -729,7 +729,7 @@ let symbols := | BAR; { "|" } | ~ = symbols_without_bar; <> let paragraph_middle_element := - | ~ = inline_element(horizontal_whitespace); <> + | ~ = inline_element(whitespace); <> | s = located(symbols); { return @@ Loc.map (fun w -> `Word w) s } let paragraph := horizontal_whitespace?; x = inline_element_without_whitespace; xs = sequence(paragraph_middle_element); { From cf9f8497cdadea815fa457a2630c22d7f9d61411 Mon Sep 17 00:00:00 2001 From: faycarsons Date: Tue, 14 Jan 2025 16:15:09 -0500 Subject: [PATCH 136/150] Parameterize nestable block elements over legal paragraphs --- src/parser/parser.mly | 40 +++++++++++++++++++++++++++------------- 1 file changed, 27 insertions(+), 13 deletions(-) diff --git a/src/parser/parser.mly b/src/parser/parser.mly index 585253ed20..9df00e42e9 100644 --- a/src/parser/parser.mly +++ b/src/parser/parser.mly @@ -117,7 +117,7 @@ let main := | any_whitespace*; END; { return [] } let toplevel := - | block = nestable_block_element; { (block :> Ast.block_element Loc.with_location Writer.t) } + | block = nestable_block_element(paragraph); { (block :> Ast.block_element Loc.with_location Writer.t) } | t = tag; { Writer.map ~f:(fun loc -> Loc.{ loc with value = `Tag loc.value }) t } | ~ = section_heading; <> | ~ = toplevel_error; <> @@ -217,7 +217,7 @@ let tag := | with_content = tag_with_content; line_break?; { with_content } | bare = tag_bare; line_break?; { bare } -let tag_with_content := tag = located(Tag_with_content); children = sequence(nestable_block_element); { +let tag_with_content := tag = located(Tag_with_content); children = sequence(nestable_block_element(paragraph)); { Writer.map children ~f:(fun children -> let Loc.{ value; location } = tag in let start = Tokens.tag_with_content_start_point value |> Option.map (fun start -> { location with start }) |> Option.value ~default:location in @@ -400,12 +400,19 @@ let list_light_start := | MINUS; { Tokens.MINUS } | PLUS; { Tokens.PLUS } +let light_list_paragraph_item := + | ~ = inline_element(whitespace); <> + | ~ = symbol_as_word(bar); <> +let paragraph_no_list_symbols := horizontal_whitespace?; x = inline_element_without_whitespace; xs = sequence(light_list_paragraph_item); { + paragraph <$> Writer.map2 ~f:List.cons x xs +} + let list_light_item := - | start = located(list_light_start); horizontal_whitespace?; item = nestable_block_element; { + | start = located(list_light_start); horizontal_whitespace?; item = nestable_block_element(paragraph_no_list_symbols); { let Loc.{ value; location } = start in light_list_item value <$> (Loc.with_start_location location <$> item) } - | horizontal_whitespace; start = located(list_light_start); item = nestable_block_element; { + | horizontal_whitespace; start = located(list_light_start); item = nestable_block_element(paragraph_no_list_symbols); { let should_begin_on_its_own_line = let span = Loc.of_position $sloc in Writer.Warning (Parse_error.should_begin_on_its_own_line ~what:(Tokens.describe MINUS) span) @@ -431,7 +438,7 @@ let list_opening := | DASH; { Tokens.DASH } let item_heavy := - | startpos = located(list_opening); items = sequence(nestable_block_element); endpos = located(RIGHT_BRACE); any_whitespace*; { + | startpos = located(list_opening); items = sequence(nestable_block_element(paragraph)); endpos = located(RIGHT_BRACE); any_whitespace*; { let span = Loc.delimited startpos endpos in let should_be_followed_by_whitespace = Writer.Warning (Parse_error.should_be_followed_by_whitespace ~what:(Tokens.describe LI) span) @@ -442,7 +449,7 @@ let item_heavy := Writer.ensure not_empty should_not_be_empty items |> Writer.warning should_be_followed_by_whitespace } - | startpos = located(list_opening); items = sequence_nonempty(nestable_block_element)?; endpos = located(END); { + | startpos = located(list_opening); items = sequence_nonempty(nestable_block_element(paragraph))?; endpos = located(END); { let end_not_allowed = Writer.Warning (Parse_error.end_not_allowed ~in_what:(Tokens.describe DASH) endpos.Loc.location) in @@ -506,11 +513,11 @@ let odoc_list := (* TABLES *) let cell_heavy := - | cell_kind = Table_cell; children = sequence_nonempty(nestable_block_element); RIGHT_BRACE; whitespace*; + | cell_kind = Table_cell; children = sequence_nonempty(nestable_block_element(paragraph)); RIGHT_BRACE; whitespace*; { Writer.map ~f:(fun c -> (c, cell_kind)) children } | cell_kind = Table_cell; RIGHT_BRACE; whitespace*; { return ([], cell_kind) } - | cell_kind = Table_cell; children = sequence_nonempty(nestable_block_element)?; errloc = position(error); { + | cell_kind = Table_cell; children = sequence_nonempty(nestable_block_element(paragraph))?; errloc = position(error); { let illegal = Writer.InputNeeded (fun input -> let (start_pos, end_pos) as loc = errloc in let illegal_input = Loc.extract ~input ~start_pos ~end_pos in @@ -682,8 +689,8 @@ let media := (* TOP-LEVEL ELEMENTS *) -let nestable_block_element := ~ = nestable_block_element_inner; any_whitespace?; <> -let nestable_block_element_inner := +let nestable_block_element(paragraph) := ~ = nestable_block_element_inner(paragraph); any_whitespace?; <> +let nestable_block_element_inner(paragraph) := | ~ = verbatim; <> | ~ = code_block; <> | ~ = odoc_list; <> @@ -722,20 +729,27 @@ let verbatim := verbatim = located(Verbatim); { *> return verbatim } +(* Split so that we can exclude bar in i.e. light tables *) let symbols_without_bar := | PLUS; { "+" } | MINUS; { "-" } -let symbols := +let bar := | BAR; { "|" } + +let symbols := | ~ = symbols_without_bar; <> + | ~ = bar; <> +let symbol_as_word(symbols) == s = located(symbols); { return @@ Loc.map (fun w -> `Word w) s } let paragraph_middle_element := | ~ = inline_element(whitespace); <> - | s = located(symbols); { return @@ Loc.map (fun w -> `Word w) s } + | ~ = symbol_as_word(symbols); <> + let paragraph := horizontal_whitespace?; x = inline_element_without_whitespace; xs = sequence(paragraph_middle_element); { paragraph <$> Writer.map2 ~f:List.cons x xs } + let code_block := | content = located(Code_block); { let Loc.{ value = Tokens.{ inner; start }; location } = content in @@ -744,7 +758,7 @@ let code_block := let node = `Code_block Ast.{ meta; delimiter; content; output = None } in return @@ Loc.at { location with start } node } - | content = located(Code_block_with_output); output = sequence_nonempty(nestable_block_element); RIGHT_CODE_DELIMITER; { + | content = located(Code_block_with_output); output = sequence_nonempty(nestable_block_element(paragraph)); RIGHT_CODE_DELIMITER; { let* output = Option.some <$> output in let Loc.{ value = Tokens.{ inner; start }; location } = content in let Tokens.{ metadata; delimiter; content } = inner in From 95431688ac76fbb94946bbfedfb197a6eb967114 Mon Sep 17 00:00:00 2001 From: faycarsons Date: Wed, 15 Jan 2025 12:40:33 -0500 Subject: [PATCH 137/150] Add blank line delimiter for tags with block element content --- src/parser/parser.mly | 31 ++++++++++++++++++++----------- 1 file changed, 20 insertions(+), 11 deletions(-) diff --git a/src/parser/parser.mly b/src/parser/parser.mly index 9df00e42e9..f1fb05b06f 100644 --- a/src/parser/parser.mly +++ b/src/parser/parser.mly @@ -214,10 +214,10 @@ let section_heading := (* TAGS *) let tag := - | with_content = tag_with_content; line_break?; { with_content } - | bare = tag_bare; line_break?; { bare } + | with_content = tag_with_content; { with_content } + | bare = tag_bare; { bare } -let tag_with_content := tag = located(Tag_with_content); children = sequence(nestable_block_element(paragraph)); { +let tag_with_content := tag = located(Tag_with_content); children = sequence(nestable_block_element(paragraph)); Blank_line?; { Writer.map children ~f:(fun children -> let Loc.{ value; location } = tag in let start = Tokens.tag_with_content_start_point value |> Option.map (fun start -> { location with start }) |> Option.value ~default:location in @@ -227,11 +227,24 @@ let tag_with_content := tag = located(Tag_with_content); children = sequence(nes | tag = located(Tag_with_content); horizontal_whitespace?; { return @@ { tag with Loc.value = Tokens.tag_with_content [] tag.Loc.value } } - - let tag_bare := tag = located(Tag); horizontal_whitespace?; { - return @@ Loc.map (Fun.const @@ Tokens.tag_bare tag) tag + (* NOTE: (@FayCarsons) Right now this is the only way to accept a newline + after a tag_with_content, adding an optional newline causes unsolvable + reduce conflicts. + Maybe if the line break/whitespace handling for nestable block element were + refactored, we could remove this + *) + | tag = located(Tag_with_content); Single_newline; children = sequence(nestable_block_element(paragraph)); Blank_line?; { + Writer.map children ~f:(fun children -> + let Loc.{ value; location } = tag in + let start = Tokens.tag_with_content_start_point value |> Option.map (fun start -> { location with start }) |> Option.value ~default:location in + let span = Loc.span @@ start :: List.map Loc.location children in + Loc.at span @@ Tokens.tag_with_content children value) } +let tag_bare := tag = located(Tag); horizontal_whitespace?; { + return @@ Loc.map (Fun.const @@ Tokens.tag_bare tag) tag +} + (* INLINE ELEMENTS *) let inline_element(ws) := @@ -440,14 +453,10 @@ let list_opening := let item_heavy := | startpos = located(list_opening); items = sequence(nestable_block_element(paragraph)); endpos = located(RIGHT_BRACE); any_whitespace*; { let span = Loc.delimited startpos endpos in - let should_be_followed_by_whitespace = - Writer.Warning (Parse_error.should_be_followed_by_whitespace ~what:(Tokens.describe LI) span) - in let should_not_be_empty = Writer.Warning (Parse_error.should_not_be_empty ~what:(Tokens.describe LI) span) in Writer.ensure not_empty should_not_be_empty items - |> Writer.warning should_be_followed_by_whitespace } | startpos = located(list_opening); items = sequence_nonempty(nestable_block_element(paragraph))?; endpos = located(END); { let end_not_allowed = @@ -689,7 +698,7 @@ let media := (* TOP-LEVEL ELEMENTS *) -let nestable_block_element(paragraph) := ~ = nestable_block_element_inner(paragraph); any_whitespace?; <> +let nestable_block_element(paragraph) := ~ = nestable_block_element_inner(paragraph); whitespace?; <> let nestable_block_element_inner(paragraph) := | ~ = verbatim; <> | ~ = code_block; <> From 46c4d0a63d32efee669a438c769ecfae232ed247 Mon Sep 17 00:00:00 2001 From: faycarsons Date: Wed, 15 Jan 2025 14:24:59 -0500 Subject: [PATCH 138/150] Fix heavy list whitespace handling --- src/parser/TODO.md | 1 + src/parser/dune | 2 +- src/parser/parser.mly | 49 +++++++++++++++++++------------------------ 3 files changed, 24 insertions(+), 28 deletions(-) create mode 100644 src/parser/TODO.md diff --git a/src/parser/TODO.md b/src/parser/TODO.md new file mode 100644 index 0000000000..10ca818b43 --- /dev/null +++ b/src/parser/TODO.md @@ -0,0 +1 @@ +- Some locations are p diff --git a/src/parser/dune b/src/parser/dune index 902a6a576b..69d306aed1 100644 --- a/src/parser/dune +++ b/src/parser/dune @@ -2,7 +2,7 @@ (menhir (modules parser) - (flags --table --external-tokens Tokens --explain )) + (flags --table --external-tokens Tokens --explain)) (library (name odoc_parser) diff --git a/src/parser/parser.mly b/src/parser/parser.mly index f1fb05b06f..1e43ac049a 100644 --- a/src/parser/parser.mly +++ b/src/parser/parser.mly @@ -95,9 +95,9 @@ let sequence_separated_nonempty(sep, rule) := xs = separated_nonempty_list(sep, let sequence_separated(sep, rule) := xs = separated_list(sep, rule); { Writer.sequence xs } (* WHITESPACE *) -let horizontal_whitespace := ~ = Space; <`Space> +let horizontal_whitespace == ~ = Space; <`Space> -let whitespace := +let whitespace == | ~ = horizontal_whitespace; <> | ~ = Single_newline; <`Space> @@ -105,7 +105,7 @@ let any_whitespace := | ~ = whitespace; <> | ~ = Blank_line; <`Space> -let line_break := +let line_break == | ~ = Single_newline; <> | ~ = Blank_line; <> @@ -117,15 +117,15 @@ let main := | any_whitespace*; END; { return [] } let toplevel := - | block = nestable_block_element(paragraph); { (block :> Ast.block_element Loc.with_location Writer.t) } - | t = tag; { Writer.map ~f:(fun loc -> Loc.{ loc with value = `Tag loc.value }) t } + | block = nestable_block_element(paragraph); any_whitespace?; { (block :> Ast.block_element Loc.with_location Writer.t) } + | t = tag; line_break?; { Writer.map ~f:(fun loc -> Loc.{ loc with value = `Tag loc.value }) t } | ~ = section_heading; <> | ~ = toplevel_error; <> (* Tokens which cannot begin any block element *) let toplevel_error := (* Stray heavy list items, `{li` or `{-` *) - | err = located(list_opening); children = sequence_nonempty(inline_element(horizontal_whitespace)); endpos = located(RIGHT_BRACE)?; { + | err = located(item_open); children = sequence_nonempty(inline_element(horizontal_whitespace)); endpos = located(RIGHT_BRACE)?; { let default = Writer.get children |> List.rev @@ -217,7 +217,7 @@ let tag := | with_content = tag_with_content; { with_content } | bare = tag_bare; { bare } -let tag_with_content := tag = located(Tag_with_content); children = sequence(nestable_block_element(paragraph)); Blank_line?; { +let tag_with_content := tag = located(Tag_with_content); children = sequence(nestable_block_element(paragraph)); { Writer.map children ~f:(fun children -> let Loc.{ value; location } = tag in let start = Tokens.tag_with_content_start_point value |> Option.map (fun start -> { location with start }) |> Option.value ~default:location in @@ -233,7 +233,7 @@ let tag_with_content := tag = located(Tag_with_content); children = sequence(nes Maybe if the line break/whitespace handling for nestable block element were refactored, we could remove this *) - | tag = located(Tag_with_content); Single_newline; children = sequence(nestable_block_element(paragraph)); Blank_line?; { + | tag = located(Tag_with_content); Single_newline; children = sequence(nestable_block_element(paragraph)); { Writer.map children ~f:(fun children -> let Loc.{ value; location } = tag in let start = Tokens.tag_with_content_start_point value |> Option.map (fun start -> { location with start }) |> Option.value ~default:location in @@ -446,19 +446,19 @@ let list_light := |> return) } -let list_opening := +let item_open := | LI; { Tokens.LI } | DASH; { Tokens.DASH } let item_heavy := - | startpos = located(list_opening); items = sequence(nestable_block_element(paragraph)); endpos = located(RIGHT_BRACE); any_whitespace*; { +| startpos = located(item_open); any_whitespace*; items = sequence(nestable_block_element(paragraph)); any_whitespace*; endpos = located(RIGHT_BRACE); { let span = Loc.delimited startpos endpos in let should_not_be_empty = Writer.Warning (Parse_error.should_not_be_empty ~what:(Tokens.describe LI) span) in Writer.ensure not_empty should_not_be_empty items } - | startpos = located(list_opening); items = sequence_nonempty(nestable_block_element(paragraph))?; endpos = located(END); { + | startpos = located(item_open); any_whitespace*; items = sequence(nestable_block_element(paragraph))?; any_whitespace*; endpos = located(END); { let end_not_allowed = Writer.Warning (Parse_error.end_not_allowed ~in_what:(Tokens.describe DASH) endpos.Loc.location) in @@ -475,24 +475,20 @@ let item_heavy := } let list_heavy := - | list_kind = located(List); whitespace?; items = sequence_nonempty(item_heavy); endpos = located(RIGHT_BRACE); { - let span = Loc.delimited list_kind endpos in - Writer.bind items ~f:(fun items -> - `List (Tokens.ast_list_kind list_kind.Loc.value, `Heavy, items) - |> Loc.at span - |> return) - } - | list_kind = located(List); endpos = located(RIGHT_BRACE); { + | list_kind = located(List); whitespace*; items = sequence(item_heavy); whitespace*; endpos = located(RIGHT_BRACE); { let span = Loc.delimited list_kind endpos in let should_not_be_empty = let what = Tokens.describe @@ List list_kind.Loc.value in Writer.Warning (Parse_error.should_not_be_empty ~what span) in - let node = Loc.at span @@ `List (Tokens.ast_list_kind list_kind.Loc.value, `Heavy, []) in - Writer.return_warning node should_not_be_empty + Writer.ensure not_empty should_not_be_empty items + |> Writer.bind ~f:(fun items -> + `List (Tokens.ast_list_kind list_kind.Loc.value, `Heavy, items) + |> Loc.at span + |> return) } - | list_kind = located(List); whitespace?; items = sequence_nonempty(item_heavy); errloc = position(error); { - let span = Loc.(span [list_kind.location; Loc.of_position errloc]) in + | list_kind = located(List); whitespace*; items = sequence_nonempty(item_heavy); errloc = position(error); { + let span = Loc.(span [list_kind.location; of_position errloc]) in let illegal = Writer.InputNeeded (fun input -> let (start_pos, end_pos) = errloc in let illegal_input = Loc.extract ~input ~start_pos ~end_pos in @@ -503,8 +499,8 @@ let list_heavy := let inner = Loc.at span @@ `List (Tokens.ast_list_kind list_kind.Loc.value, `Heavy, items) in return inner } - | list_kind = located(List); whitespace?; errloc = position(error); { - let span = Loc.(span [list_kind.location; Loc.of_position errloc]) in + | list_kind = located(List); errloc = position(error); { + let span = Loc.(span [list_kind.location; of_position errloc]) in let illegal = Writer.InputNeeded (fun input -> let (start_pos, end_pos) = errloc in let illegal_input = Loc.extract ~input ~start_pos ~end_pos in @@ -698,7 +694,7 @@ let media := (* TOP-LEVEL ELEMENTS *) -let nestable_block_element(paragraph) := ~ = nestable_block_element_inner(paragraph); whitespace?; <> +let nestable_block_element(paragraph) := ~ = nestable_block_element_inner(paragraph); <> let nestable_block_element_inner(paragraph) := | ~ = verbatim; <> | ~ = code_block; <> @@ -758,7 +754,6 @@ let paragraph := horizontal_whitespace?; x = inline_element_without_whitespace; paragraph <$> Writer.map2 ~f:List.cons x xs } - let code_block := | content = located(Code_block); { let Loc.{ value = Tokens.{ inner; start }; location } = content in From 2f64b98a8afe5f799bead519ee1b482b072e525e Mon Sep 17 00:00:00 2001 From: faycarsons Date: Wed, 15 Jan 2025 16:26:46 -0500 Subject: [PATCH 139/150] fix light list locations --- src/parser/parser.mly | 100 +++++++++++++++++++-------------------- src/parser/parser_aux.ml | 44 +++++++++++++---- 2 files changed, 85 insertions(+), 59 deletions(-) diff --git a/src/parser/parser.mly b/src/parser/parser.mly index 1e43ac049a..787ca108cf 100644 --- a/src/parser/parser.mly +++ b/src/parser/parser.mly @@ -68,47 +68,6 @@ %start main %% - -(* UTILITIES *) - -(* Utilities which wraps the return value of a rule in `Loc.with_location` *) -let locatedM(rule) == inner = rule; { wrap_location $sloc <$> inner } -let located(rule) == inner = rule; { wrap_location $sloc inner } -let delimited_location(opening, rule, closing) := startpos = located(opening); inner = rule; endpos = located(closing); { - let span = Loc.delimited startpos endpos in - Loc.at span inner -} - -(* When we have to handle errors with Menhir's `error` token, we need - `Lexing.position` as opposed to `Loc.with_location`. Because we can cleanly - take a slice of the input text with those positions, which would be - difficult using `Loc.with_location` *) -let with_position(rule) == inner = rule; { (inner, $sloc) } -let position(rule) == _ = rule; { $sloc } - - -(* Wrappers around Menhir's built-in utilities that make working inside the - Writer.t monad easier *) -let sequence(rule) == xs = list(rule); { Writer.sequence xs } -let sequence_nonempty(rule) == xs = nonempty_list(rule); { Writer.sequence xs } -let sequence_separated_nonempty(sep, rule) := xs = separated_nonempty_list(sep, rule); { Writer.sequence xs } -let sequence_separated(sep, rule) := xs = separated_list(sep, rule); { Writer.sequence xs } - -(* WHITESPACE *) -let horizontal_whitespace == ~ = Space; <`Space> - -let whitespace == - | ~ = horizontal_whitespace; <> - | ~ = Single_newline; <`Space> - -let any_whitespace := - | ~ = whitespace; <> - | ~ = Blank_line; <`Space> - -let line_break == - | ~ = Single_newline; <> - | ~ = Blank_line; <> - (* ENTRY *) (* Consume any leading whitespace *) @@ -423,7 +382,9 @@ let paragraph_no_list_symbols := horizontal_whitespace?; x = inline_element_with let list_light_item := | start = located(list_light_start); horizontal_whitespace?; item = nestable_block_element(paragraph_no_list_symbols); { let Loc.{ value; location } = start in - light_list_item value <$> (Loc.with_start_location location <$> item) + Writer.map ~f:(fun item -> + { item with Loc.location = Loc.span [location; item.Loc.location] } + ) (light_list_item value <$> item) } | horizontal_whitespace; start = located(list_light_start); item = nestable_block_element(paragraph_no_list_symbols); { let should_begin_on_its_own_line = @@ -431,19 +392,17 @@ let list_light_item := Writer.Warning (Parse_error.should_begin_on_its_own_line ~what:(Tokens.describe MINUS) span) in let Loc.{ value; location } = start in - Writer.map ~f:(Loc.with_start_location location) item - |> Writer.map ~f:(light_list_item value) + Writer.map ~f:(fun item -> + { (light_list_item value item) with Loc.location = Loc.span [location; item.Loc.location] } + ) item |> Writer.warning should_begin_on_its_own_line } let list_light := | children = sequence_separated_nonempty(whitespace*, list_light_item); { - Writer.bind children ~f:(fun c -> - let (list_kind, c) = split_light_list_items c in - let span = Loc.span @@ List.map Loc.location c in - `List (list_kind, `Light, [ c ]) - |> Loc.at span - |> return) + Writer.map children ~f:(fun children -> + let Loc.{ value = (list_kind, children); location } = split_light_list_items children in + Loc.at location @@ `List (list_kind, `Light, [ children ])) } let item_open := @@ -836,3 +795,44 @@ let modules := startpos = located(MODULES); modules = sequence(inline_element(wh Writer.return_warning inner not_allowed |> Writer.warning unexpected_end) } + + +(* UTILITIES *) + +(* Utilities which wraps the return value of a rule in `Loc.with_location` *) +let locatedM(rule) == inner = rule; { wrap_location $sloc <$> inner } +let located(rule) == inner = rule; { wrap_location $sloc inner } +let delimited_location(opening, rule, closing) := startpos = located(opening); inner = rule; endpos = located(closing); { + let span = Loc.delimited startpos endpos in + Loc.at span inner +} + +(* When we have to handle errors with Menhir's `error` token, we need + `Lexing.position` as opposed to `Loc.with_location`. Because we can cleanly + take a slice of the input text with those positions, which would be + difficult using `Loc.with_location` *) +let with_position(rule) == inner = rule; { (inner, $sloc) } +let position(rule) == _ = rule; { $sloc } + + +(* Wrappers around Menhir's built-in utilities that make working inside the + Writer.t monad easier *) +let sequence(rule) == xs = list(rule); { Writer.sequence xs } +let sequence_nonempty(rule) == xs = nonempty_list(rule); { Writer.sequence xs } +let sequence_separated_nonempty(sep, rule) := xs = separated_nonempty_list(sep, rule); { Writer.sequence xs } +let sequence_separated(sep, rule) := xs = separated_list(sep, rule); { Writer.sequence xs } + +(* WHITESPACE *) +let horizontal_whitespace == ~ = Space; <`Space> + +let whitespace == + | ~ = horizontal_whitespace; <> + | ~ = Single_newline; <`Space> + +let any_whitespace := + | ~ = whitespace; <> + | ~ = Blank_line; <`Space> + +let line_break == + | ~ = Single_newline; <> + | ~ = Blank_line; <> diff --git a/src/parser/parser_aux.ml b/src/parser/parser_aux.ml index 9f42128620..e290f68e3e 100644 --- a/src/parser/parser_aux.ml +++ b/src/parser/parser_aux.ml @@ -170,19 +170,45 @@ let legal_module_list : Ast.inline_element Loc.with_location list -> bool = && List.for_all (function `Word _ | `Space _ -> true | _ -> false) @@ List.map Loc.value xs -let light_list_item start item = +let light_list_item : + Tokens.token -> + Ast.nestable_block_element Loc.with_location -> + [< `Ordered of Ast.nestable_block_element + | `Unordered of Ast.nestable_block_element ] + Loc.with_location = + fun start -> let open Tokens in - match start with - | MINUS -> `Unordered item - | PLUS -> `Ordered item - | _ -> assert false (* unreachable *) + Loc.map (fun item -> + match start with + | MINUS -> `Unordered item + | PLUS -> `Ordered item + | _ -> assert false (* unreachable *)) let or_insert = function None -> Option.some | o -> Fun.const o -let split_light_list_items items = +let split_light_list_items : + [< `Ordered of Ast.nestable_block_element + | `Unordered of Ast.nestable_block_element ] + Loc.with_location + list -> + ([< `Ordered | `Unordered ] + * Ast.nestable_block_element Loc.with_location list) + Loc.with_location = + fun items -> let rec go acc list_kind = function - | `Ordered x :: xs -> go (x :: acc) (or_insert list_kind `Ordered) xs - | `Unordered x :: xs -> go (x :: acc) (or_insert list_kind `Unordered) xs + | Loc.{ value = `Ordered x; location } :: xs -> + go + ((Loc.at location x, location) :: acc) + (or_insert list_kind `Ordered) + xs + | Loc.{ value = `Unordered x; location } :: xs -> + go + ((Loc.at location x, location) :: acc) + (or_insert list_kind `Unordered) + xs | [] -> (Option.get list_kind, List.rev acc) in - go [] None items + let list_kind, xs = go [] None items in + let elements, spans = List.split xs in + let location = Loc.span spans in + { Loc.value = (list_kind, elements); location } From 3e9188ebece38b0bf73bf928a3131f93540f2613 Mon Sep 17 00:00:00 2001 From: faycarsons Date: Wed, 15 Jan 2025 16:33:45 -0500 Subject: [PATCH 140/150] Actually(!) fix light list item locations --- src/parser/parser.mly | 12 +++++++----- src/parser/parser_aux.ml | 21 ++++++--------------- 2 files changed, 13 insertions(+), 20 deletions(-) diff --git a/src/parser/parser.mly b/src/parser/parser.mly index 787ca108cf..7e3a49d218 100644 --- a/src/parser/parser.mly +++ b/src/parser/parser.mly @@ -383,7 +383,7 @@ let list_light_item := | start = located(list_light_start); horizontal_whitespace?; item = nestable_block_element(paragraph_no_list_symbols); { let Loc.{ value; location } = start in Writer.map ~f:(fun item -> - { item with Loc.location = Loc.span [location; item.Loc.location] } + (Loc.span [location; item.Loc.location], item) ) (light_list_item value <$> item) } | horizontal_whitespace; start = located(list_light_start); item = nestable_block_element(paragraph_no_list_symbols); { @@ -393,16 +393,18 @@ let list_light_item := in let Loc.{ value; location } = start in Writer.map ~f:(fun item -> - { (light_list_item value item) with Loc.location = Loc.span [location; item.Loc.location] } - ) item + (Loc.span [location; item.Loc.location], item) + ) (light_list_item value <$> item) |> Writer.warning should_begin_on_its_own_line } let list_light := | children = sequence_separated_nonempty(whitespace*, list_light_item); { Writer.map children ~f:(fun children -> - let Loc.{ value = (list_kind, children); location } = split_light_list_items children in - Loc.at location @@ `List (list_kind, `Light, [ children ])) + let spans, children = List.split children in + let span = Loc.span spans in + let list_kind, children = split_light_list_items children in + Loc.at span @@ `List (list_kind, `Light, [ children ])) } let item_open := diff --git a/src/parser/parser_aux.ml b/src/parser/parser_aux.ml index e290f68e3e..3abbf4f67b 100644 --- a/src/parser/parser_aux.ml +++ b/src/parser/parser_aux.ml @@ -191,24 +191,15 @@ let split_light_list_items : | `Unordered of Ast.nestable_block_element ] Loc.with_location list -> - ([< `Ordered | `Unordered ] - * Ast.nestable_block_element Loc.with_location list) - Loc.with_location = + [< `Ordered | `Unordered ] + * Ast.nestable_block_element Loc.with_location list = fun items -> let rec go acc list_kind = function | Loc.{ value = `Ordered x; location } :: xs -> - go - ((Loc.at location x, location) :: acc) - (or_insert list_kind `Ordered) - xs + go (Loc.at location x :: acc) (or_insert list_kind `Ordered) xs | Loc.{ value = `Unordered x; location } :: xs -> - go - ((Loc.at location x, location) :: acc) - (or_insert list_kind `Unordered) - xs + go (Loc.at location x :: acc) (or_insert list_kind `Unordered) xs | [] -> (Option.get list_kind, List.rev acc) in - let list_kind, xs = go [] None items in - let elements, spans = List.split xs in - let location = Loc.span spans in - { Loc.value = (list_kind, elements); location } + let list_kind, elements = go [] None items in + (list_kind, elements) From cd67627f206ab5e7d5180396c4e535eed558df15 Mon Sep 17 00:00:00 2001 From: faycarsons Date: Thu, 16 Jan 2025 10:52:34 -0500 Subject: [PATCH 141/150] Fix top-level line-break handling --- src/parser/parser.mly | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/parser/parser.mly b/src/parser/parser.mly index 7e3a49d218..f0e62f62f4 100644 --- a/src/parser/parser.mly +++ b/src/parser/parser.mly @@ -78,8 +78,8 @@ let main := let toplevel := | block = nestable_block_element(paragraph); any_whitespace?; { (block :> Ast.block_element Loc.with_location Writer.t) } | t = tag; line_break?; { Writer.map ~f:(fun loc -> Loc.{ loc with value = `Tag loc.value }) t } - | ~ = section_heading; <> - | ~ = toplevel_error; <> + | ~ = section_heading; line_break?; <> + | ~ = toplevel_error; line_break?; <> (* Tokens which cannot begin any block element *) let toplevel_error := From 82f012648502814107afa5595f66fcbb46122f58 Mon Sep 17 00:00:00 2001 From: faycarsons Date: Thu, 16 Jan 2025 12:18:15 -0500 Subject: [PATCH 142/150] improve light list error handling --- src/parser/parser.mly | 59 ++++++++++++++++++++++++++++++++++++++----- 1 file changed, 53 insertions(+), 6 deletions(-) diff --git a/src/parser/parser.mly b/src/parser/parser.mly index f0e62f62f4..b1dd1795b2 100644 --- a/src/parser/parser.mly +++ b/src/parser/parser.mly @@ -65,8 +65,13 @@ %type tag_with_content %type tag +%on_error_reduce nestable_block_element(paragraph) +%on_error_reduce tag_with_content +%on_error_reduce section_heading + %start main + %% (* ENTRY *) @@ -76,10 +81,10 @@ let main := | any_whitespace*; END; { return [] } let toplevel := - | block = nestable_block_element(paragraph); any_whitespace?; { (block :> Ast.block_element Loc.with_location Writer.t) } + | block = nestable_block_element(paragraph); any_whitespace*; { (block :> Ast.block_element Loc.with_location Writer.t) } | t = tag; line_break?; { Writer.map ~f:(fun loc -> Loc.{ loc with value = `Tag loc.value }) t } - | ~ = section_heading; line_break?; <> - | ~ = toplevel_error; line_break?; <> + | ~ = section_heading; line_break*; <> + | ~ = toplevel_error; line_break*; <> (* Tokens which cannot begin any block element *) let toplevel_error := @@ -142,7 +147,7 @@ let section_heading := | content = Section_heading; children = sequence_nonempty(inline_element(whitespace)); endpos = located(RIGHT_BRACE); { let Tokens.{ inner = (num, title); start } = content in let span = { endpos.Loc.location with start } in - Writer.map ~f:(fun c -> Loc.at span @@ `Heading (num, title, trim_start c)) children + Writer.map ~f:(fun c -> Loc.at span @@ `Heading (num, title, c)) children } | content = Section_heading; endpos = located(RIGHT_BRACE); { let Tokens.{ inner = (num, title); start } = content in @@ -190,8 +195,7 @@ let tag_with_content := tag = located(Tag_with_content); children = sequence(nes after a tag_with_content, adding an optional newline causes unsolvable reduce conflicts. Maybe if the line break/whitespace handling for nestable block element were - refactored, we could remove this - *) + refactored, we could remove this *) | tag = located(Tag_with_content); Single_newline; children = sequence(nestable_block_element(paragraph)); { Writer.map children ~f:(fun children -> let Loc.{ value; location } = tag in @@ -406,6 +410,49 @@ let list_light := let list_kind, children = split_light_list_items children in Loc.at span @@ `List (list_kind, `Light, [ children ])) } + | children = sequence_separated_nonempty(whitespace*, list_light_item); errpos = position(error); { + Writer.bind children ~f:(fun children -> + let spans, children = List.split children in + + let start_pos, end_pos = errpos in + let errloc = Loc.of_position errpos in + let span = Loc.span (spans @ [errloc]) in + let list_kind, children = split_light_list_items children in + + let illegal = Writer.InputNeeded (fun input -> + let error_text = Loc.extract ~input ~start_pos ~end_pos in + let in_what = Tokens.describe @@ + match list_kind with `Ordered -> PLUS | `Unordered -> MINUS + in + Parse_error.illegal ~in_what error_text span) + in + `List (list_kind, `Light, [ children ]) + |> Loc.at span + |> return + |> Writer.warning illegal) + } + | start = located(list_light_start); horizontal_whitespace?; errpos = position(error); { + let Loc.{ value; location } = start in + let list_kind = + match value with + | PLUS -> `Ordered + | MINUS -> `Unordered + | _ -> assert false (* unreachable *) + in + let errloc = Loc.of_position errpos in + let span = Loc.span [location; errloc] in + let illegal = Writer.InputNeeded (fun input -> + let (start_pos, end_pos) = errpos in + let error_text = Loc.extract ~input ~start_pos ~end_pos in + let in_what = Tokens.describe value in + Parse_error.illegal ~in_what error_text span) + in + `List (list_kind, `Light, []) + |> Loc.at span + |> return + |> Writer.warning illegal + } + let item_open := | LI; { Tokens.LI } From 167895bddeb6c62a3fd94a199901ff181fd1e3c2 Mon Sep 17 00:00:00 2001 From: faycarsons Date: Thu, 16 Jan 2025 14:11:38 -0500 Subject: [PATCH 143/150] Clean and add comments --- src/parser/parser.mly | 35 +++++--- src/parser/parser_aux.ml | 45 +++++++++++ src/parser/tokens.ml | 170 ++++++++++++++++----------------------- 3 files changed, 140 insertions(+), 110 deletions(-) diff --git a/src/parser/parser.mly b/src/parser/parser.mly index b1dd1795b2..7ce0f2a947 100644 --- a/src/parser/parser.mly +++ b/src/parser/parser.mly @@ -87,6 +87,15 @@ let toplevel := | ~ = toplevel_error; line_break*; <> (* Tokens which cannot begin any block element *) +(* TODO: This should have cases for block elements on the same line: + nestable_block_element -> section_heading + nestable_block_element -> nestable_block_element + nestable_block_element -> tag + etc + To do this we will need to refactor so that this returns a list of elements, + and `toplevel` is an explicit left-recursive list rule + (see Menhir's `list` impl) that can concat those lists together +*) let toplevel_error := (* Stray heavy list items, `{li` or `{-` *) | err = located(item_open); children = sequence_nonempty(inline_element(horizontal_whitespace)); endpos = located(RIGHT_BRACE)?; { @@ -184,12 +193,16 @@ let tag := let tag_with_content := tag = located(Tag_with_content); children = sequence(nestable_block_element(paragraph)); { Writer.map children ~f:(fun children -> let Loc.{ value; location } = tag in - let start = Tokens.tag_with_content_start_point value |> Option.map (fun start -> { location with start }) |> Option.value ~default:location in + let start = + tag_with_content_start_point value + |> Option.map (fun start -> { location with start }) + |> Option.value ~default:location + in let span = Loc.span @@ start :: List.map Loc.location children in - Loc.at span @@ Tokens.tag_with_content children value) + Loc.at span @@ tag_with_content children value) } | tag = located(Tag_with_content); horizontal_whitespace?; { - return @@ { tag with Loc.value = Tokens.tag_with_content [] tag.Loc.value } + return @@ { tag with Loc.value = tag_with_content [] tag.Loc.value } } (* NOTE: (@FayCarsons) Right now this is the only way to accept a newline after a tag_with_content, adding an optional newline causes unsolvable @@ -199,13 +212,13 @@ let tag_with_content := tag = located(Tag_with_content); children = sequence(nes | tag = located(Tag_with_content); Single_newline; children = sequence(nestable_block_element(paragraph)); { Writer.map children ~f:(fun children -> let Loc.{ value; location } = tag in - let start = Tokens.tag_with_content_start_point value |> Option.map (fun start -> { location with start }) |> Option.value ~default:location in + let start = tag_with_content_start_point value |> Option.map (fun start -> { location with start }) |> Option.value ~default:location in let span = Loc.span @@ start :: List.map Loc.location children in - Loc.at span @@ Tokens.tag_with_content children value) + Loc.at span @@ tag_with_content children value) } let tag_bare := tag = located(Tag); horizontal_whitespace?; { - return @@ Loc.map (Fun.const @@ Tokens.tag_bare tag) tag + return @@ Loc.map (Fun.const @@ tag_bare tag) tag } (* INLINE ELEMENTS *) @@ -243,7 +256,7 @@ let style := Writer.Warning (Parse_error.should_not_be_empty ~what span) in Writer.ensure not_empty warning children - |> Writer.map ~f:(fun c -> Loc.at span @@ `Styled (Tokens.to_ast_style style, trim_start c)) + |> Writer.map ~f:(fun c -> Loc.at span @@ `Styled (ast_style style, trim_start c)) } | style = located(Style); endpos = located(RIGHT_BRACE); { let span = Loc.delimited style endpos in @@ -252,7 +265,7 @@ let style := let what = Tokens.describe @@ Style style in Writer.Warning (Parse_error.should_not_be_empty ~what span) in - let inner = Loc.at span @@ `Styled (Tokens.to_ast_style style, []) in + let inner = Loc.at span @@ `Styled (ast_style style, []) in Writer.return_warning inner warning } | style = located(Style); endpos = located(RIGHT_CODE_DELIMITER); { @@ -266,7 +279,7 @@ let style := let should_not_be_empty = Writer.Warning (Parse_error.should_not_be_empty ~what:style_desc span) in - let inner = Loc.at span @@ `Styled (Tokens.to_ast_style style, []) in + let inner = Loc.at span @@ `Styled (ast_style style, []) in return inner |> Writer.warning not_allowed |> Writer.warning should_not_be_empty @@ -279,7 +292,7 @@ let style := let illegal_section = Loc.extract ~input ~start_pos ~end_pos in Parse_error.illegal ~in_what illegal_section span) in - let inner = Loc.at span @@ `Styled (Tokens.to_ast_style style.Loc.value, []) in + let inner = Loc.at span @@ `Styled (ast_style style.Loc.value, []) in Writer.return_warning inner illegal } | style = located(Style); endpos = located(END); { @@ -289,7 +302,7 @@ let style := let in_what = Tokens.describe @@ Style style in Writer.Warning (Parse_error.end_not_allowed ~in_what span) in - let inner = Loc.at span @@ `Styled (Tokens.to_ast_style style, []) in + let inner = Loc.at span @@ `Styled (ast_style style, []) in Writer.return_warning inner warning } diff --git a/src/parser/parser_aux.ml b/src/parser/parser_aux.ml index 3abbf4f67b..8b258d0b16 100644 --- a/src/parser/parser_aux.ml +++ b/src/parser/parser_aux.ml @@ -203,3 +203,48 @@ let split_light_list_items : in let list_kind, elements = go [] None items in (list_kind, elements) + +let tag_with_content_start_point : Tokens.tag_with_content -> Loc.point option = + function + | Before { start; _ } + | Raise { start; _ } + | Param { start; _ } + | See { start; _ } -> + Some start + | _ -> None + +let to_ref : Tokens.uri_kind -> [ `Url | `File | `Document ] = function + | URL -> `Url + | File -> `File + | Document -> `Document + +let tag_with_content + (content : Ast.nestable_block_element Loc.with_location list) : + Tokens.tag_with_content -> Ast.tag = function + | DEPRECATED -> `Deprecated content + | Before { inner; _ } -> `Before (inner, content) + | Raise { inner; _ } -> `Raise (inner, content) + | Param { inner; _ } -> `Param (inner, content) + | See { inner = kind, href; _ } -> `See (to_ref kind, href, content) + | RETURN -> `Return content + | CHILDREN_ORDER -> `Children_order content + | TOC_STATUS -> `Toc_status content + | ORDER_CATEGORY -> `Order_category content + | SHORT_TITLE -> `Short_title content + +let tag_bare : Tokens.tag Loc.with_location -> Ast.tag = function + | { value = Author s; _ } -> `Author s.inner + | { value = Since s; _ } -> `Since s.inner + | { value = Version s; _ } -> `Version s.inner + | { value = Canonical s; _ } as loc -> `Canonical { loc with value = s.inner } + | { value = INLINE; _ } -> `Inline + | { value = OPEN; _ } -> `Open + | { value = CLOSED; _ } -> `Closed + | { value = HIDDEN; _ } -> `Hidden + +let ast_style : Tokens.style -> Ast.style = function + | Bold -> `Bold + | Italic -> `Italic + | Emphasis -> `Emphasis + | Superscript -> `Superscript + | Subscript -> `Subscript diff --git a/src/parser/tokens.ml b/src/parser/tokens.ml index be327b1117..9bf4d6acff 100644 --- a/src/parser/tokens.ml +++ b/src/parser/tokens.ml @@ -7,7 +7,7 @@ type table_cell_kind = Header | Data type list_kind = Ordered | Unordered -type internal_reference = URL | File | Document +type uri_kind = URL | File | Document type math = { start : Loc.point; content : string } let ast_list_kind : list_kind -> Ast.list_kind = function @@ -31,105 +31,79 @@ and meta = { (* Token names follow Menhir conventions where ALL_CAPS denote a unit variant, in this case generally representing a delimiter *) type token = - | Space of string - | Single_newline of string + | Space of string (* ' ' *) + | Single_newline of string (* '\n' with optional leading horizontal space *) | Blank_line of string - | Simple_ref of string Loc.with_location with_start_point - | Ref_with_replacement of string Loc.with_location with_start_point - | Simple_link of string with_start_point - | Link_with_replacement of string with_start_point - | MODULES - | Media of (media * media_target) with_start_point + (* '\n\n' with optional leading horizontal space for both newlines *) + | Simple_ref of string Loc.with_location with_start_point (* '{!Foo}' *) + | Ref_with_replacement of + string Loc.with_location with_start_point (* '{{!Foo} bar}' *) + | Simple_link of string with_start_point (* '{:janestreet.com}' *) + | Link_with_replacement of + string with_start_point (* '{{:janestreet.com}Jane Street}' *) + | MODULES (* {!modules *) + | Media of + (media * media_target) with_start_point (* i.e. '{audio!foo.wav}' *) | Media_with_replacement of (media * media_target * string) with_start_point - | Math_span of string with_start_point - | Math_block of string with_start_point - | Code_span of string with_start_point - | Code_block of code_block with_start_point + (* i.e. '{{audio!foo.wav} presentation on monadic fixpoints}' *) + | Math_span of string with_start_point (* '{m 2^32-1 }' *) + | Math_block of string with_start_point (* '{math \\sum_{i=0}^n x^i%}}' *) + | Code_span of string with_start_point (* '[bind m f]' *) + | Code_block of code_block with_start_point (* '{@haskell[fix f = f f]}' *) | Code_block_with_output of code_block with_start_point - | Word of string - | Verbatim of string with_start_point - | RIGHT_CODE_DELIMITER - | RIGHT_BRACE - | Paragraph_style of alignment with_start_point - | Style of style - | List of list_kind - | LI - | DASH - | TABLE_LIGHT - | TABLE_HEAVY - | TABLE_ROW - | Table_cell of Ast.table_cell_kind - | MINUS - | PLUS - | BAR - | Section_heading of (int * string option) with_start_point - | Tag of tag - | Tag_with_content of tag_with_content + (* '{@haskell[fix f = f f][Haskell can be cool]}' *) + | Word of string (* Any whitespace separated word *) + | Verbatim of string with_start_point (* '{v let foo = bar v}' *) + | RIGHT_CODE_DELIMITER (* ']}' *) + | RIGHT_BRACE (* '}' *) + | Paragraph_style of alignment with_start_point (* i.e. '{L' *) + | Style of style (* i.e. '{i italic}' or '{_ subscript}' *) + | List of list_kind (* '{ul' or '{ol' *) + | LI (* '{li' *) + | DASH (* '{-' *) + | TABLE_LIGHT (* '{t' *) + | TABLE_HEAVY (* '{table' *) + | TABLE_ROW (* '{tr' *) + | Table_cell of Ast.table_cell_kind (* '{td' *) + | MINUS (* '-' *) + | PLUS (* '+' *) + | BAR (* '|' *) + | Section_heading of (int * string option) with_start_point (* '{2 Foo}' *) + | Tag of tag (* '@author Fay Carsons' *) + | Tag_with_content of tag_with_content (* i.e. '@raises Foo' *) | Raw_markup of (string option * string) with_start_point - | END + (* '{%

inline html!<\p> %}' *) + | END (* End of input *) and tag = - | Author of string with_start_point - | Since of string with_start_point - | Version of string with_start_point - | Canonical of string with_start_point - | INLINE - | OPEN - | CLOSED - | HIDDEN + | Author of string with_start_point (* '@author' *) + | Since of string with_start_point (* '@since' *) + | Version of string with_start_point (* '@version' *) + | Canonical of string with_start_point (* '@canonical' *) + | INLINE (* '@inline' *) + | OPEN (* '@open' *) + | CLOSED (* '@closed' *) + | HIDDEN (* '@hidden' *) + +(* A tag with content is a tag which may be followed by some number of block + elements delimited by a blank line *) and tag_with_content = - | DEPRECATED - | Before of string with_start_point - | Raise of string with_start_point - | Param of string with_start_point - | See of (internal_reference * string) with_start_point - | RETURN - | CHILDREN_ORDER - | TOC_STATUS - | ORDER_CATEGORY - | SHORT_TITLE + | DEPRECATED (* '@deprecated with the release of 1.0.1' *) + | Before of + string with_start_point (* '@before 1.0.1 this did something else' *) + | Raise of string with_start_point (* '@raises Foo' *) + | Param of string with_start_point (* '@param foo is a monad' *) + | See of (uri_kind * string) with_start_point + (* '@see for more info' *) + | RETURN (* '@return a monad' *) + | CHILDREN_ORDER (* '@children_order' *) + | TOC_STATUS (* '@toc_status' *) + | ORDER_CATEGORY (* '@order_category' *) + | SHORT_TITLE (* '@short_title' *) and media = | Reference of string Loc.with_location | Link of string Loc.with_location and media_target = Audio | Video | Image -let to_ref : internal_reference -> [ `Url | `File | `Document ] = function - | URL -> `Url - | File -> `File - | Document -> `Document - -let tag_with_content - (content : Ast.nestable_block_element Loc.with_location list) : - tag_with_content -> Ast.tag = function - | DEPRECATED -> `Deprecated content - | Before { inner; _ } -> `Before (inner, content) - | Raise { inner; _ } -> `Raise (inner, content) - | Param { inner; _ } -> `Param (inner, content) - | See { inner = kind, href; _ } -> `See (to_ref kind, href, content) - | RETURN -> `Return content - | CHILDREN_ORDER -> `Children_order content - | TOC_STATUS -> `Toc_status content - | ORDER_CATEGORY -> `Order_category content - | SHORT_TITLE -> `Short_title content - -let tag_with_content_start_point : tag_with_content -> Loc.point option = - function - | Before { start; _ } - | Raise { start; _ } - | Param { start; _ } - | See { start; _ } -> - Some start - | _ -> None - -let tag_bare : tag Loc.with_location -> Ast.tag = function - | { value = Author s; _ } -> `Author s.inner - | { value = Since s; _ } -> `Since s.inner - | { value = Version s; _ } -> `Version s.inner - | { value = Canonical s; _ } as loc -> `Canonical { loc with value = s.inner } - | { value = INLINE; _ } -> `Inline - | { value = OPEN; _ } -> `Open - | { value = CLOSED; _ } -> `Closed - | { value = HIDDEN; _ } -> `Hidden - let media_description ref_kind media_kind = let media_kind = match media_kind with @@ -277,6 +251,9 @@ let describe : token -> string = function | Tag CLOSED -> "'@closed'" | Tag HIDDEN -> "'@hidden'" +(* Functions and helpers for describing a parsed AST node + This is useful in error handling inside the Menhir parser *) + let empty_code_block = { inner = @@ -298,13 +275,6 @@ let of_ast_style : Ast.style -> style = function | `Superscript -> Superscript | `Subscript -> Subscript -let to_ast_style : style -> Ast.style = function - | Bold -> `Bold - | Italic -> `Italic - | Emphasis -> `Emphasis - | Superscript -> `Superscript - | Subscript -> `Subscript - let dummy_ref inner = { inner = @@ -357,7 +327,6 @@ let of_media : start = Loc.dummy_pos; } -(* NOTE: Fix list *) let describe_nestable_block : Ast.nestable_block_element -> string = function | `Paragraph ws -> ( match ws with @@ -366,14 +335,17 @@ let describe_nestable_block : Ast.nestable_block_element -> string = function | `Code_block _ -> describe @@ Code_block empty_code_block | `Verbatim _ -> describe @@ Verbatim { inner = ""; start = Loc.dummy_pos } | `Modules _ -> describe MODULES - | `List (_, _, _) -> "List" + | `List (ordering, `Light, _) -> + describe @@ if ordering = `Ordered then PLUS else MINUS + | `List (ordering, `Heavy, _) -> + describe @@ List (if ordering = `Ordered then Ordered else Unordered) | `Table (_, kind) -> describe @@ if kind = `Light then TABLE_LIGHT else TABLE_HEAVY | `Math_block _ -> describe @@ Math_block { start = Loc.dummy_pos; inner = "" } | `Media _ as media -> describe @@ of_media media -let of_ast_ref : [ `Document | `File | `Url ] -> internal_reference = function +let of_ast_ref : [ `Document | `File | `Url ] -> uri_kind = function | `Document -> Document | `File -> File | `Url -> URL From d1dcf173d0a140ced028bfca9b67440207cf9ec5 Mon Sep 17 00:00:00 2001 From: faycarsons Date: Thu, 16 Jan 2025 15:20:31 -0500 Subject: [PATCH 144/150] Add comments, TODO file --- src/parser/TODO.md | 15 ++++++++++++++- src/parser/lexer.mll | 4 ---- src/parser/parser.mly | 5 ++--- src/parser/test_driver/tester.ml | 8 ++++++-- 4 files changed, 22 insertions(+), 10 deletions(-) diff --git a/src/parser/TODO.md b/src/parser/TODO.md index 10ca818b43..c44dabcdbd 100644 --- a/src/parser/TODO.md +++ b/src/parser/TODO.md @@ -1 +1,14 @@ -- Some locations are p +- Some locations are still not accurate, all necessary tokens should pass + starting location so this should be trivial to fix +- Top-level errors like two nestable block elements or headings on the same line + need to be handled. Currently, they parse correctly but do not emit a warning. +- Repetition in `tag_with_content` parse rule(parser.mly:207). Two productions are identical + save for a newline. This is because an optional newline causes a reduce conflict due to + `nestable_block_element`'s handling of whitespace. +- Improve error handling inside light table cells. Currently, we cannot do much besides use + Menhir's `error` token, which erases all information about the error which happened and we + have to use a string of the offending token to display what went wrong to users, which + doesn't necessarily communicate a lot +- Tests. There are a few tests, like the ones which test the positions in the lexing buffer, + which don't apply to the new parser. Others expect error messages which cannot be produced + by the relevant parser rule diff --git a/src/parser/lexer.mll b/src/parser/lexer.mll index 4edbb95026..c5b59d1ef3 100644 --- a/src/parser/lexer.mll +++ b/src/parser/lexer.mll @@ -203,10 +203,6 @@ let warning = with_location_adjustments @@ fun _ input location error -> input.warnings <- error location :: input.warnings -(* TODO: Opening delimiter position and parameter position need to be tweaked here - I think we probably need inner to be `string Loc.with_location` since - a `Reference takes one as a parameter. -*) let reference_token : Lexing.lexbuf -> input diff --git a/src/parser/parser.mly b/src/parser/parser.mly index 7ce0f2a947..2f65465161 100644 --- a/src/parser/parser.mly +++ b/src/parser/parser.mly @@ -204,7 +204,7 @@ let tag_with_content := tag = located(Tag_with_content); children = sequence(nes | tag = located(Tag_with_content); horizontal_whitespace?; { return @@ { tag with Loc.value = tag_with_content [] tag.Loc.value } } - (* NOTE: (@FayCarsons) Right now this is the only way to accept a newline + (* TODO: (@FayCarsons) Right now this is the only way to accept a newline after a tag_with_content, adding an optional newline causes unsolvable reduce conflicts. Maybe if the line break/whitespace handling for nestable block element were @@ -308,7 +308,6 @@ let style := (* LINKS + REFS *) -(* TODO: See comment @ lexer.mll:205 *) let reference := | ref_body = located(Simple_ref); { let Loc.{ value = Tokens.{ inner; start }; location } = ref_body in @@ -607,7 +606,7 @@ let cell_inner := let text_span = Loc.extract ~start_pos ~end_pos ~input in Parse_error.illegal ~in_what:(Tokens.describe TABLE_LIGHT) text_span span) in - (* NOTE: (@FayCarsons) + (* TODO: (@FayCarsons) This is the best we can do right now. Accepting a `nestable_block_element`, for example, causes a reduce/reduce conflict. So we have to lose some information(what the invalid element was) via the diff --git a/src/parser/test_driver/tester.ml b/src/parser/test_driver/tester.ml index d457c6560e..dd65fe1c69 100644 --- a/src/parser/test_driver/tester.ml +++ b/src/parser/test_driver/tester.ml @@ -280,7 +280,11 @@ let explicit_list_tests = ] let isolated = - [ ("light list horizontal offset", "- foo bar baz\n - ba ba ba") ] + [ + ("tag in list", "- @deprecated foo bar baz"); + ("section in list", "- {2 Foo}"); + ("Paragraph after section", "{2 Foo} bar"); + ] (* Cases (mostly) taken from the 'odoc for library authors' document *) let documentation_cases = @@ -389,7 +393,7 @@ let run_test (label, case) = Right (mkfailure label exns (Option.get !offending_token) - !failure_index !tokens) + !failure_index (List.rev !tokens)) let sep = String.init 80 @@ Fun.const '-' From e7c41b987de379828134b256d146a10b48daedc1 Mon Sep 17 00:00:00 2001 From: faycarsons Date: Thu, 16 Jan 2025 15:35:00 -0500 Subject: [PATCH 145/150] fix unclosed links and refs --- src/parser/parser.mly | 24 ++++++++++++++++++++++-- 1 file changed, 22 insertions(+), 2 deletions(-) diff --git a/src/parser/parser.mly b/src/parser/parser.mly index 2f65465161..da57d7ab9f 100644 --- a/src/parser/parser.mly +++ b/src/parser/parser.mly @@ -341,6 +341,16 @@ let reference := let node = Loc.at span @@ `Reference (`With_text, inner, children) in Writer.return_warning node not_allowed } + | content = Ref_with_replacement; endpos = located(END); { + let Tokens.{ inner; start } = content in + let span = { endpos.Loc.location with start } in + let in_what = Tokens.describe @@ Ref_with_replacement content in + let end_not_allowed = Writer.Warning (Parse_error.end_not_allowed ~in_what span) in + `Reference (`With_text, inner, []) + |> Loc.at span + |> return + |> Writer.warning end_not_allowed + } let link := | content = located(Simple_link); { @@ -376,10 +386,20 @@ let link := let span = { endpos.Loc.location with start } in let node = Loc.at span @@ `Link (inner, []) in let what = Tokens.describe @@ Link_with_replacement content in - let warning = + let should_not_be_empty = Writer.Warning (Parse_error.should_not_be_empty ~what span) in - Writer.return_warning node warning + Writer.return_warning node should_not_be_empty + } + | content = Link_with_replacement; endpos = located(END); { + let Tokens.{ inner; start } = content in + let span = { endpos.Loc.location with start } in + let in_what = Tokens.describe @@ Link_with_replacement content in + let end_not_allowed = Writer.Warning (Parse_error.end_not_allowed ~in_what span) in + `Link (inner, []) + |> Loc.at span + |> return + |> Writer.warning end_not_allowed } (* LIST *) From 2e89908cd47fb6661a5ecf2f6ea1a2ffca07eb55 Mon Sep 17 00:00:00 2001 From: faycarsons Date: Thu, 16 Jan 2025 15:39:05 -0500 Subject: [PATCH 146/150] update lexbuf line from `Code_block` content --- src/parser/lexer.mll | 17 +++++++++-------- src/parser/parser.mly | 20 ++++++++++---------- 2 files changed, 19 insertions(+), 18 deletions(-) diff --git a/src/parser/lexer.mll b/src/parser/lexer.mll index c5b59d1ef3..7e16fda7f6 100644 --- a/src/parser/lexer.mll +++ b/src/parser/lexer.mll @@ -279,16 +279,17 @@ let emit_verbatim lexbuf input start_offset buffer = the value of the content in the tree has whitespace stripped from the beginning, and trailing empty lines removed. *) let emit_code_block lexbuf input ~start_offset ~content_offset ~metadata ~delimiter ~terminator ~content has_output = - let content = Buffer.contents content |> trim_trailing_blank_lines in + let content = Buffer.contents content in let content_location = input.offset_to_location content_offset in + update_content_newlines ~content lexbuf; let content = - with_location_adjustments - (fun _ _ _ c -> - let first_line_offset = content_location.column in - trim_leading_whitespace ~first_line_offset c) - lexbuf - input - content + trim_trailing_blank_lines content + |> with_location_adjustments + (fun _ _ _ c -> + let first_line_offset = content_location.column in + trim_leading_whitespace ~first_line_offset c) + lexbuf + input |> trim_leading_blank_lines |> with_location_adjustments ~adjust_end_by:terminator diff --git a/src/parser/parser.mly b/src/parser/parser.mly index da57d7ab9f..02ea962f54 100644 --- a/src/parser/parser.mly +++ b/src/parser/parser.mly @@ -309,32 +309,32 @@ let style := (* LINKS + REFS *) let reference := - | ref_body = located(Simple_ref); { - let Loc.{ value = Tokens.{ inner; start }; location } = ref_body in + | content = located(Simple_ref); { + let Loc.{ value = Tokens.{ inner; start }; location } = content in let span = { location with start } in return @@ Loc.at span @@ `Reference (`Simple, inner, []) } - | ref_body = Ref_with_replacement; children = sequence_nonempty(inline_element(whitespace)); endpos = located(RIGHT_BRACE); { + | content = Ref_with_replacement; children = sequence_nonempty(inline_element(whitespace)); endpos = located(RIGHT_BRACE); { Writer.bind children ~f:(fun c -> - let Tokens.{ inner; start } = ref_body in + let Tokens.{ inner; start } = content in let span = { endpos.Loc.location with start } in return @@ Loc.at span @@ `Reference (`With_text, inner, trim_start c)) } - | ref_body = Ref_with_replacement; endpos = located(RIGHT_BRACE); { - let Tokens.{ inner; start } = ref_body in + | content = Ref_with_replacement; endpos = located(RIGHT_BRACE); { + let Tokens.{ inner; start } = content in let span = { endpos.Loc.location with start } in let node = Loc.at span @@ `Reference (`With_text, inner, []) in let warning = - let what = Tokens.describe @@ Ref_with_replacement ref_body in + let what = Tokens.describe @@ Ref_with_replacement content in Writer.Warning (Parse_error.should_not_be_empty ~what span) in Writer.return_warning node warning } - | ref_body = Ref_with_replacement; children = sequence_nonempty(inline_element(whitespace))?; endpos = located(END); { - let Tokens.{ inner; start } = ref_body in + | content = Ref_with_replacement; children = sequence_nonempty(inline_element(whitespace))?; endpos = located(END); { + let Tokens.{ inner; start } = content in let span = { endpos.Loc.location with start } in let not_allowed = - let in_what = Tokens.describe (Ref_with_replacement ref_body) in + let in_what = Tokens.describe (Ref_with_replacement content) in Writer.Warning (Parse_error.end_not_allowed ~in_what span) in let* children = Option.value ~default:(return []) children in From 10f9113a107f533ef416407873b0a5992e930957 Mon Sep 17 00:00:00 2001 From: faycarsons Date: Thu, 16 Jan 2025 15:46:20 -0500 Subject: [PATCH 147/150] Warn empty code_block --- src/parser/parser.mly | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/src/parser/parser.mly b/src/parser/parser.mly index 02ea962f54..ea1dd2c9c0 100644 --- a/src/parser/parser.mly +++ b/src/parser/parser.mly @@ -799,8 +799,14 @@ let code_block := let Loc.{ value = Tokens.{ inner; start }; location } = content in let Tokens.{ metadata; delimiter; content } = inner in let meta = Option.map (fun Tokens.{ language_tag; tags } -> Ast.{ language = language_tag; tags }) metadata in - let node = `Code_block Ast.{ meta; delimiter; content; output = None } in - return @@ Loc.at { location with start } node + let span = { location with start } in + let should_not_be_empty = Writer.Warning ( + let what = Tokens.describe @@ Code_block Tokens.{ inner; start } in + Parse_error.should_not_be_empty ~what span) + in + let node = Loc.at span @@ `Code_block Ast.{ meta; delimiter; content; output = None } in + Fun.const node <$> Writer.ensure (Loc.is has_content) should_not_be_empty (return content) + } | content = located(Code_block_with_output); output = sequence_nonempty(nestable_block_element(paragraph)); RIGHT_CODE_DELIMITER; { let* output = Option.some <$> output in From 9a11534bbedc4872ddf5bea17922b7164e950350 Mon Sep 17 00:00:00 2001 From: faycarsons Date: Thu, 16 Jan 2025 15:51:06 -0500 Subject: [PATCH 148/150] Note on location problems --- src/parser/TODO.md | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/parser/TODO.md b/src/parser/TODO.md index c44dabcdbd..c38eb1919e 100644 --- a/src/parser/TODO.md +++ b/src/parser/TODO.md @@ -1,5 +1,7 @@ -- Some locations are still not accurate, all necessary tokens should pass - starting location so this should be trivial to fix +- Some locations are still not accurate. This seems to be acting up in comments that span + many lines. There is potentially an off-by-one error or similar in + `Lexer.update_content_newlines` which is (supposed) to increment the lexbuf's line + position for every newline encountered in some content (i.e. inside of a code or math block) - Top-level errors like two nestable block elements or headings on the same line need to be handled. Currently, they parse correctly but do not emit a warning. - Repetition in `tag_with_content` parse rule(parser.mly:207). Two productions are identical From 3f1cb4934d0627e054912c6c2c16a62c49910ae6 Mon Sep 17 00:00:00 2001 From: faycarsons Date: Thu, 16 Jan 2025 16:02:14 -0500 Subject: [PATCH 149/150] Notes in `TODO.md` --- src/parser/TODO.md | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/src/parser/TODO.md b/src/parser/TODO.md index c38eb1919e..ce0a7197ba 100644 --- a/src/parser/TODO.md +++ b/src/parser/TODO.md @@ -2,15 +2,40 @@ many lines. There is potentially an off-by-one error or similar in `Lexer.update_content_newlines` which is (supposed) to increment the lexbuf's line position for every newline encountered in some content (i.e. inside of a code or math block) + - Top-level errors like two nestable block elements or headings on the same line need to be handled. Currently, they parse correctly but do not emit a warning. + - Repetition in `tag_with_content` parse rule(parser.mly:207). Two productions are identical save for a newline. This is because an optional newline causes a reduce conflict due to `nestable_block_element`'s handling of whitespace. + - Improve error handling inside light table cells. Currently, we cannot do much besides use Menhir's `error` token, which erases all information about the error which happened and we have to use a string of the offending token to display what went wrong to users, which doesn't necessarily communicate a lot + - Tests. There are a few tests, like the ones which test the positions in the lexing buffer, which don't apply to the new parser. Others expect error messages which cannot be produced by the relevant parser rule + +- Likely some error cases which have not been handled. These should be trivial to fix, + you should really only need to add a new production to the relevant parser rule which + handles the offending token + +Notes for anyone working on this +- Due to the nature of Menhir, this parser is difficult to work on. + - Changes will have unexpected non-local consequences due to more or less tokens being consumed by + some neighboring (in the parse tree) rule. + - You need to familiarize yourself with the branch of the parse tree that you're working on + (i.e. toplevel->nestable_block_element->paragraph) before you start making non-trivial changes. + - Type errors will point towards unrelated sections of the parser or give you incorrect information + about what has gone wrong. + +- Be as specific as possible. Avoid optional tokens when possible. Prefer the non-empty + list rules (`sequence_nonempty`, `sequence_separated_nonempty`) over the alternatives. + Ambiguity will produce a compile-time reduce/reduce rule if you're lucky, unexpected + behavior at runtime if you're not. + +- Contact me on the company slack or at faycarsons23@gmail.com if you're confused about + anything! From 405b59e61e7352f54e1973a4557849d58dcf9b5d Mon Sep 17 00:00:00 2001 From: faycarsons Date: Thu, 16 Jan 2025 16:08:30 -0500 Subject: [PATCH 150/150] Note about emulating context --- src/parser/TODO.md | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/parser/TODO.md b/src/parser/TODO.md index ce0a7197ba..2d2f2dda4c 100644 --- a/src/parser/TODO.md +++ b/src/parser/TODO.md @@ -32,6 +32,11 @@ Notes for anyone working on this - Type errors will point towards unrelated sections of the parser or give you incorrect information about what has gone wrong. +- If you need to emulate some sort of context like "paragraphs can't accept '|' tokens if they're inside + tables", then you need to parameterize that rule by some other rule which dictates what it can accept. + For example, toplevel block elements match `paragraph(any_symbol)` and tables match + `paragraph(symbols_except_bar)` + - Be as specific as possible. Avoid optional tokens when possible. Prefer the non-empty list rules (`sequence_nonempty`, `sequence_separated_nonempty`) over the alternatives. Ambiguity will produce a compile-time reduce/reduce rule if you're lucky, unexpected