diff --git a/.ocamlformat b/.ocamlformat
index 5626d8362a..9d9e841212 100644
--- a/.ocamlformat
+++ b/.ocamlformat
@@ -1,3 +1,3 @@
 module-item-spacing=preserve
-version=0.26.1
+version=0.26.2
 ocaml-version=4.02
diff --git a/dune-project b/dune-project
index 27d55bb37f..968f25cd5c 100644
--- a/dune-project
+++ b/dune-project
@@ -11,7 +11,7 @@
 (authors
  "Anton Bachin "
- "Daniel B\195\188nzli "
+ "Daniel Bünzli "
  "David Sheets "
  "Jon Ludlam "
  "Jules Aguillon "
@@ -23,10 +23,10 @@
  "Emile Trotignon ")
 (maintainers
- "Daniel B\195\188nzli "
+ "Daniel Bünzli "
  "Jon Ludlam "
  "Jules Aguillon "
- "Paul-Elliot Angl\195\168s d'Auriac ")
+ "Paul-Elliot Anglès d'Auriac ")
 (cram enable)
diff --git a/src/parser/TODO.md b/src/parser/TODO.md
new file mode 100644
index 0000000000..2d2f2dda4c
--- /dev/null
+++ b/src/parser/TODO.md
@@ -0,0 +1,46 @@
+- Some locations are still not accurate. This seems to be acting up in comments that span
+  many lines. There is potentially an off-by-one error or similar in
+  `Lexer.update_content_newlines`, which is supposed to increment the lexbuf's line
+  position for every newline encountered in some content (i.e. inside a code or math block).
+
+- Top-level errors like two nestable block elements or headings on the same line
+  need to be handled. Currently, they parse correctly but do not emit a warning.
+
+- Repetition in the `tag_with_content` parse rule (parser.mly:207). Two productions are
+  identical save for a newline. This is because an optional newline causes a reduce
+  conflict due to `nestable_block_element`'s handling of whitespace.
+
+- Improve error handling inside light table cells. Currently, we cannot do much besides use
+  Menhir's `error` token, which erases all information about the error that occurred; all
+  we can show users is the text of the offending token, which doesn't necessarily
+  communicate much about what went wrong.
+
+- Tests. There are a few tests, like the ones which test the positions in the lexing buffer,
+  which don't apply to the new parser. Others expect error messages which cannot be produced
+  by the relevant parser rule.
+
+- Likely some error cases which have not been handled. These should be trivial to fix:
+  you should only need to add a new production, handling the offending token, to the
+  relevant parser rule.
+
+Notes for anyone working on this:
+- Due to the nature of Menhir, this parser is difficult to work on.
+  - Changes will have unexpected non-local consequences due to more or fewer tokens being
+    consumed by some neighboring (in the parse tree) rule.
+  - You need to familiarize yourself with the branch of the parse tree that you're working on
+    (e.g. toplevel -> nestable_block_element -> paragraph) before you start making non-trivial changes.
+  - Type errors will point towards unrelated sections of the parser or give you incorrect
+    information about what has gone wrong.
+
+- If you need to emulate some sort of context, like "paragraphs can't accept '|' tokens when
+  they're inside tables", then you need to parameterize that rule by another rule which
+  dictates what it can accept. For example, toplevel block elements match
+  `paragraph(any_symbol)` while tables match `paragraph(symbols_except_bar)` (see the sketch below).
+
+- Be as specific as possible. Avoid optional tokens when possible. Prefer the non-empty
+  list rules (`sequence_nonempty`, `sequence_separated_nonempty`) over the alternatives.
+  Ambiguity will produce a compile-time reduce/reduce conflict if you're lucky, and
+  unexpected behavior at runtime if you're not.
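+
+A purely illustrative sketch of the parameterization pattern described above; the rule
+names are made up and the semantic actions are simplified, but the shape matches how the
+real grammar parameterizes rules such as `paragraph` and `inline_element`:
+
+```
+(* What a paragraph may contain: at the top level a stray '|' is just text,
+   but inside a light table cell '|' must be left for the table rule, so the
+   cell instantiates `paragraph_of` with the more restrictive rule. *)
+let symbol :=
+  | w = Word; { `Word w }
+  | BAR; { `Word "|" }
+
+let symbol_except_bar :=
+  | w = Word; { `Word w }
+
+let paragraph_of(allowed) :=
+  | children = nonempty_list(located(allowed)); { `Paragraph children }
+
+let toplevel_paragraph := ~ = paragraph_of(symbol); <>
+let cell_paragraph := ~ = paragraph_of(symbol_except_bar); <>
+```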
+
+- Contact me on the company Slack or at faycarsons23@gmail.com if you're confused about
+  anything!
diff --git a/src/parser/ast.ml b/src/parser/ast.ml
index ed214dd8ec..4771e6a274 100644
--- a/src/parser/ast.ml
+++ b/src/parser/ast.ml
@@ -1,5 +1,17 @@
 (** Abstract syntax tree representing ocamldoc comments *)
 
+(* TODO: (@faycarsons)
+   We no longer need polymorphism in the parser, so for performance and
+   simplicity's sake the AST should (probably, assuming no issues in other
+   parts of Odoc) be refactored to use nominal sum types.
+*)
+
+type list_kind = [ `Ordered | `Unordered ]
+type list_syntax = [ `Light | `Heavy ]
+type list_item = [ `Li | `Dash ]
+
+type table_cell_kind = [ `Header | `Data ]
+
 (** This is a syntactic representation of ocamldoc comments. See
     {{:https://ocaml.org/releases/4.12/htmlman/ocamldoc.html}The manual} for a detailed
     description of the syntax understood. Note that there is no attempt at semantic
@@ -40,8 +52,8 @@ type code_block_meta = {
   tags : string with_location option;
 }
 
-type media = Token.media
-type media_href = Token.media_href
+type media = [ `Audio | `Video | `Image ]
+type media_href = [ `Reference of string | `Link of string ]
 
 type code_block = {
   meta : code_block_meta option;
@@ -56,9 +68,7 @@ and nestable_block_element =
   | `Verbatim of string
   | `Modules of string with_location list
   | `List of
-      [ `Unordered | `Ordered ]
-      * [ `Light | `Heavy ]
-      * nestable_block_element with_location list list
+      list_kind * list_syntax * nestable_block_element with_location list list
  | `Table of table
  | `Math_block of string (** @since 2.0.0 *)
  | `Media of reference_kind * media_href with_location * string * media
diff --git a/src/parser/dune b/src/parser/dune
index e7a3d1ce4c..69d306aed1 100644
--- a/src/parser/dune
+++ b/src/parser/dune
@@ -1,5 +1,9 @@
 (ocamllex lexer)
 
+(menhir
+ (modules parser)
+ (flags --table --external-tokens Tokens --explain))
+
 (library
  (name odoc_parser)
  (public_name odoc-parser)
@@ -9,4 +13,4 @@
   (backend bisect_ppx))
  (flags (:standard -w -50))
- (libraries astring result camlp-streams))
+ (libraries astring result camlp-streams menhirLib))
diff --git a/src/parser/lexer.mli b/src/parser/lexer.mli
index ce053b495e..ff39d6c13b 100644
--- a/src/parser/lexer.mli
+++ b/src/parser/lexer.mli
@@ -3,8 +3,7 @@
 type input = {
   file : string;
   offset_to_location : int -> Loc.point;
-  warnings : Warning.t list ref;
-  lexbuf : Lexing.lexbuf;
+  mutable warnings : Warning.t list;
 }
 
-val token : input -> Lexing.lexbuf -> Token.t Loc.with_location
+val token : input -> Lexing.lexbuf -> Parser.token
diff --git a/src/parser/lexer.mll b/src/parser/lexer.mll
index 91267bd79b..7e16fda7f6 100644
--- a/src/parser/lexer.mll
+++ b/src/parser/lexer.mll
@@ -1,4 +1,13 @@
 {
+open Tokens
+
+(* NOTE:
+   I think that, ideally, this would be rewritten from scratch, removing the
+   currying in `emit` and `with_location_adjustments` that makes this code
+   difficult to understand.
+
+   Additionally, I think there are probably a few things here that could
+   be better handled by the parser. *)
 let unescape_word : string -> string = fun s ->
   (* The common case is that there are no escape sequences.
*) @@ -14,8 +23,9 @@ let unescape_word : string -> string = fun s -> let c, increment = match c with | '\\' -> - if index + 1 < String.length s then - match s.[index + 1] with + let next = succ index in + if next < String.length s then + match s.[next] with | '{' | '}' | '[' | ']' | '@' as c -> c, 2 | _ -> c, 1 else c, 1 @@ -27,13 +37,12 @@ let unescape_word : string -> string = fun s -> scan_word 0; Buffer.contents buffer -type math_kind = - Inline | Block -let math_constr kind x = - match kind with - | Inline -> `Math_span x - | Block -> `Math_block x +let update_content_newlines : content:string -> Lexing.lexbuf -> unit = + fun ~content lexbuf -> + String.iter + (function '\n' -> Lexing.new_line lexbuf | _ -> ()) + content (* This is used for code and verbatim blocks. It can be done with a regular expression, but the regexp gets quite ugly, so a function is easier to @@ -45,8 +54,8 @@ let trim_leading_blank_lines : string -> string = fun s -> String.length s else match s.[index] with - | ' ' | '\t' | '\r' -> scan_for_last_newline (index + 1) trim_until - | '\n' -> scan_for_last_newline (index + 1) (index + 1) + | ' ' | '\t' | '\r' -> scan_for_last_newline (succ index) trim_until + | '\n' -> let next = succ index in scan_for_last_newline next next | _ -> trim_until in let trim_until = scan_for_last_newline 0 0 in @@ -139,16 +148,32 @@ let trim_leading_whitespace : first_line_offset:int -> string -> string = type input = { file : string; offset_to_location : int -> Loc.point; - warnings : Warning.t list ref; - lexbuf : Lexing.lexbuf; + mutable warnings : Warning.t list; } -let with_location_adjustments - k input ?start_offset ?adjust_start_by ?end_offset ?adjust_end_by value = +type math_kind = + Inline | Block +let math_constr input kind inner start_offset = + let start_pos = input.offset_to_location start_offset in + match kind with + | Inline -> Math_span Tokens.{ start = start_pos; inner } + | Block -> Math_block Tokens.{ start = start_pos; inner } + +let with_location_adjustments : + (Lexing.lexbuf -> input -> Loc.span -> 'a) + -> Lexing.lexbuf + -> input + -> ?start_offset:int + -> ?adjust_start_by:string + -> ?end_offset:int + -> ?adjust_end_by:string + -> 'a + = + fun k lexbuf input ?start_offset ?adjust_start_by ?end_offset ?adjust_end_by value -> let start = match start_offset with - | None -> Lexing.lexeme_start input.lexbuf + | None -> Lexing.lexeme_start lexbuf | Some s -> s in let start = @@ -158,7 +183,7 @@ let with_location_adjustments in let end_ = match end_offset with - | None -> Lexing.lexeme_end input.lexbuf + | None -> Lexing.lexeme_end lexbuf | Some e -> e in let end_ = @@ -172,54 +197,61 @@ let with_location_adjustments end_ = input.offset_to_location end_; } in - k input location value - -let emit = - with_location_adjustments (fun _ -> Loc.at) + k lexbuf input location value let warning = - with_location_adjustments (fun input location error -> - input.warnings := (error location) :: !(input.warnings)) - -let reference_token media start target input lexbuf = - match start with - | "{!" -> `Simple_reference target - | "{{!" -> `Begin_reference_with_replacement_text target - | "{:" -> `Simple_link (target) - | "{{:" -> `Begin_link_with_replacement_text (target) - - | "{image!" -> `Simple_media (`Reference target, `Image) - | "{image:" -> `Simple_media (`Link target, `Image) - | "{audio!" -> `Simple_media (`Reference target, `Audio) - | "{audio:" -> `Simple_media (`Link target, `Audio) - | "{video!" 
-> `Simple_media (`Reference target, `Video) - | "{video:" -> `Simple_media (`Link target, `Video) + with_location_adjustments @@ fun _ input location error -> + input.warnings <- error location :: input.warnings + +let reference_token : + Lexing.lexbuf + -> input + -> 'a + -> opening_delimiter:string + -> start_offset:int + -> content:string Loc.with_location + -> token + = fun lexbuf input media ~opening_delimiter ~start_offset ~content -> + let start = input.offset_to_location start_offset in + match opening_delimiter with + | "{!" -> Simple_ref { inner = content; start } + | "{{!" -> Ref_with_replacement { inner = content; start } + | "{:" -> Simple_link { inner = content.Loc.value; start } + | "{{:" -> Link_with_replacement { inner = content.Loc.value; start } + + | "{image!" -> Media { inner = (Reference content, Image); start } + | "{image:" -> Media { inner = (Link content, Image); start } + | "{audio!" -> Media { inner = (Reference content, Audio); start } + | "{audio:" -> Media { inner = (Link content, Audio); start } + | "{video!" -> Media { inner = (Reference content, Video); start } + | "{video:" -> Media { inner = (Link content, Video); start } | _ -> let target, kind = - match start with - | "{{image!" -> `Reference target, `Image - | "{{image:" -> `Link target, `Image - | "{{audio!" -> `Reference target, `Audio - | "{{audio:" -> `Link target, `Audio - | "{{video!" -> `Reference target, `Video - | "{{video:" -> `Link target, `Video + match opening_delimiter with + | "{{image!" -> Reference content, Image + | "{{image:" -> Link content, Image + | "{{audio!" -> Reference content, Audio + | "{{audio:" -> Link content, Audio + | "{{video!" -> Reference content, Video + | "{{video:" -> Link content, Video | _ -> assert false in - let token_descr = Token.describe (`Media_with_replacement_text (target, kind, "")) in + let token_descr = Tokens.describe (Media_with_replacement { inner = (target, kind, ""); start }) in let content = media token_descr (Buffer.create 1024) 0 (Lexing.lexeme_start lexbuf) input lexbuf in - `Media_with_replacement_text (target, kind, content) + Media_with_replacement { inner = (target, kind, content); start } -let trim_leading_space_or_accept_whitespace input start_offset text = +let trim_leading_space_or_accept_whitespace lexbuf input start_offset text = match text.[0] with | ' ' -> String.sub text 1 (String.length text - 1) | '\t' | '\r' | '\n' -> text | exception Invalid_argument _ -> "" | _ -> - warning - input - ~start_offset - ~end_offset:(start_offset + 2) + warning + lexbuf + input + ~start_offset + ~end_offset:(start_offset + 2) Parse_error.no_leading_whitespace_in_verbatim; text @@ -230,13 +262,15 @@ let trim_trailing_space_or_accept_whitespace text = | _ -> text | exception Invalid_argument _ -> text -let emit_verbatim input start_offset buffer = - let t = Buffer.contents buffer in - let t = trim_trailing_space_or_accept_whitespace t in - let t = trim_leading_space_or_accept_whitespace input start_offset t in - let t = trim_leading_blank_lines t in - let t = trim_trailing_blank_lines t in - emit input (`Verbatim t) ~start_offset +let emit_verbatim lexbuf input start_offset buffer = + let start = input.offset_to_location start_offset in + let inner = Buffer.contents buffer + |> trim_trailing_space_or_accept_whitespace + |> trim_leading_space_or_accept_whitespace lexbuf input start_offset + |> trim_leading_blank_lines + |> trim_trailing_blank_lines + in + Verbatim { inner; start } (* The locations have to be treated carefully in this function. 
We need to ensure that the []`Code_block] location matches the entirety of the block including the terminator, @@ -244,30 +278,64 @@ let emit_verbatim input start_offset buffer = Note that the location reflects the content _without_ stripping of whitespace, whereas the value of the content in the tree has whitespace stripped from the beginning, and trailing empty lines removed. *) -let emit_code_block ~start_offset content_offset input metadata delim terminator c has_results = - let c = Buffer.contents c |> trim_trailing_blank_lines in +let emit_code_block lexbuf input ~start_offset ~content_offset ~metadata ~delimiter ~terminator ~content has_output = + let content = Buffer.contents content in let content_location = input.offset_to_location content_offset in - let c = - with_location_adjustments - (fun _ _location c -> - let first_line_offset = content_location.column in - trim_leading_whitespace ~first_line_offset c) - input c + update_content_newlines ~content lexbuf; + let content = + trim_trailing_blank_lines content + |> with_location_adjustments + (fun _ _ _ c -> + let first_line_offset = content_location.column in + trim_leading_whitespace ~first_line_offset c) + lexbuf + input + |> trim_leading_blank_lines + |> with_location_adjustments + ~adjust_end_by:terminator + ~start_offset:content_offset + (fun _ _ -> Loc.at) + lexbuf + input in - let c = trim_leading_blank_lines c in - let c = with_location_adjustments ~adjust_end_by:terminator ~start_offset:content_offset (fun _ -> Loc.at) input c in - emit ~start_offset input (`Code_block (metadata, delim, c, has_results)) - -let heading_level input level = + let inner = { metadata; delimiter; content } in + let start = input.offset_to_location start_offset in + if has_output then + Code_block_with_output { inner; start } + else + Code_block { inner; start } + +let heading_level lexbuf input level = if String.length level >= 2 && level.[0] = '0' then begin - warning - input ~start_offset:1 (Parse_error.leading_zero_in_heading_level level) + let leading_zero = Parse_error.leading_zero_in_heading_level level in + warning lexbuf input ~start_offset:1 leading_zero end; int_of_string level let buffer_add_lexeme buffer lexbuf = Buffer.add_string buffer (Lexing.lexeme lexbuf) +let trim_start_horizontal_whitespace : string -> string = fun s -> + let rec go idx = + let c = s.[idx] in + if Char.equal c ' ' + then go @@ succ idx + else String.sub s idx (String.length s - idx) + in + go 0 + +let or_insert_lazy default = + function + | None -> Some (default ()) + | x -> x + +(* This is an attempt to prevent having to compute the lexeme start for every + character in `reference_content`/`reference_paren_content` + Unsure if it's actually useful or worth keeping +*) +let ensure_lexeme_start lexbuf = + or_insert_lazy (Fun.const @@ Lexing.lexeme_start lexbuf) + } let markup_char = @@ -285,7 +353,7 @@ let horizontal_space = let newline = '\n' | "\r\n" -let media_start = +let reference = "{!" | "{{!" | "{:" | "{{:" | "{image!" | "{{image!" | "{image:" | "{{image:" | "{video!" | "{{video!" 
| "{video:" | "{{video:" @@ -303,118 +371,179 @@ let language_tag_char = let delim_char = ['a'-'z' 'A'-'Z' '0'-'9' '_' ] -rule reference_paren_content input start ref_offset start_offset depth_paren - buffer = +rule reference_paren_content input opening_delimiter start_offset content_offset depth_paren buffer = parse | '(' { buffer_add_lexeme buffer lexbuf ; - reference_paren_content input start ref_offset start_offset - (depth_paren + 1) buffer lexbuf } + reference_paren_content + input + opening_delimiter + start_offset + content_offset + (succ depth_paren) + buffer + lexbuf } | ')' { buffer_add_lexeme buffer lexbuf ; if depth_paren = 0 then - reference_content input start ref_offset buffer lexbuf + reference_content + input + opening_delimiter + start_offset + content_offset + buffer + lexbuf else - reference_paren_content input start ref_offset start_offset - (depth_paren - 1) buffer lexbuf } + reference_paren_content + input + opening_delimiter + start_offset + content_offset + (pred depth_paren) + buffer + lexbuf } | eof - { warning - input - ~start_offset - (Parse_error.unclosed_bracket ~bracket:"(") ; - Buffer.contents buffer } + { + let unclosed_bracket = + Parse_error.unclosed_bracket ~bracket:"(" + in + warning lexbuf input ~start_offset unclosed_bracket; + let start = input.offset_to_location @@ Option.value ~default:(Lexing.lexeme_start lexbuf) content_offset + and end_ = input.offset_to_location @@ Lexing.lexeme_end lexbuf in + Loc.{ value = Buffer.contents buffer; location = { start; end_; file = input.file }} + } | _ { buffer_add_lexeme buffer lexbuf ; - reference_paren_content input start ref_offset start_offset depth_paren - buffer lexbuf } - -and reference_content input start start_offset buffer = parse + reference_paren_content + input + opening_delimiter + start_offset + content_offset + depth_paren + buffer + lexbuf } + +and reference_content input opening_delimiter start_offset content_offset buffer = + parse | '}' { - Buffer.contents buffer + let start = input.offset_to_location @@ Option.value ~default:(Lexing.lexeme_start lexbuf) content_offset + and end_ = input.offset_to_location @@ Lexing.lexeme_end lexbuf in + Loc.{ value = Buffer.contents buffer; location = { start; end_; file = input.file }} } | '(' { - buffer_add_lexeme buffer lexbuf ; - reference_paren_content input start start_offset - (Lexing.lexeme_start lexbuf) 0 buffer lexbuf + buffer_add_lexeme buffer lexbuf; + reference_paren_content + input + opening_delimiter + start_offset + (ensure_lexeme_start lexbuf content_offset) (* get the content offset if we haven't *) + 0 + buffer + lexbuf } | '"' [^ '"']* '"' { buffer_add_lexeme buffer lexbuf ; - reference_content input start start_offset buffer lexbuf + reference_content + input + opening_delimiter + start_offset + (ensure_lexeme_start lexbuf content_offset) + buffer + lexbuf } | eof - { warning - input - ~start_offset - (Parse_error.unclosed_bracket ~bracket:start) ; - Buffer.contents buffer } + { + let unclosed_bracket = + Parse_error.unclosed_bracket ~bracket:opening_delimiter + in + warning lexbuf input ~start_offset unclosed_bracket; + let start = input.offset_to_location @@ Option.value ~default:(Lexing.lexeme_start lexbuf) content_offset + and end_ = input.offset_to_location @@ Lexing.lexeme_end lexbuf in + Loc.{ value = Buffer.contents buffer; location = { start; end_; file = input.file }} + } | _ { - buffer_add_lexeme buffer lexbuf ; - reference_content input start start_offset buffer lexbuf } + buffer_add_lexeme buffer lexbuf; + 
reference_content + input + opening_delimiter + start_offset + (ensure_lexeme_start lexbuf content_offset) + buffer + lexbuf } and token input = parse | horizontal_space* eof - { emit input `End } + { END } | ((horizontal_space* newline as prefix) - horizontal_space* ((newline horizontal_space*)+ as suffix) as ws) - { emit input (`Blank_line ws) ~adjust_start_by:prefix ~adjust_end_by:suffix } + horizontal_space* (((newline)+ as suffix) as ws)) + { + (* Account for the first newline we got *) + update_content_newlines ~content:("\n" ^ prefix ^ suffix) lexbuf; + Blank_line ws + } - | (horizontal_space* newline horizontal_space* as ws) - { emit input (`Single_newline ws) } + | (horizontal_space* (newline as ws)) + { + Lexing.new_line lexbuf; + Single_newline ws + } | (horizontal_space+ as ws) - { emit input (`Space ws) } + { Space ws } - | (horizontal_space* (newline horizontal_space*)? as p) '}' - { emit input `Right_brace ~adjust_start_by:p } + | (horizontal_space* (newline)? as p) '}' + { + update_content_newlines ~content:p lexbuf; + RIGHT_BRACE } | '|' - { emit input `Bar } + { BAR } | word_char (word_char | bullet_char | '@')* | bullet_char (word_char | bullet_char | '@')+ as w - { emit input (`Word (unescape_word w)) } + { (Word (unescape_word w)) } | '[' { code_span (Buffer.create 1024) 0 (Lexing.lexeme_start lexbuf) input lexbuf } | '-' - { emit input `Minus } + { MINUS } | '+' - { emit input `Plus } + { PLUS } | "{b" - { emit input (`Begin_style `Bold) } + { Style Bold } | "{i" - { emit input (`Begin_style `Italic) } + { Style Italic } | "{e" - { emit input (`Begin_style `Emphasis) } + { Style Emphasis } | "{L" - { emit input (`Begin_paragraph_style `Left) } + { Paragraph_style { inner = Left; start = input.offset_to_location @@ Lexing.lexeme_start lexbuf } } | "{C" - { emit input (`Begin_paragraph_style `Center) } + { Paragraph_style { inner = Center; start = input.offset_to_location @@ Lexing.lexeme_start lexbuf } } | "{R" - { emit input (`Begin_paragraph_style `Right) } + { Paragraph_style { inner = Right; start = input.offset_to_location @@ Lexing.lexeme_start lexbuf } } | "{^" - { emit input (`Begin_style `Superscript) } + { Style Superscript } | "{_" - { emit input (`Begin_style `Subscript) } + { Style Subscript } | "{math" space_char { math Block (Buffer.create 1024) 0 (Lexing.lexeme_start lexbuf) input lexbuf } @@ -423,257 +552,288 @@ and token input = parse { math Inline (Buffer.create 1024) 0 (Lexing.lexeme_start lexbuf) input lexbuf } - | "{!modules:" ([^ '}']* as modules) '}' - { emit input (`Modules modules) } + | "{!modules:" + { MODULES } - | (media_start as start) +| (reference as opening_delimiter) { let start_offset = Lexing.lexeme_start lexbuf in - let target = - reference_content input start start_offset (Buffer.create 16) lexbuf + let content = + reference_content input opening_delimiter start_offset None (Buffer.create 16) lexbuf in - let token = reference_token media start target input lexbuf in - emit ~start_offset input token } + reference_token lexbuf input media ~start_offset ~opening_delimiter ~content + } | "{[" { code_block false (Lexing.lexeme_start lexbuf) (Lexing.lexeme_end lexbuf) None (Buffer.create 256) "" input lexbuf } - | (("{" (delim_char* as delim) "@" horizontal_space*) as prefix) (language_tag_char+ as lang_tag_) + | (("{" (delim_char* as delimiter) "@" horizontal_space*) as prefix) (language_tag_char+ as language_tag) { let start_offset = Lexing.lexeme_start lexbuf in - let lang_tag = - with_location_adjustments ~adjust_start_by:prefix 
(fun _ -> Loc.at) input lang_tag_ + let language_tag = + with_location_adjustments ~adjust_start_by:prefix (fun _ _ -> Loc.at) lexbuf input language_tag in let emit_truncated_code_block () = - let empty_content = with_location_adjustments (fun _ -> Loc.at) input "" in - emit ~start_offset input (`Code_block (Some (lang_tag, None), delim, empty_content, false)) + let empty_content = with_location_adjustments (fun _ _ -> Loc.at) lexbuf input "" in + Code_block { + inner = { + metadata = Some { language_tag; tags = None }; + delimiter = Some delimiter; + content = empty_content + }; + start = input.offset_to_location start_offset + } in (* Disallow result block sections for code blocks without a delimiter. This avoids the surprising parsing of '][' ending the code block. *) - let allow_result_block = delim <> "" in + let allow_result_block = delimiter <> "" in let code_block_with_metadata metadata = let content_offset = Lexing.lexeme_end lexbuf in - let metadata = Some (lang_tag, metadata) in + let metadata = Some { language_tag; tags = metadata } in let prefix = Buffer.create 256 in code_block allow_result_block start_offset content_offset metadata - prefix delim input lexbuf + prefix delimiter input lexbuf in match code_block_metadata_tail input lexbuf with - | `Ok metadata -> code_block_with_metadata metadata - | `Eof -> - warning input ~start_offset Parse_error.truncated_code_block_meta; + | Ok metadata -> code_block_with_metadata metadata + | Error `Eof -> + warning lexbuf input ~start_offset Parse_error.truncated_code_block_meta; emit_truncated_code_block () - | `Invalid_char c -> - warning input ~start_offset - (Parse_error.language_tag_invalid_char lang_tag_ c); + | Error (`Invalid_char c) -> + warning lexbuf input ~start_offset + (Parse_error.language_tag_invalid_char language_tag.Loc.value c); code_block_with_metadata None } | "{@" horizontal_space* '[' { - warning input Parse_error.no_language_tag_in_meta; + warning lexbuf input Parse_error.no_language_tag_in_meta; code_block false (Lexing.lexeme_start lexbuf) (Lexing.lexeme_end lexbuf) None (Buffer.create 256) "" input lexbuf } | "{v" - { verbatim - (Buffer.create 1024) None (Lexing.lexeme_start lexbuf) input lexbuf } + { verbatim (Buffer.create 1024) None (Lexing.lexeme_start lexbuf) input lexbuf } | "{%" ((raw_markup_target as target) ':')? 
(raw_markup as s) ("%}" | eof as e) - { let token = `Raw_markup (target, s) in - if e <> "%}" then - warning - input - ~start_offset:(Lexing.lexeme_end lexbuf) - (Parse_error.not_allowed - ~what:(Token.describe `End) - ~in_what:(Token.describe token)); - emit input token } + { + let start = input.offset_to_location @@ Lexing.lexeme_start lexbuf in + let token = Raw_markup { inner = (target, s); start } in + if e <> "%}" then begin + let not_allowed = + Parse_error.not_allowed + ~what:(Tokens.describe END) + ~in_what:(Tokens.describe token) + in + let start_offset = Lexing.lexeme_end lexbuf in + warning lexbuf input ~start_offset not_allowed + end; + token + } | "{ul" - { emit input (`Begin_list `Unordered) } + { List Unordered } | "{ol" - { emit input (`Begin_list `Ordered) } + { List Ordered } | "{li" - { emit input (`Begin_list_item `Li) } + { LI } | "{-" - { emit input (`Begin_list_item `Dash) } + { DASH } | "{table" - { emit input (`Begin_table_heavy) } + { TABLE_HEAVY } | "{t" - { emit input (`Begin_table_light) } + { TABLE_LIGHT } | "{tr" - { emit input `Begin_table_row } + { TABLE_ROW } | "{th" - { emit input (`Begin_table_cell `Header) } + { Table_cell `Header } | "{td" - { emit input (`Begin_table_cell `Data) } + { Table_cell `Data } | '{' (['0'-'9']+ as level) ':' (([^ '}'] # space_char)* as label) - { emit - input (`Begin_section_heading (heading_level input level, Some label)) } + { let start = input.offset_to_location @@ Lexing.lexeme_start lexbuf in + Section_heading { inner = (heading_level lexbuf input level, Some label); start } } | '{' (['0'-'9']+ as level) - { emit input (`Begin_section_heading (heading_level input level, None)) } + { let start = input.offset_to_location @@ Lexing.lexeme_start lexbuf in + Section_heading { inner = (heading_level lexbuf input level, None); start } } - | "@author" ((horizontal_space+ [^ '\r' '\n']*)? as author) - { emit input (`Tag (`Author author)) } + | "@author" horizontal_space+ (([^ '\r' '\n']*)? 
as author) + { let start = input.offset_to_location @@ Lexing.lexeme_start lexbuf in + Tag (Author { inner = (trim_start_horizontal_whitespace author); start }) } | "@deprecated" - { emit input (`Tag `Deprecated) } + { Tag_with_content DEPRECATED } - | "@param" horizontal_space+ ((_ # space_char)+ as name) - { emit input (`Tag (`Param name)) } + | "@param" horizontal_space+ ((_ # space_char)+ as inner) + { let start = input.offset_to_location @@ Lexing.lexeme_start lexbuf in + Tag_with_content (Param { inner; start }) } - | ("@raise" | "@raises") horizontal_space+ ((_ # space_char)+ as name) - { emit input (`Tag (`Raise name)) } + | ("@raise" | "@raises") horizontal_space+ ((_ # space_char)+ as inner) + { let start = input.offset_to_location @@ Lexing.lexeme_start lexbuf in + Tag_with_content (Raise { inner; start })} | ("@return" | "@returns") - { emit input (`Tag `Return) } + { Tag_with_content RETURN } | ("@children_order") - { emit input (`Tag `Children_order) } + { Tag_with_content CHILDREN_ORDER } | ("@toc_status") - { emit input (`Tag `Toc_status) } + { Tag_with_content TOC_STATUS } | ("@order_category") - { emit input (`Tag `Order_category) } + { Tag_with_content ORDER_CATEGORY } | ("@short_title") - { emit input (`Tag `Short_title) } + { Tag_with_content SHORT_TITLE } | "@see" horizontal_space* '<' ([^ '>']* as url) '>' - { emit input (`Tag (`See (`Url, url))) } + { let start = input.offset_to_location @@ Lexing.lexeme_start lexbuf in + Tag_with_content (See { inner = (URL, trim_start_horizontal_whitespace url); start }) } | "@see" horizontal_space* '\'' ([^ '\'']* as filename) '\'' - { emit input (`Tag (`See (`File, filename))) } + { let start = input.offset_to_location @@ Lexing.lexeme_start lexbuf in + Tag_with_content (See { inner = (File, trim_start_horizontal_whitespace filename); start }) } | "@see" horizontal_space* '"' ([^ '"']* as name) '"' - { emit input (`Tag (`See (`Document, name))) } + { let start = input.offset_to_location @@ Lexing.lexeme_start lexbuf in + Tag_with_content (See { inner = (Document, trim_start_horizontal_whitespace name); start }) } - | "@since" ((horizontal_space+ [^ '\r' '\n']*)? as version) - { emit input (`Tag (`Since version)) } + | "@since" horizontal_space+ (([^ '\r' '\n']+) as inner) + { let start = input.offset_to_location @@ Lexing.lexeme_start lexbuf in + Tag (Since { inner; start }) } + | "@since" + { Tag (Since { inner = ""; start = input.offset_to_location @@ Lexing.lexeme_start lexbuf }) } - | "@before" horizontal_space+ ((_ # space_char)+ as version) - { emit input (`Tag (`Before version)) } + | "@before" horizontal_space+ ((_ # space_char)+ as inner) + { let start = input.offset_to_location @@ Lexing.lexeme_start lexbuf in + Tag_with_content (Before { inner; start }) } - | "@version" ((horizontal_space+ [^ '\r' '\n']*)? as version) - { emit input (`Tag (`Version version)) } + | "@version" horizontal_space+ (([^ '\r' '\n']+) as inner) + { Tag (Version { inner; start = input.offset_to_location @@ Lexing.lexeme_start lexbuf }) } + | "@version" + { Tag (Version { inner = ""; start = input.offset_to_location @@ Lexing.lexeme_start lexbuf }) } - | "@canonical" ((horizontal_space+ [^ '\r' '\n']*)? 
as identifier) - { emit input (`Tag (`Canonical identifier)) } + | "@canonical" horizontal_space+ (([^ '\r' '\n']+) as inner) + { Tag (Canonical { inner; start = input.offset_to_location @@ Lexing.lexeme_start lexbuf }) } + | "@canonical" + { Tag (Canonical { inner = ""; start = input.offset_to_location @@ Lexing.lexeme_start lexbuf }) } | "@inline" - { emit input (`Tag `Inline) } + { Tag INLINE } | "@open" - { emit input (`Tag `Open) } + { Tag OPEN } | "@closed" - { emit input (`Tag `Closed) } + { Tag CLOSED } | "@hidden" - { emit input (`Tag `Hidden) } + { Tag HIDDEN } | "]}" - { emit input `Right_code_delimiter} + { RIGHT_CODE_DELIMITER } | '{' - { try bad_markup_recovery (Lexing.lexeme_start lexbuf) input lexbuf - with Failure _ -> - warning - input - (Parse_error.bad_markup - "{" ~suggestion:"escape the brace with '\\{'."); - emit input (`Word "{") } + { + try bad_markup_recovery (Lexing.lexeme_start lexbuf) input lexbuf + with Failure _ -> begin + let bad_markup = + Parse_error.bad_markup "{" ~suggestion:"escape the brace with '\\{'." + in + warning lexbuf input bad_markup + end; + (Word "{") + } | ']' - { warning input Parse_error.unpaired_right_bracket; - emit input (`Word "]") } + { warning lexbuf input Parse_error.unpaired_right_bracket; + Word "]" } | "@param" - { warning input Parse_error.truncated_param; - emit input (`Tag (`Param "")) } + { warning lexbuf input Parse_error.truncated_param; + Tag_with_content (Param { inner = ""; start = input.offset_to_location @@ Lexing.lexeme_start lexbuf }) } | ("@raise" | "@raises") as tag - { warning input (Parse_error.truncated_raise tag); - emit input (`Tag (`Raise "")) } + { warning lexbuf input (Parse_error.truncated_raise tag); + Tag_with_content (Raise { inner = ""; start = input.offset_to_location @@ Lexing.lexeme_start lexbuf }) } | "@before" - { warning input Parse_error.truncated_before; - emit input (`Tag (`Before "")) } + { warning lexbuf input Parse_error.truncated_before; + Tag_with_content (Before { inner = ""; start = input.offset_to_location @@ Lexing.lexeme_start lexbuf }) } | "@see" - { warning input Parse_error.truncated_see; - emit input (`Word "@see") } + { warning lexbuf input Parse_error.truncated_see; + Word "@see" } | '@' ['a'-'z' 'A'-'Z']+ as tag - { warning input (Parse_error.unknown_tag tag); - emit input (`Word tag) } + { warning lexbuf input (Parse_error.unknown_tag tag); + Word tag } | '@' - { warning input Parse_error.stray_at; - emit input (`Word "@") } + { warning lexbuf input Parse_error.stray_at; + Word "@" } | '\r' - { warning input Parse_error.stray_cr; + { warning lexbuf input Parse_error.stray_cr; token input lexbuf } - | "{!modules:" ([^ '}']* as modules) eof - { warning - input - ~start_offset:(Lexing.lexeme_end lexbuf) - (Parse_error.not_allowed - ~what:(Token.describe `End) - ~in_what:(Token.describe (`Modules ""))); - emit input (`Modules modules) } - and code_span buffer nesting_level start_offset input = parse | ']' - { if nesting_level = 0 then - emit input (`Code_span (Buffer.contents buffer)) ~start_offset + { + if nesting_level = 0 then + Code_span { inner = (Buffer.contents buffer); start = input.offset_to_location start_offset } else begin Buffer.add_char buffer ']'; code_span buffer (nesting_level - 1) start_offset input lexbuf - end } + end + } | '[' { Buffer.add_char buffer '['; - code_span buffer (nesting_level + 1) start_offset input lexbuf } + code_span buffer (succ nesting_level) start_offset input lexbuf } | '\\' ('[' | ']' as c) { Buffer.add_char buffer c; code_span buffer 
nesting_level start_offset input lexbuf } - | newline horizontal_space* (newline horizontal_space*)+ - { warning - input - (Parse_error.not_allowed - ~what:(Token.describe (`Blank_line "\n\n")) - ~in_what:(Token.describe (`Code_span ""))); + | newline horizontal_space* ((newline horizontal_space*)+ as ws) + { + let not_allowed = + Parse_error.not_allowed + ~what:(Tokens.describe (Blank_line "\n\n")) + ~in_what:(Tokens.describe (Code_span {inner = ""; start = Loc.dummy_pos})) + in + warning lexbuf input not_allowed; + update_content_newlines ~content:("\n" ^ ws) lexbuf; Buffer.add_char buffer ' '; code_span buffer nesting_level start_offset input lexbuf } | newline horizontal_space* - { Buffer.add_char buffer ' '; - code_span buffer nesting_level start_offset input lexbuf } + { + Lexing.new_line lexbuf; + Buffer.add_char buffer ' '; + code_span buffer nesting_level start_offset input lexbuf + } | eof - { warning - input - (Parse_error.not_allowed - ~what:(Token.describe `End) - ~in_what:(Token.describe (`Code_span ""))); - emit input (`Code_span (Buffer.contents buffer)) ~start_offset } + { + let not_allowed = + Parse_error.not_allowed + ~what:(Tokens.describe END) + ~in_what:(Tokens.describe (Code_span {inner = ""; start = Loc.dummy_pos})) + in + warning lexbuf input not_allowed; + Code_span { inner = (Buffer.contents buffer); start = input.offset_to_location start_offset } + } | _ as c { Buffer.add_char buffer c; @@ -681,28 +841,32 @@ and code_span buffer nesting_level start_offset input = parse and math kind buffer nesting_level start_offset input = parse | '}' - { if nesting_level == 0 then - emit input (math_constr kind (Buffer.contents buffer)) ~start_offset - else begin + { + if nesting_level == 0 then ( + math_constr input kind (Buffer.contents buffer)) start_offset + else ( Buffer.add_char buffer '}'; - math kind buffer (nesting_level - 1) start_offset input lexbuf - end - } + math kind buffer (pred nesting_level) start_offset input lexbuf) + } | '{' - { Buffer.add_char buffer '{'; - math kind buffer (nesting_level + 1) start_offset input lexbuf } + { + Buffer.add_char buffer '{'; + math kind buffer (succ nesting_level) start_offset input lexbuf + } | ("\\{" | "\\}") as s { Buffer.add_string buffer s; math kind buffer nesting_level start_offset input lexbuf } | (newline) as s { + Lexing.new_line lexbuf; match kind with | Inline -> - warning - input - (Parse_error.not_allowed - ~what:(Token.describe (`Blank_line "\n")) - ~in_what:(Token.describe (math_constr kind ""))); + let not_allowed = + Parse_error.not_allowed + ~what:(Tokens.describe (Single_newline "\n")) + ~in_what:(Tokens.describe (math_constr input kind "" start_offset)) + in + warning lexbuf input not_allowed; Buffer.add_char buffer '\n'; math kind buffer nesting_level start_offset input lexbuf | Block -> @@ -710,12 +874,14 @@ and math kind buffer nesting_level start_offset input = parse math kind buffer nesting_level start_offset input lexbuf } | eof - { warning - input - (Parse_error.not_allowed - ~what:(Token.describe `End) - ~in_what:(Token.describe (math_constr kind ""))); - emit input (math_constr kind (Buffer.contents buffer)) ~start_offset } + { + let unexpected_eof = + Parse_error.end_not_allowed + ~in_what:(Tokens.describe (math_constr input kind "" start_offset)) + in + warning lexbuf input unexpected_eof; + math_constr input kind (Buffer.contents buffer) start_offset + } | _ as c { Buffer.add_char buffer c; math kind buffer nesting_level start_offset input lexbuf } @@ -726,20 +892,21 @@ and media tok_descr 
buffer nesting_level start_offset input = parse Buffer.contents buffer else begin Buffer.add_char buffer '}'; - media tok_descr buffer (nesting_level - 1) start_offset input lexbuf + media tok_descr buffer (pred nesting_level) start_offset input lexbuf end } | '{' { Buffer.add_char buffer '{'; - media tok_descr buffer (nesting_level + 1) start_offset input lexbuf } + media tok_descr buffer (succ nesting_level) start_offset input lexbuf } | ("\\{" | "\\}") as s { Buffer.add_string buffer s; media tok_descr buffer nesting_level start_offset input lexbuf } | eof { warning + lexbuf input (Parse_error.not_allowed - ~what:(Token.describe `End) + ~what:(Tokens.describe END) ~in_what:tok_descr); Buffer.contents buffer} | (newline) @@ -752,7 +919,7 @@ and media tok_descr buffer nesting_level start_offset input = parse and verbatim buffer last_false_terminator start_offset input = parse | (space_char as c) "v}" { Buffer.add_char buffer c; - emit_verbatim input start_offset buffer } + emit_verbatim lexbuf input start_offset buffer } | "v}" { Buffer.add_string buffer "v}"; @@ -762,19 +929,22 @@ and verbatim buffer last_false_terminator start_offset input = parse | eof { begin match last_false_terminator with | None -> - warning - input - (Parse_error.not_allowed - ~what:(Token.describe `End) - ~in_what:(Token.describe (`Verbatim ""))) + let not_allowed = + Parse_error.not_allowed + ~what:(Tokens.describe END) + ~in_what:(Tokens.describe (Verbatim {inner = ""; start = Loc.dummy_pos})) + in + warning lexbuf input not_allowed + | Some location -> warning + lexbuf input ~start_offset:location ~end_offset:(location + 2) Parse_error.no_trailing_whitespace_in_verbatim end; - emit_verbatim input start_offset buffer } + emit_verbatim lexbuf input start_offset buffer } | _ as c { Buffer.add_char buffer c; @@ -783,14 +953,15 @@ and verbatim buffer last_false_terminator start_offset input = parse and bad_markup_recovery start_offset input = parse - | [^ '}']+ as text '}' as rest - { let suggestion = - Printf.sprintf "did you mean '{!%s}' or '[%s]'?" text text in - warning - input - ~start_offset - (Parse_error.bad_markup ("{" ^ rest) ~suggestion); - emit input (`Code_span text) ~start_offset} + | [^ '}']+ as inner '}' as rest + { + let suggestion = + Printf.sprintf "did you mean '{!%s}' or '[%s]'?" inner inner + in + let bad_markup = Parse_error.bad_markup ("{" ^ rest) ~suggestion in + warning lexbuf input ~start_offset bad_markup; + Code_span { inner; start = input.offset_to_location start_offset } + } (* The second field of the metadata. 
This rule keeps whitespaces and newlines in the 'metadata' field except the @@ -801,45 +972,92 @@ and code_block_metadata_tail input = parse ((space_char* '[') as suffix) { let meta = - with_location_adjustments ~adjust_start_by:prefix ~adjust_end_by:suffix (fun _ -> Loc.at) input meta + with_location_adjustments ~adjust_start_by:prefix ~adjust_end_by:suffix (fun _ _ -> Loc.at) lexbuf input meta in - `Ok (Some meta) + Ok (Some meta) } | (newline | horizontal_space)* '[' - { `Ok None } + { Ok None } | _ as c - { `Invalid_char c } + { Error (`Invalid_char c) } | eof - { `Eof } + { Error `Eof } -and code_block allow_result_block start_offset content_offset metadata prefix delim input = parse +and code_block allow_result_block start_offset content_offset metadata buffer delimiter input = parse | ("]" (delim_char* as delim') "[") as terminator - { if delim = delim' && allow_result_block - then emit_code_block ~start_offset content_offset input metadata delim terminator prefix true + { if delimiter = delim' && allow_result_block then + emit_code_block + lexbuf + input + ~start_offset + ~content_offset + ~metadata + ~delimiter:(Some delimiter) + ~terminator + ~content:buffer + true else ( - Buffer.add_string prefix terminator; - code_block allow_result_block start_offset content_offset metadata - prefix delim input lexbuf + Buffer.add_string buffer terminator; + code_block + allow_result_block + start_offset + content_offset metadata + buffer + delimiter + input + lexbuf ) } | ("]" (delim_char* as delim') "}") as terminator { - if delim = delim' - then emit_code_block ~start_offset content_offset input metadata delim terminator prefix false + if delimiter = delim' then + emit_code_block + lexbuf + input + ~start_offset + ~content_offset + ~metadata + ~delimiter:(Some delimiter) + ~terminator + ~content:buffer + false else ( - Buffer.add_string prefix terminator; - code_block allow_result_block start_offset content_offset metadata - prefix delim input lexbuf + Buffer.add_string buffer terminator; + code_block + allow_result_block + start_offset + content_offset + metadata + buffer + delimiter + input + lexbuf ) } | eof { - warning input ~start_offset Parse_error.truncated_code_block; - emit_code_block ~start_offset content_offset input metadata delim "" prefix false + warning lexbuf input ~start_offset Parse_error.truncated_code_block; + emit_code_block + lexbuf + input + ~start_offset + ~content_offset + ~metadata + ~delimiter:(Some delimiter) + ~terminator:"" + ~content:buffer + false } | (_ as c) { - Buffer.add_char prefix c; - code_block allow_result_block start_offset content_offset metadata - prefix delim input lexbuf + Buffer.add_char buffer c; + code_block + allow_result_block + start_offset + content_offset + metadata + buffer + delimiter + input + lexbuf } diff --git a/src/parser/loc.ml b/src/parser/loc.ml index 46875a1f6f..938a8f024b 100644 --- a/src/parser/loc.ml +++ b/src/parser/loc.ml @@ -1,4 +1,9 @@ type point = { line : int; column : int } + +let make_point : Lexing.position -> point = + fun Lexing.{ pos_lnum; pos_cnum; pos_bol; _ } -> + { line = pos_lnum; column = pos_cnum - pos_bol } + type span = { file : string; start : point; end_ : point } type +'a with_location = { location : span; value : 'a } @@ -6,8 +11,39 @@ let at location value = { location; value } let location { location; _ } = location let value { value; _ } = value let map f annotated = { annotated with value = f annotated.value } +let is predicate { value; _ } = predicate value let same annotated value = { 
annotated with value } +let dummy_pos : point = { line = -1; column = -1 } + +let of_position : ?filename:string -> Lexing.position * Lexing.position -> span + = + fun ?filename (start, end_) -> + let start_point = make_point start and end_point = make_point end_ in + { + file = Option.value ~default:start.pos_fname filename; + start = start_point; + end_ = end_point; + } + +let extract : + input:string -> + start_pos:Lexing.position -> + end_pos:Lexing.position -> + string = + fun ~input ~start_pos ~end_pos -> + String.sub input start_pos.pos_cnum (end_pos.pos_cnum - start_pos.pos_cnum) + +let fmt { file; start; end_ } = + let { line = sline; column = scol } = start + and { line = eline; column = ecol } = end_ in + Printf.sprintf + "file: %s\nstart: { line : %d, col : %d }\nend: { line: %d; col: %d }" file + sline scol eline ecol + +let with_start_location : span -> 'a with_location -> 'a with_location = + fun { start; _ } self -> { self with location = { self.location with start } } + let span spans = match spans with | [] -> @@ -20,12 +56,22 @@ let span spans = let last = List.fold_left (fun _ span -> span) first spans in { file = first.file; start = first.start; end_ = last.end_ } +let delimited : 'a with_location -> 'b with_location -> span = + fun { location = startpos; _ } { location = endpos; _ } -> + { file = startpos.file; start = startpos.start; end_ = endpos.end_ } + let nudge_start offset span = { span with start = { span.start with column = span.start.column + offset } } let nudge_end offset span = { span with end_ = { span.end_ with column = span.end_.column - offset } } +let nudge_map_start offset loc = + { loc with location = nudge_start offset loc.location } + +let nudge_map_end offset loc = + { loc with location = nudge_end offset loc.location } + let spans_multiple_lines = function | { location = @@ -33,3 +79,52 @@ let spans_multiple_lines = function _; } -> end_line > start_line + +let map_location : (span -> span) -> 'a with_location -> 'a with_location = + fun f located -> { located with location = f located.location } + +(* Utilities for offsetting the locations of various AST nodes to be more accurate *) + +let _offset_inline_elt located = + let f = + match located.value with + | `Word _ | `Space _ -> Fun.id + | `Code_span _ -> nudge_start (-1) + | `Raw_markup (_, _) | `Styled _ | `Math_span _ -> nudge_start (-3) + | `Reference (`Simple, _, _) -> nudge_start (-2) + | `Reference (`With_text, ref, _) -> + let ref = value ref in + nudge_start (-(String.length ref + 4)) + | `Link (_, []) -> nudge_start (-2) + | `Link (link, _ :: _) -> nudge_start (-(String.length link + 3)) + in + map_location f located + +(* +and nestable_block_element = + [ `Paragraph of inline_element with_location list + | `Code_block of code_block + | `Verbatim of string + | `Modules of string with_location list + | `List of + list_kind * list_syntax * nestable_block_element with_location list list + | `Table of table + | `Math_block of string (** @since 2.0.0 *) + | `Media of reference_kind * media_href with_location * string * media + (** @since 3.0.0 *) ] +*) + +let _offset_block_elt located = + let f = + match located.value with + | `Paragraph children -> + let loc = span @@ List.map location children in + Fun.const loc + | `Verbatim _ -> nudge_start (-2) + | `Modules _ -> nudge_start @@ -String.length "{!modules:" + | `List (`Heavy, _, _) -> nudge_start (-4) + | `Table (_, `Heavy) -> nudge_start @@ -String.length "{table" + | `Math_block _ -> nudge_start @@ -String.length "{math" + | _ -> 
Fun.id + in + map_location f located diff --git a/src/parser/loc.mli b/src/parser/loc.mli index ad8dedb77a..e9d8284f45 100644 --- a/src/parser/loc.mli +++ b/src/parser/loc.mli @@ -8,6 +8,9 @@ type point = { line : int; column : int } (** A specific character *) +val make_point : Lexing.position -> point +val dummy_pos : point + type span = { file : string; start : point; end_ : point } (** A range of characters between [start] and [end_] in a particular file *) @@ -28,9 +31,29 @@ val nudge_end : int -> span -> span type +'a with_location = { location : span; value : 'a } (** Describes values located at a particular span *) +val fmt : span -> string +(** Format a `span` and return the resulting string *) + +val delimited : 'a with_location -> 'b with_location -> span +(** Returns the span which contains both arguments *) + +val with_start_location : span -> 'a with_location -> 'a with_location +(** [with_start_location start t] return t with it's starting point = start *) + +val nudge_map_start : int -> 'a with_location -> 'a with_location +val nudge_map_end : int -> 'a with_location -> 'a with_location + val at : span -> 'a -> 'a with_location (** Constructor for {!with_location} *) +val of_position : ?filename:string -> Lexing.position * Lexing.position -> span +(** Convert Menhir's `$loc` or `$sloc` into a span *) + +val extract : + input:string -> start_pos:Lexing.position -> end_pos:Lexing.position -> string +(** Given a starting positon and an ending position extract the portion of the + input text between those positions *) + val location : 'a with_location -> span (** Returns the location of a located value *) @@ -40,6 +63,9 @@ val value : 'a with_location -> 'a val map : ('a -> 'b) -> 'a with_location -> 'b with_location (** Map over a located value without changing its location *) +val is : ('a -> bool) -> 'a with_location -> bool +(** Test a located value with a predicate *) + val same : _ with_location -> 'b -> 'b with_location (** [same x y] retuns the value y wrapped in a {!with_location} whose location is that of [x] *) diff --git a/src/parser/odoc_parser.ml b/src/parser/odoc_parser.ml index 2d9fdd5de1..1bb5419db4 100644 --- a/src/parser/odoc_parser.ml +++ b/src/parser/odoc_parser.ml @@ -79,6 +79,19 @@ let offset_to_location : in scan_to_last_newline reversed_newlines +module Tester = struct + include Parser + module Lexer = Lexer + type token = Tokens.token + let is_EOI = function Tokens.END -> true | _ -> false + let pp_warning = Warning.to_string + let run = Writer.run + let reversed_newlines = reversed_newlines + let offset_to_location = offset_to_location + let string_of_token = Tokens.describe + let default_token = Tokens.Word "" +end + (* Given a Loc.point and the result of [parse_comment], this function returns a valid Lexing.position *) let position_of_point : t -> Loc.point -> Lexing.position = @@ -100,21 +113,33 @@ let position_of_point : t -> Loc.point -> Lexing.position = { Lexing.pos_bol; pos_lnum; pos_cnum; pos_fname } (* The main entry point for this module *) -let parse_comment ~location ~text = - let warnings = ref [] in +let parse_comment : location:Lexing.position -> text:string -> t = + fun ~location ~text -> let reversed_newlines = reversed_newlines ~input:text in - let token_stream = - let lexbuf = Lexing.from_string text in - let offset_to_location = - offset_to_location ~reversed_newlines ~comment_location:location - in - let input : Lexer.input = - { file = location.Lexing.pos_fname; offset_to_location; warnings; lexbuf } - in - Stream.from (fun 
_token_index -> Some (Lexer.token input lexbuf)) + let lexbuf = Lexing.from_string text in + (* We cannot directly pass parameters to Menhir without converting our parser + to a module functor. So we pass our current filename to the lexbuf here *) + Lexing.( + set_filename lexbuf location.pos_fname; + set_position lexbuf location); + let lexer_state = + Lexer. + { + warnings = []; + offset_to_location = + offset_to_location ~reversed_newlines ~comment_location:location; + file = Lexing.(location.pos_fname); + } + in + let ast, warnings = + Writer.run ~input:text @@ Parser.main (Lexer.token lexer_state) lexbuf in - let ast, warnings = Syntax.parse warnings token_stream in - { ast; warnings; reversed_newlines; original_pos = location } + { + ast; + warnings = warnings @ lexer_state.warnings; + reversed_newlines; + original_pos = location; + } (* Accessor functions, as [t] is opaque *) let warnings t = t.warnings diff --git a/src/parser/odoc_parser.mli b/src/parser/odoc_parser.mli index 5dbd4a0815..ccf6959e0b 100644 --- a/src/parser/odoc_parser.mli +++ b/src/parser/odoc_parser.mli @@ -19,6 +19,31 @@ val parse_comment : location:Lexing.position -> text:string -> t module Ast = Ast module Loc = Loc +module Tester : sig + type token = Tokens.token + module Lexer : sig + type input = { + file : string; + offset_to_location : int -> Loc.point; + mutable warnings : Warning.t list; + } + + val token : input -> Lexing.lexbuf -> Parser.token + end + val is_EOI : token -> bool + val pp_warning : Warning.t -> string + val reversed_newlines : input:string -> (int * int) list + val offset_to_location : + reversed_newlines:(int * int) list -> + comment_location:Lexing.position -> + int -> + Loc.point + val run : input:string -> Ast.t Writer.t -> Ast.t * Warning.t list + val main : (Lexing.lexbuf -> token) -> Lexing.lexbuf -> Ast.t Writer.t + val string_of_token : token -> string + val default_token : token +end + (** Warnings produced during parsing. *) module Warning : sig type t = Warning.t = { location : Loc.span; message : string } diff --git a/src/parser/parse_error.ml b/src/parser/parse_error.ml index a07a8a24bb..674713ca3d 100644 --- a/src/parser/parse_error.ml +++ b/src/parser/parse_error.ml @@ -1,5 +1,13 @@ let capitalize_ascii = Astring.String.Ascii.capitalize +let illegal : ?in_what:string -> string -> Loc.span -> Warning.t = + fun ?in_what illegal_section span -> + match in_what with + | Some ctx -> + Warning.make "Illegal character or syntax '%s' in %s" illegal_section ctx + span + | None -> Warning.make "Illegal character or syntax '%s'" illegal_section span + let bad_markup : ?suggestion:string -> string -> Loc.span -> Warning.t = fun ?suggestion -> Warning.make ?suggestion "'%s': bad markup." 
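
The new parser.mly below threads every semantic action through a `Writer.t` value, which pairs the parsed node with the warnings accumulated while building it. The `Writer` module itself is not part of the hunks shown here, so the following is only a rough sketch of the interface the semantic actions appear to assume, inferred from the call sites below (`Writer.map`, `Writer.bind`, `Writer.return_warning`, `Writer.run`, ...); the actual module in this patch may differ.

```ocaml
(* Sketch only: an approximation, inferred from usage in parser.mly and
   odoc_parser.ml, of the Writer interface; not the real module. *)
module type WRITER_SKETCH = sig
  type +'a t
  (** A parsed value together with the warnings accumulated while building it. *)

  (** Some warnings need the original input text in order to quote the
      offending source span (compare [Loc.extract]); those appear to be
      deferred until [run]. *)
  type warning =
    | Warning of Warning.t
    | InputNeeded of (string -> Warning.t)

  val return : 'a -> 'a t
  val get : 'a t -> 'a
  val map : f:('a -> 'b) -> 'a t -> 'b t
  val bind : 'a t -> f:('a -> 'b t) -> 'b t

  val warning : warning -> 'a t -> 'a t
  (** Attach a warning to an existing computation. *)

  val return_warning : 'a -> warning -> 'a t
  val ensure : ('a -> bool) -> warning -> 'a t -> 'a t

  val run : input:string -> 'a t -> 'a * Warning.t list
  (** Resolve any deferred warnings against [input] and return the parsed
      value along with all collected warnings; [Odoc_parser.parse_comment]
      calls this on the result of [Parser.main]. *)
end
```

For instance, in the grammar below `Writer.return_warning node warning` is the recurring pattern for producing a node while also recording that something was wrong with its syntax.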
diff --git a/src/parser/parser.mly b/src/parser/parser.mly new file mode 100644 index 0000000000..ea1dd2c9c0 --- /dev/null +++ b/src/parser/parser.mly @@ -0,0 +1,925 @@ +%{ + open Parser_aux + open Writer.Prelude +%} + +%token RIGHT_BRACE "{" +%token RIGHT_CODE_DELIMITER "{[" + +%token Blank_line +%token Single_newline +%token Space " " + +%token Word (* Any space-delmited text *) + +%token MINUS "-" +%token PLUS "+" + +%token Style "{i" (* or '{b' etc *) + +(* or '{C' or '{R', but this syntax has been deprecated *) +%token Paragraph_style "{L" + +%token MODULES "{!modules:" + +%token Math_span "{m" +%token Math_block "{math" + +%token Verbatim "{v" + +%token <(string option * string) Tokens.with_start_point> Raw_markup "{%%}" + +%token Code_block "{[]}" (* Self-contained code-block *) +%token Code_block_with_output "{[][" (* Code block that expects some block elements *) +%token Code_span "[]" + +%token List "{ol" (* or '{ul' *) +%token LI "{li" +%token DASH "{-" + +%token TABLE_LIGHT "{t" +%token TABLE_HEAVY "{table" +%token TABLE_ROW "{tr" +%token BAR "|" + +%token Table_cell "{td" (* or '{th' for header *) + +(* Where N is an integer *) +%token <(int * string option) Tokens.with_start_point> Section_heading "{N:" + +(* Tags *) +%token Tag +%token Tag_with_content + +(* Links and references *) +%token Simple_ref "{!" +%token Ref_with_replacement "{{!" +%token Simple_link "{:" +%token Link_with_replacement "{{:" +%token <(Tokens.media * Tokens.media_target) Tokens.with_start_point> Media "{(format)!" +%token <(Tokens.media * Tokens.media_target * string) Tokens.with_start_point> Media_with_replacement "{(format):" (* where 'format' is audio, video, image *) + +%token END + +%type tag_bare +%type tag_with_content +%type tag + +%on_error_reduce nestable_block_element(paragraph) +%on_error_reduce tag_with_content +%on_error_reduce section_heading + +%start main + + +%% +(* ENTRY *) + +(* Consume any leading whitespace *) +let main := + | any_whitespace*; ~ = sequence_nonempty(toplevel); END; <> + | any_whitespace*; END; { return [] } + +let toplevel := + | block = nestable_block_element(paragraph); any_whitespace*; { (block :> Ast.block_element Loc.with_location Writer.t) } + | t = tag; line_break?; { Writer.map ~f:(fun loc -> Loc.{ loc with value = `Tag loc.value }) t } + | ~ = section_heading; line_break*; <> + | ~ = toplevel_error; line_break*; <> + +(* Tokens which cannot begin any block element *) +(* TODO: This should have cases for block elements on the same line: + nestable_block_element -> section_heading + nestable_block_element -> nestable_block_element + nestable_block_element -> tag + etc + To do this we will need to refactor so that this returns a list of elements, + and `toplevel` is an explicit left-recursive list rule + (see Menhir's `list` impl) that can concat those lists together +*) +let toplevel_error := + (* Stray heavy list items, `{li` or `{-` *) + | err = located(item_open); children = sequence_nonempty(inline_element(horizontal_whitespace)); endpos = located(RIGHT_BRACE)?; { + let default = + Writer.get children + |> List.rev + |> List.hd + |> Loc.map (Fun.const ()) + in + let endloc = Option.value ~default endpos in + let span = Loc.delimited err endloc in + let not_allowed = Writer.Warning ( + let what = Tokens.describe err.Loc.value in + let in_what = "top-level text" in + let suggestion = + Printf.sprintf + "Move %s into %s or %s" + what + (Tokens.describe @@ List Ordered) + (Tokens.describe @@ List Unordered) + in + Parse_error.not_allowed ~what ~in_what 
~suggestion span) + in + let unclosed = Writer.Warning ( + Parse_error.unclosed_bracket ~bracket:(Tokens.print err.Loc.value) span) + in + Writer.bind children ~f:(fun c -> + let inner = { (paragraph c :> Ast.block_element Loc.with_location) with location = span } in + let m = Writer.return_warning inner not_allowed in + if Option.is_some endpos then m + else + Writer.warning unclosed m) + } + | errloc = position(RIGHT_BRACE); whitespace?; { + let span = Loc.of_position errloc in + let warning = + let what = Tokens.describe RIGHT_BRACE in + Writer.Warning (Parse_error.bad_markup what span) + in + let as_text = Loc.at span @@ `Word "}" in + let node = (Loc.same as_text @@ `Paragraph [ as_text ]) in + Writer.return_warning node warning + } + | errloc = position(RIGHT_CODE_DELIMITER); { + let span = Loc.of_position errloc in + let warning = + let what = Tokens.describe RIGHT_BRACE in + Writer.Warning (Parse_error.bad_markup what span) + in + let as_text = Loc.at span @@ `Word "{" in + let node = Loc.same as_text @@ `Paragraph [ as_text ] in + Writer.return_warning node warning + } + +(* SECTION HEADING *) + +let section_heading := + | content = Section_heading; children = sequence_nonempty(inline_element(whitespace)); endpos = located(RIGHT_BRACE); { + let Tokens.{ inner = (num, title); start } = content in + let span = { endpos.Loc.location with start } in + Writer.map ~f:(fun c -> Loc.at span @@ `Heading (num, title, c)) children + } + | content = Section_heading; endpos = located(RIGHT_BRACE); { + let Tokens.{ inner = (num, title); start } = content in + let span = { endpos.Loc.location with start } in + let should_not_be_empty = + let what = Tokens.describe @@ Section_heading content in + Writer.Warning (Parse_error.should_not_be_empty ~what span) + in + let node = Loc.at span @@ `Heading (num, title, []) in + Writer.return_warning node should_not_be_empty + } + | content = with_position(Section_heading); end_pos = position(error); { + let Tokens.{ inner; start }, start_pos = content in + let (num, title) = inner in + let end_ = Loc.make_point @@ snd end_pos in + let file = (fst end_pos).Lexing.pos_fname in + let span = Loc.{ file; start; end_; } in + let start_pos = fst start_pos and end_pos = snd end_pos in + let illegal = Writer.InputNeeded (fun input -> + let err = Loc.extract ~input ~start_pos ~end_pos in + let in_what = Tokens.describe @@ Section_heading (fst content) in + Parse_error.illegal ~in_what err span) + in + let inner = Loc.at span @@ `Heading (num, title, []) in + Writer.return_warning inner illegal + } + +(* TAGS *) + +let tag := + | with_content = tag_with_content; { with_content } + | bare = tag_bare; { bare } + +let tag_with_content := tag = located(Tag_with_content); children = sequence(nestable_block_element(paragraph)); { + Writer.map children ~f:(fun children -> + let Loc.{ value; location } = tag in + let start = + tag_with_content_start_point value + |> Option.map (fun start -> { location with start }) + |> Option.value ~default:location + in + let span = Loc.span @@ start :: List.map Loc.location children in + Loc.at span @@ tag_with_content children value) + } + | tag = located(Tag_with_content); horizontal_whitespace?; { + return @@ { tag with Loc.value = tag_with_content [] tag.Loc.value } + } + (* TODO: (@FayCarsons) Right now this is the only way to accept a newline + after a tag_with_content, adding an optional newline causes unsolvable + reduce conflicts. 
+ Maybe if the line break/whitespace handling for nestable block element were + refactored, we could remove this *) + | tag = located(Tag_with_content); Single_newline; children = sequence(nestable_block_element(paragraph)); { + Writer.map children ~f:(fun children -> + let Loc.{ value; location } = tag in + let start = tag_with_content_start_point value |> Option.map (fun start -> { location with start }) |> Option.value ~default:location in + let span = Loc.span @@ start :: List.map Loc.location children in + Loc.at span @@ tag_with_content children value) + } + +let tag_bare := tag = located(Tag); horizontal_whitespace?; { + return @@ Loc.map (Fun.const @@ tag_bare tag) tag +} + +(* INLINE ELEMENTS *) + +let inline_element(ws) := + | ~ = inline_element_without_whitespace; <> + | s = located(ws); { return s } + +let inline_element_without_whitespace := + (* Single token inline elements which are mostly handled in the lexer *) + | content = located(Code_span); { + let Loc.{ value = Tokens.{ inner; start }; location } = content in + return @@ Loc.at { location with start } (`Code_span inner) + } + | m = located(Raw_markup); { + let Loc.{ value = Tokens.{ inner; start }; location } = m in + return @@ Loc.at { location with start } (`Raw_markup inner) + } + | w = located(Word); { return @@ Loc.map (fun w -> `Word w) w } + | m = located(Math_span); { + let Loc.{ value = Tokens.{ start; inner }; location } = m in + return @@ Loc.at { location with start } (`Math_span inner) + } + (* More complex/recursive inline elements should have their own rule *) + | ~ = style; <> + | ~ = reference; <> + | ~ = link; <> + +let style := + | style = located(Style); children = sequence(inline_element(whitespace)); endpos = located(RIGHT_BRACE); { + let span = Loc.delimited style endpos in + let style = style.Loc.value in + let warning = + let what = Tokens.describe @@ Style style in + Writer.Warning (Parse_error.should_not_be_empty ~what span) + in + Writer.ensure not_empty warning children + |> Writer.map ~f:(fun c -> Loc.at span @@ `Styled (ast_style style, trim_start c)) + } + | style = located(Style); endpos = located(RIGHT_BRACE); { + let span = Loc.delimited style endpos in + let style = style.Loc.value in + let warning = + let what = Tokens.describe @@ Style style in + Writer.Warning (Parse_error.should_not_be_empty ~what span) + in + let inner = Loc.at span @@ `Styled (ast_style style, []) in + Writer.return_warning inner warning + } + | style = located(Style); endpos = located(RIGHT_CODE_DELIMITER); { + let span = Loc.delimited style endpos in + let style = style.Loc.value in + let style_desc = Tokens.describe @@ Style style in + let not_allowed = + let what = Tokens.describe RIGHT_CODE_DELIMITER in + Writer.Warning (Parse_error.not_allowed ~what ~in_what:style_desc span) + in + let should_not_be_empty = + Writer.Warning (Parse_error.should_not_be_empty ~what:style_desc span) + in + let inner = Loc.at span @@ `Styled (ast_style style, []) in + return inner + |> Writer.warning not_allowed + |> Writer.warning should_not_be_empty + } + | style = located(Style); errloc = position(error); { + let span = Loc.span [style.Loc.location; Loc.of_position errloc] in + let illegal = Writer.InputNeeded (fun input -> + let in_what = Tokens.describe @@ Style style.Loc.value in + let (start_pos, end_pos) = errloc in + let illegal_section = Loc.extract ~input ~start_pos ~end_pos in + Parse_error.illegal ~in_what illegal_section span) + in + let inner = Loc.at span @@ `Styled (ast_style style.Loc.value, []) in + 
Writer.return_warning inner illegal + } + | style = located(Style); endpos = located(END); { + let span = Loc.delimited style endpos in + let style = style.Loc.value in + let warning = + let in_what = Tokens.describe @@ Style style in + Writer.Warning (Parse_error.end_not_allowed ~in_what span) + in + let inner = Loc.at span @@ `Styled (ast_style style, []) in + Writer.return_warning inner warning + } + +(* LINKS + REFS *) + +let reference := + | content = located(Simple_ref); { + let Loc.{ value = Tokens.{ inner; start }; location } = content in + let span = { location with start } in + return @@ Loc.at span @@ `Reference (`Simple, inner, []) + } + | content = Ref_with_replacement; children = sequence_nonempty(inline_element(whitespace)); endpos = located(RIGHT_BRACE); { + Writer.bind children ~f:(fun c -> + let Tokens.{ inner; start } = content in + let span = { endpos.Loc.location with start } in + return @@ Loc.at span @@ `Reference (`With_text, inner, trim_start c)) + } + | content = Ref_with_replacement; endpos = located(RIGHT_BRACE); { + let Tokens.{ inner; start } = content in + let span = { endpos.Loc.location with start } in + let node = Loc.at span @@ `Reference (`With_text, inner, []) in + let warning = + let what = Tokens.describe @@ Ref_with_replacement content in + Writer.Warning (Parse_error.should_not_be_empty ~what span) + in + Writer.return_warning node warning + } + | content = Ref_with_replacement; children = sequence_nonempty(inline_element(whitespace))?; endpos = located(END); { + let Tokens.{ inner; start } = content in + let span = { endpos.Loc.location with start } in + let not_allowed = + let in_what = Tokens.describe (Ref_with_replacement content) in + Writer.Warning (Parse_error.end_not_allowed ~in_what span) + in + let* children = Option.value ~default:(return []) children in + let node = Loc.at span @@ `Reference (`With_text, inner, children) in + Writer.return_warning node not_allowed + } + | content = Ref_with_replacement; endpos = located(END); { + let Tokens.{ inner; start } = content in + let span = { endpos.Loc.location with start } in + let in_what = Tokens.describe @@ Ref_with_replacement content in + let end_not_allowed = Writer.Warning (Parse_error.end_not_allowed ~in_what span) in + `Reference (`With_text, inner, []) + |> Loc.at span + |> return + |> Writer.warning end_not_allowed + } + +let link := + | content = located(Simple_link); { + let Loc.{ value = Tokens.{ inner; start }; location } = content in + let span = { location with start } in + let node = Loc.at span @@ `Link (inner, []) in + let url = String.trim inner in + if "" = url then + let what = Tokens.describe @@ Simple_link content.Loc.value in + let warning = + Writer.Warning (Parse_error.should_not_be_empty ~what span) + in + Writer.return_warning node warning + else + return node + } + | content = Link_with_replacement; children = sequence_nonempty(inline_element(whitespace)); endpos = located(RIGHT_BRACE); { + Writer.bind children ~f:(fun c -> + let Tokens.{ inner; start } = content in + let span = { endpos.Loc.location with start } in + let node = Loc.at span @@ `Link (inner, c) in + if "" = inner then + let what = Tokens.describe @@ Link_with_replacement content in + let warning = + Writer.Warning (Parse_error.should_not_be_empty ~what span) + in + Writer.return_warning node warning + else + return node) + } + | content = Link_with_replacement; endpos = located(RIGHT_BRACE); { + let Tokens.{ inner; start } = content in + let span = { endpos.Loc.location with start } in + let node 
= Loc.at span @@ `Link (inner, []) in + let what = Tokens.describe @@ Link_with_replacement content in + let should_not_be_empty = + Writer.Warning (Parse_error.should_not_be_empty ~what span) + in + Writer.return_warning node should_not_be_empty + } + | content = Link_with_replacement; endpos = located(END); { + let Tokens.{ inner; start } = content in + let span = { endpos.Loc.location with start } in + let in_what = Tokens.describe @@ Link_with_replacement content in + let end_not_allowed = Writer.Warning (Parse_error.end_not_allowed ~in_what span) in + `Link (inner, []) + |> Loc.at span + |> return + |> Writer.warning end_not_allowed + } + +(* LIST *) + +let list_light_start := + | MINUS; { Tokens.MINUS } + | PLUS; { Tokens.PLUS } + +let light_list_paragraph_item := + | ~ = inline_element(whitespace); <> + | ~ = symbol_as_word(bar); <> +let paragraph_no_list_symbols := horizontal_whitespace?; x = inline_element_without_whitespace; xs = sequence(light_list_paragraph_item); { + paragraph <$> Writer.map2 ~f:List.cons x xs +} + +let list_light_item := + | start = located(list_light_start); horizontal_whitespace?; item = nestable_block_element(paragraph_no_list_symbols); { + let Loc.{ value; location } = start in + Writer.map ~f:(fun item -> + (Loc.span [location; item.Loc.location], item) + ) (light_list_item value <$> item) + } + | horizontal_whitespace; start = located(list_light_start); item = nestable_block_element(paragraph_no_list_symbols); { + let should_begin_on_its_own_line = + let span = Loc.of_position $sloc in + Writer.Warning (Parse_error.should_begin_on_its_own_line ~what:(Tokens.describe MINUS) span) + in + let Loc.{ value; location } = start in + Writer.map ~f:(fun item -> + (Loc.span [location; item.Loc.location], item) + ) (light_list_item value <$> item) + |> Writer.warning should_begin_on_its_own_line + } + +let list_light := + | children = sequence_separated_nonempty(whitespace*, list_light_item); { + Writer.map children ~f:(fun children -> + let spans, children = List.split children in + let span = Loc.span spans in + let list_kind, children = split_light_list_items children in + Loc.at span @@ `List (list_kind, `Light, [ children ])) + } + | children = sequence_separated_nonempty(whitespace*, list_light_item); errpos = position(error); { + Writer.bind children ~f:(fun children -> + let spans, children = List.split children in + + let start_pos, end_pos = errpos in + let errloc = Loc.of_position errpos in + let span = Loc.span (spans @ [errloc]) in + let list_kind, children = split_light_list_items children in + + let illegal = Writer.InputNeeded (fun input -> + let error_text = Loc.extract ~input ~start_pos ~end_pos in + let in_what = Tokens.describe @@ + match list_kind with `Ordered -> PLUS | `Unordered -> MINUS + in + Parse_error.illegal ~in_what error_text span) + in + `List (list_kind, `Light, [ children ]) + |> Loc.at span + |> return + |> Writer.warning illegal) + } + | start = located(list_light_start); horizontal_whitespace?; errpos = position(error); { + let Loc.{ value; location } = start in + let list_kind = + match value with + | PLUS -> `Ordered + | MINUS -> `Unordered + | _ -> assert false (* unreachable *) + in + let errloc = Loc.of_position errpos in + let span = Loc.span [location; errloc] in + let illegal = Writer.InputNeeded (fun input -> + let (start_pos, end_pos) = errpos in + let error_text = Loc.extract ~input ~start_pos ~end_pos in + let in_what = Tokens.describe value in + Parse_error.illegal ~in_what error_text span) + in + `List 
(list_kind, `Light, []) + |> Loc.at span + |> return + |> Writer.warning illegal + } + + +let item_open := + | LI; { Tokens.LI } + | DASH; { Tokens.DASH } + +let item_heavy := +| startpos = located(item_open); any_whitespace*; items = sequence(nestable_block_element(paragraph)); any_whitespace*; endpos = located(RIGHT_BRACE); { + let span = Loc.delimited startpos endpos in + let should_not_be_empty = + Writer.Warning (Parse_error.should_not_be_empty ~what:(Tokens.describe LI) span) + in + Writer.ensure not_empty should_not_be_empty items + } + | startpos = located(item_open); any_whitespace*; items = sequence(nestable_block_element(paragraph))?; any_whitespace*; endpos = located(END); { + let end_not_allowed = + Writer.Warning (Parse_error.end_not_allowed ~in_what:(Tokens.describe DASH) endpos.Loc.location) + in + match items with + | Some writer -> + Writer.warning end_not_allowed writer + | None -> + let span = Loc.delimited startpos endpos in + let should_not_be_empty = + Writer.Warning (Parse_error.should_not_be_empty ~what:(Tokens.describe DASH) span) + in + Writer.return_warning [] should_not_be_empty + |> Writer.warning end_not_allowed + } + +let list_heavy := + | list_kind = located(List); whitespace*; items = sequence(item_heavy); whitespace*; endpos = located(RIGHT_BRACE); { + let span = Loc.delimited list_kind endpos in + let should_not_be_empty = + let what = Tokens.describe @@ List list_kind.Loc.value in + Writer.Warning (Parse_error.should_not_be_empty ~what span) + in + Writer.ensure not_empty should_not_be_empty items + |> Writer.bind ~f:(fun items -> + `List (Tokens.ast_list_kind list_kind.Loc.value, `Heavy, items) + |> Loc.at span + |> return) + } + | list_kind = located(List); whitespace*; items = sequence_nonempty(item_heavy); errloc = position(error); { + let span = Loc.(span [list_kind.location; of_position errloc]) in + let illegal = Writer.InputNeeded (fun input -> + let (start_pos, end_pos) = errloc in + let illegal_input = Loc.extract ~input ~start_pos ~end_pos in + let in_what = Tokens.describe @@ List list_kind.Loc.value in + Parse_error.illegal ~in_what illegal_input span) + in + let* items : Ast.nestable_block_element Loc.with_location list list = Writer.warning illegal items in + let inner = Loc.at span @@ `List (Tokens.ast_list_kind list_kind.Loc.value, `Heavy, items) in + return inner + } + | list_kind = located(List); errloc = position(error); { + let span = Loc.(span [list_kind.location; of_position errloc]) in + let illegal = Writer.InputNeeded (fun input -> + let (start_pos, end_pos) = errloc in + let illegal_input = Loc.extract ~input ~start_pos ~end_pos in + let in_what = Tokens.describe (List list_kind.Loc.value) in + Parse_error.illegal ~in_what illegal_input span) + in + let inner = Loc.at span @@ `List (Tokens.ast_list_kind list_kind.Loc.value, `Heavy, []) in + Writer.return_warning inner illegal + } + +let odoc_list := + | ~ = list_light; <> + | ~ = list_heavy; <> + +(* TABLES *) + +let cell_heavy := + | cell_kind = Table_cell; children = sequence_nonempty(nestable_block_element(paragraph)); RIGHT_BRACE; whitespace*; + { Writer.map ~f:(fun c -> (c, cell_kind)) children } + | cell_kind = Table_cell; RIGHT_BRACE; whitespace*; + { return ([], cell_kind) } + | cell_kind = Table_cell; children = sequence_nonempty(nestable_block_element(paragraph))?; errloc = position(error); { + let illegal = Writer.InputNeeded (fun input -> + let (start_pos, end_pos) as loc = errloc in + let illegal_input = Loc.extract ~input ~start_pos ~end_pos in + let span = 
Loc.of_position loc in + let in_what = Tokens.describe @@ Table_cell cell_kind in + Parse_error.illegal ~in_what illegal_input span) + in + Option.value ~default:(return []) children + |> Writer.map ~f:(fun c -> (c, cell_kind)) + |> Writer.warning illegal + } + +let row_heavy := + | TABLE_ROW; whitespace*; ~ = sequence_nonempty(cell_heavy); RIGHT_BRACE; whitespace*; <> + | TABLE_ROW; whitespace*; RIGHT_BRACE; whitespace*; { return [] } + | TABLE_ROW; children = sequence_nonempty(cell_heavy)?; errloc = position(error); { + let illegal = Writer.InputNeeded (fun input -> + let (start_pos, end_pos) as loc = errloc in + let illegal_input = Loc.extract ~input ~start_pos ~end_pos in + let span = Loc.of_position loc in + let in_what = Tokens.describe TABLE_ROW in + Parse_error.illegal ~in_what illegal_input span) + in + Option.value ~default:(return []) children + |> Writer.warning illegal + } + +let table_heavy := + | grid = delimited_location(TABLE_HEAVY, whitespace*; sequence_nonempty(row_heavy), RIGHT_BRACE); { + Loc.map (fun grid -> `Table ((grid, None), `Heavy)) <$> (Writer.sequence_loc grid) + } + | startpos = located(TABLE_HEAVY); endpos = located(RIGHT_BRACE); { + let span = Loc.(span [startpos.location; endpos.location]) in + let inner = Loc.at span @@ `Table (([], None), `Heavy) in + return inner + } + | startpos = located(TABLE_HEAVY); whitespace*; grid = sequence_nonempty(row_heavy)?; errloc = position(error); { + let illegal = Writer.InputNeeded (fun input -> + let (start_pos, end_pos) as loc = errloc in + let illegal_input = Loc.extract ~input ~start_pos ~end_pos in + let span = Loc.of_position loc in + let in_what = Tokens.describe TABLE_HEAVY in + Parse_error.illegal ~in_what illegal_input span) + in + let span = Loc.(span [startpos.location; (Loc.of_position errloc)]) in + Option.value ~default:(return []) grid + |> Writer.map ~f:(fun grid -> Loc.at span @@ `Table ((grid, None), `Heavy)) + |> Writer.warning illegal + } + +(* LIGHT TABLE *) + +let cell_inner := + | ~ = inline_element(horizontal_whitespace); <> + | s = located(symbols_without_bar); { + return @@ Loc.map (fun w -> `Word w) s + } + | (start_pos, end_pos) = position(error); { + let span = Loc.of_position (start_pos, end_pos) in + let illegal = Writer.InputNeeded (fun input -> + let text_span = Loc.extract ~start_pos ~end_pos ~input in + Parse_error.illegal ~in_what:(Tokens.describe TABLE_LIGHT) text_span span) + in + (* TODO: (@FayCarsons) + This is the best we can do right now. Accepting a `nestable_block_element`, + for example, causes a reduce/reduce conflict. + So we have to lose some information(what the invalid element was) via the + `error` keyword and return an empty word. 
+ Maybe if we refactored the way that block elements/tags/sections handle + whitespace we could remove the conflict while still being able to match on those + elements + *) + Writer.return_warning (Loc.at span @@ `Word "") illegal + } + +let cell_content_light := ~ = sequence_nonempty(cell_inner); <> + +let cell := + | ~ = cell_content_light; <> + | ~ = cell_content_light; BAR; <> + | BAR; { return [] } + +let cells := BAR?; ~ = sequence_nonempty(cell); <> + +let row_light := + | ~ = cells; <> + | ~ = cells; Single_newline; <> + | ~ = cells; Single_newline; Space; <> + | ~ = cells; Blank_line; <> + | ~ = cells; Blank_line; Single_newline; <> + +let rows_light := ~ = sequence_nonempty(row_light); <> + +let table_start_light := startpos = located(TABLE_LIGHT); any_whitespace*; { startpos } +let table_light := + | startpos = table_start_light; data = rows_light; endpos = located(RIGHT_BRACE); { + let span = Loc.delimited startpos endpos in + construct_table ~span <$> data + } + | startpos = table_start_light; endpos = located(RIGHT_BRACE); { + let span = Loc.delimited startpos endpos in + let inner = Loc.at span @@ `Table (([[]], None), `Light) in + return inner + } + | startpos = table_start_light; data = rows_light; endpos = located(END); { + let in_what = Tokens.describe TABLE_LIGHT in + let warning = + let span = Loc.of_position $sloc in + Writer.Warning (Parse_error.end_not_allowed ~in_what span) + in + let span = Loc.delimited startpos endpos in + unclosed_table ~span ~data warning + } + | startpos = located(TABLE_LIGHT); any_whitespace?; endpos = located(END); { + let in_what = Tokens.describe TABLE_LIGHT in + let warning = + let span = Loc.of_position $sloc in + Writer.Warning (Parse_error.end_not_allowed ~in_what span) + in + let span = Loc.delimited startpos endpos in + unclosed_table ~span warning + } + +let table := + | ~ = table_heavy; <> + | ~ = table_light; <> + +(* MEDIA *) + +let media := + | content = located(Media); whitespace*; { + let Loc.{ value = Tokens.{ inner = ref_kind, media_kind; start }; location } = content in + let span = { location with start } in + let ref_kind = + let open Tokens in + match ref_kind with + | Reference refr -> Loc.map (fun r -> `Reference r) refr + | Link link -> Loc.map (fun l -> `Link l) link + in + let media_kind = + let open Tokens in + match media_kind with + | Audio -> `Audio + | Image -> `Image + | Video -> `Video + in + let inner = Loc.at span @@ `Media (`Simple, ref_kind, "", media_kind) in + return inner + } + | content = located(Media_with_replacement); whitespace*; { + let Loc.{ value = Tokens.{ inner = (ref_kind, media_kind, content); start }; location } = content in + let span = { location with start } in + let ref_kind = + let open Tokens in + match ref_kind with + | Reference refr -> Loc.map (fun r -> `Reference r) refr + | Link link -> Loc.map (fun l -> `Link l) link + in + let media_kind = + let open Tokens in + match media_kind with + | Audio -> `Audio + | Image -> `Image + | Video -> `Video + in + let inner = Loc.at span @@ `Media (`With_text, ref_kind, content, media_kind) in + return inner + } + +(* TOP-LEVEL ELEMENTS *) + +let nestable_block_element(paragraph) := ~ = nestable_block_element_inner(paragraph); <> +let nestable_block_element_inner(paragraph) := + | ~ = verbatim; <> + | ~ = code_block; <> + | ~ = odoc_list; <> + | ~ = table; <> + | ~ = media; <> + | ~ = math_block; <> + | ~ = paragraph; <> + | ~ = modules; <> + | ~ = paragraph_style; <> + +let paragraph_style := content = located(Paragraph_style); ws = paragraph;
endpos = located(RIGHT_BRACE); { + let span = Loc.delimited content endpos in + let warning = + let what = Tokens.describe @@ Paragraph_style content.Loc.value in + Writer.Warning (Parse_error.markup_should_not_be_used span ~what) + in + let start = content.Loc.value.Tokens.start in + let endloc = endpos.Loc.location in + Writer.bind ws ~f:(fun ws -> + Writer.return_warning + { ws with Loc.location = { endloc with start }} + warning) + +} + +let verbatim := verbatim = located(Verbatim); { + let Loc.{ value; location } = verbatim in + let Tokens.{ start; inner } = value in + let span = { location with start } in + let what = Tokens.describe @@ Verbatim value in + let warning = + Writer.Warning (Parse_error.should_not_be_empty ~what span) + in + let verbatim = Loc.at span @@ `Verbatim inner in + Writer.ensure has_content warning (return inner) + *> return verbatim +} + +(* Split so that we can exclude bar in i.e. light tables *) +let symbols_without_bar := + | PLUS; { "+" } + | MINUS; { "-" } +let bar := + | BAR; { "|" } + +let symbols := + | ~ = symbols_without_bar; <> + | ~ = bar; <> +let symbol_as_word(symbols) == s = located(symbols); { return @@ Loc.map (fun w -> `Word w) s } +let paragraph_middle_element := + | ~ = inline_element(whitespace); <> + | ~ = symbol_as_word(symbols); <> + + +let paragraph := horizontal_whitespace?; x = inline_element_without_whitespace; xs = sequence(paragraph_middle_element); { + paragraph <$> Writer.map2 ~f:List.cons x xs +} + +let code_block := + | content = located(Code_block); { + let Loc.{ value = Tokens.{ inner; start }; location } = content in + let Tokens.{ metadata; delimiter; content } = inner in + let meta = Option.map (fun Tokens.{ language_tag; tags } -> Ast.{ language = language_tag; tags }) metadata in + let span = { location with start } in + let should_not_be_empty = Writer.Warning ( + let what = Tokens.describe @@ Code_block Tokens.{ inner; start } in + Parse_error.should_not_be_empty ~what span) + in + let node = Loc.at span @@ `Code_block Ast.{ meta; delimiter; content; output = None } in + Fun.const node <$> Writer.ensure (Loc.is has_content) should_not_be_empty (return content) + + } + | content = located(Code_block_with_output); output = sequence_nonempty(nestable_block_element(paragraph)); RIGHT_CODE_DELIMITER; { + let* output = Option.some <$> output in + let Loc.{ value = Tokens.{ inner; start }; location } = content in + let Tokens.{ metadata; delimiter; content } = inner in + let meta = Option.map (fun Tokens.{ language_tag; tags } -> Ast.{ language = language_tag; tags }) metadata in + let node = `Code_block Ast.{ meta; delimiter; content; output } in + return @@ Loc.at { location with start } node + } + +let math_block := inner = located(Math_block); { + let Loc.{ value; location } = inner in + let Tokens.{ start; inner } = value in + let span = { location with start } in + let what = Tokens.describe @@ Math_block value in + let warning = + Writer.Warning (Parse_error.should_not_be_empty ~what span) + in + Writer.ensure has_content warning (return inner) + |> Writer.map ~f:(fun m -> Loc.at span @@ `Math_block m) +} + +let modules := startpos = located(MODULES); modules = sequence(inline_element(whitespace)); endpos = located(RIGHT_BRACE); { + let in_what = Tokens.describe MODULES in + Writer.bind modules ~f:(fun m -> + let not_allowed = + let span = Loc.span @@ List.map Loc.location m in + let first_offending = + List.find_opt + (function + | `Word _ | `Space _ -> false + | _ -> true) + (List.map Loc.value m : 
Ast.inline_element list) + in + let what = + Option.map Tokens.describe_inline first_offending + |> Option.value ~default:String.empty + in + Writer.Warning (Parse_error.not_allowed ~what ~in_what span) + in + let is_empty = + let span = Loc.span @@ List.map Loc.location m in + let what = Tokens.describe MODULES in + Writer.Warning (Parse_error.should_not_be_empty ~what span) + in + let span = Loc.(span [startpos.location; endpos.location]) in + let inner = Loc.at span @@ `Modules (List.map (Loc.map inline_element_inner) m) in + (* Test the content for errors, throwing away the value afterwards with `*>` *) + (Writer.ensure not_empty is_empty (return m) + |> Writer.ensure legal_module_list not_allowed) + *> return inner) + } + | startpos = located(MODULES); modules = sequence(inline_element(whitespace)); endpos = located(END); { + let in_what = Tokens.describe MODULES in + Writer.bind modules ~f:(fun m -> + let span = Loc.span @@ List.map Loc.location m in + let not_allowed = + let first_offending = + List.find_opt + (function + | `Word _ | `Space _ -> false + | _ -> true) + (List.map Loc.value m : Ast.inline_element list) + in + let what = Option.map Tokens.describe_inline first_offending |> Option.value ~default:String.empty in + Writer.Warning (Parse_error.not_allowed ~what ~in_what span) + in + let unexpected_end = + Writer.Warning (Parse_error.end_not_allowed ~in_what:(Tokens.describe MODULES) span) + in + let span = Loc.(span [startpos.location; endpos.location]) in + let inner = Loc.at span @@ `Modules (List.map (Loc.map inline_element_inner) m) in + Writer.return_warning inner not_allowed + |> Writer.warning unexpected_end) + } + + +(* UTILITIES *) + +(* Utilities which wraps the return value of a rule in `Loc.with_location` *) +let locatedM(rule) == inner = rule; { wrap_location $sloc <$> inner } +let located(rule) == inner = rule; { wrap_location $sloc inner } +let delimited_location(opening, rule, closing) := startpos = located(opening); inner = rule; endpos = located(closing); { + let span = Loc.delimited startpos endpos in + Loc.at span inner +} + +(* When we have to handle errors with Menhir's `error` token, we need + `Lexing.position` as opposed to `Loc.with_location`. 
Because we can cleanly + take a slice of the input text with those positions, which would be + difficult using `Loc.with_location` *) +let with_position(rule) == inner = rule; { (inner, $sloc) } +let position(rule) == _ = rule; { $sloc } + + +(* Wrappers around Menhir's built-in utilities that make working inside the + Writer.t monad easier *) +let sequence(rule) == xs = list(rule); { Writer.sequence xs } +let sequence_nonempty(rule) == xs = nonempty_list(rule); { Writer.sequence xs } +let sequence_separated_nonempty(sep, rule) := xs = separated_nonempty_list(sep, rule); { Writer.sequence xs } +let sequence_separated(sep, rule) := xs = separated_list(sep, rule); { Writer.sequence xs } + +(* WHITESPACE *) +let horizontal_whitespace == ~ = Space; <`Space> + +let whitespace == + | ~ = horizontal_whitespace; <> + | ~ = Single_newline; <`Space> + +let any_whitespace := + | ~ = whitespace; <> + | ~ = Blank_line; <`Space> + +let line_break == + | ~ = Single_newline; <> + | ~ = Blank_line; <> diff --git a/src/parser/parser_aux.ml b/src/parser/parser_aux.ml new file mode 100644 index 0000000000..8b258d0b16 --- /dev/null +++ b/src/parser/parser_aux.ml @@ -0,0 +1,250 @@ +let wrap_location : + Lexing.position * Lexing.position -> 'a -> 'a Loc.with_location = + fun pos value -> + let location = Loc.of_position pos in + { location; value } + +let not_empty : 'a list -> bool = function _ :: _ -> true | _ -> false +let has_content : string -> bool = fun s -> String.length s > 0 + +let trim_start = function Loc.{ value = `Space _; _ } :: xs -> xs | xs -> xs + +(* Wrap a list of `inline_element` in a `Paragraph *) +let paragraph : + Ast.inline_element Loc.with_location list -> + Ast.nestable_block_element Loc.with_location = + fun elts -> + let span = Loc.span @@ List.map Loc.location elts in + let elts = trim_start elts in + Loc.at span @@ `Paragraph elts + +type align_error = + | Invalid_align (* An invalid align cell *) + | Not_align (* Not an align cell *) + +(* This could be made a bit more specific by allowing Space elements only at + the beginning and end *) +let valid_elements (cell : Ast.inline_element list) : string option = + let rec go acc = function + | `Word _ :: _ when Option.is_some acc -> None + | `Word word :: rest -> go (Some word) rest + | `Space _ :: rest -> go acc rest + | _ :: _ -> None + | [] -> acc + in + go None cell + +let valid_word word = + match String.length word with + | 0 -> Ok None + | 1 -> ( + match word.[0] with + | ':' -> Ok (Some `Center) + | '-' -> Ok None + | _ -> Error Not_align) + | len -> + if String.for_all (Char.equal '-') (String.sub word 1 (len - 2)) then + match (word.[0], word.[pred len]) with + | ':', '-' -> Ok (Some `Left) + | '-', ':' -> Ok (Some `Right) + | ':', ':' -> Ok (Some `Center) + | '-', '-' -> Ok None + | _ -> Error Invalid_align + else Error Not_align + +let valid_align_cell (cell : Ast.inline_element Loc.with_location list) = + match List.map Loc.value cell |> valid_elements with + | Some word -> valid_word word + | None -> Error Not_align + +let sequence : ('elt, 'err) result list -> ('elt list, 'err) result = + fun list -> + let rec go acc : ('elt, 'err) result list -> ('elt list, 'err) result = + function + | Ok x :: xs -> go (x :: acc) xs + | Error err :: _ -> Error err + | [] -> Ok (List.rev acc) + in + go [] list + +(* NOTE: (@FayCarsons) + When we get something that doesn't look like an align at all, we check to see if we've gotten + any valid aligns, if so we assume that the cell being considered is supposed to be an align and treat it 
as an error, + otherwise we assume the row is not supposed to be an align row + This is somewhat error prone, using i.e. '--' to denote an empty table cell seems reasonable +*) +let valid_align_row (row : Ast.inline_element Loc.with_location list list) : + (Ast.alignment option list, align_error) result = + let align, not_align = + List.map valid_align_cell row + |> List.partition (function + | Ok _ | Error Invalid_align -> true + | _ -> false) + in + match (align, not_align) with + | _ :: _, _ :: _ -> Error Invalid_align + | _ :: _, [] -> sequence align + | _ -> Error Not_align + +(* Merges inline elements within a cell into a single paragraph element, and tags cells w/ tag *) +let merged_tagged_row tag : 'a Loc.with_location list list -> 'b = + List.map (fun elts -> ([ paragraph elts ], tag)) +let as_data = merged_tagged_row `Data +let as_header = merged_tagged_row `Header + +let is_valid_align row = Result.is_ok @@ valid_align_row row + +(* + - If the first row is the alignment row then the rest should be data + - Otherwise the first should be the headers, the second align, and the rest data + - If there's only one row and it's not the align row, then it's data + *) +let construct_table : + span:Loc.span -> + Ast.inline_element Loc.with_location list list list -> + Ast.nestable_block_element Loc.with_location = + fun ~span grid -> + match grid with + | [ only_row ] -> ( + match valid_align_row only_row with + | Ok align -> Loc.at span @@ `Table (([ [] ], Some align), `Light) + | _ -> Loc.at span @@ `Table (([ as_data only_row ], None), `Light)) + | align :: data when is_valid_align align -> + let align = Result.get_ok @@ valid_align_row align in + Loc.at span @@ `Table ((List.map as_data data, Some align), `Light) + | header :: align :: data when is_valid_align align -> + let align = Result.get_ok @@ valid_align_row align in + Loc.at span + @@ `Table ((as_header header :: List.map as_data data, Some align), `Light) + | data -> Loc.at span @@ `Table ((List.map as_data data, None), `Light) + +let unclosed_table + ?(data : + Ast.inline_element Loc.with_location list list list Writer.t option) + ~span warning : Ast.nestable_block_element Loc.with_location Writer.t = + let node = + match data with + | Some data -> + Writer.map + ~f:(fun data -> + Loc.at span @@ `Table ((List.map as_data data, None), `Light)) + data + | None -> + let inner = Loc.at span @@ `Table (([], None), `Light) in + Writer.return inner + in + Writer.warning warning node + +let media_kind = + let open Tokens in + function Audio -> `Audio | Video -> `Video | Image -> `Image + +let href_of_media = + let open Tokens in + function Reference name -> `Reference name | Link uri -> `Link uri + +let split_simple_media Loc.{ location; value = media, target } = + (Loc.at location media, target) + +let split_replacement_media Loc.{ location; value = media, target, content } = + (Loc.at location media, target, content) + +let rec inline_element_inner : Ast.inline_element -> string = function + | `Space s -> s + | `Word s -> s + | `Styled (_, s) -> children s + | `Code_span s -> s + | `Raw_markup (_, s) -> s + | `Reference (_, _, s) -> children s + | `Link (_, s) -> children s + | `Math_span s -> s + +and children s = + List.fold_left + (fun acc elt -> acc ^ inline_element_inner (Loc.value elt)) + "" s + +let legal_module_list : Ast.inline_element Loc.with_location list -> bool = + fun xs -> + not_empty xs + && List.for_all (function `Word _ | `Space _ -> true | _ -> false) + @@ List.map Loc.value xs + +let light_list_item : + 
Tokens.token -> + Ast.nestable_block_element Loc.with_location -> + [< `Ordered of Ast.nestable_block_element + | `Unordered of Ast.nestable_block_element ] + Loc.with_location = + fun start -> + let open Tokens in + Loc.map (fun item -> + match start with + | MINUS -> `Unordered item + | PLUS -> `Ordered item + | _ -> assert false (* unreachable *)) + +let or_insert = function None -> Option.some | o -> Fun.const o + +let split_light_list_items : + [< `Ordered of Ast.nestable_block_element + | `Unordered of Ast.nestable_block_element ] + Loc.with_location + list -> + [< `Ordered | `Unordered ] + * Ast.nestable_block_element Loc.with_location list = + fun items -> + let rec go acc list_kind = function + | Loc.{ value = `Ordered x; location } :: xs -> + go (Loc.at location x :: acc) (or_insert list_kind `Ordered) xs + | Loc.{ value = `Unordered x; location } :: xs -> + go (Loc.at location x :: acc) (or_insert list_kind `Unordered) xs + | [] -> (Option.get list_kind, List.rev acc) + in + let list_kind, elements = go [] None items in + (list_kind, elements) + +let tag_with_content_start_point : Tokens.tag_with_content -> Loc.point option = + function + | Before { start; _ } + | Raise { start; _ } + | Param { start; _ } + | See { start; _ } -> + Some start + | _ -> None + +let to_ref : Tokens.uri_kind -> [ `Url | `File | `Document ] = function + | URL -> `Url + | File -> `File + | Document -> `Document + +let tag_with_content + (content : Ast.nestable_block_element Loc.with_location list) : + Tokens.tag_with_content -> Ast.tag = function + | DEPRECATED -> `Deprecated content + | Before { inner; _ } -> `Before (inner, content) + | Raise { inner; _ } -> `Raise (inner, content) + | Param { inner; _ } -> `Param (inner, content) + | See { inner = kind, href; _ } -> `See (to_ref kind, href, content) + | RETURN -> `Return content + | CHILDREN_ORDER -> `Children_order content + | TOC_STATUS -> `Toc_status content + | ORDER_CATEGORY -> `Order_category content + | SHORT_TITLE -> `Short_title content + +let tag_bare : Tokens.tag Loc.with_location -> Ast.tag = function + | { value = Author s; _ } -> `Author s.inner + | { value = Since s; _ } -> `Since s.inner + | { value = Version s; _ } -> `Version s.inner + | { value = Canonical s; _ } as loc -> `Canonical { loc with value = s.inner } + | { value = INLINE; _ } -> `Inline + | { value = OPEN; _ } -> `Open + | { value = CLOSED; _ } -> `Closed + | { value = HIDDEN; _ } -> `Hidden + +let ast_style : Tokens.style -> Ast.style = function + | Bold -> `Bold + | Italic -> `Italic + | Emphasis -> `Emphasis + | Superscript -> `Superscript + | Subscript -> `Subscript diff --git a/src/parser/syntax.ml b/src/parser/syntax.ml deleted file mode 100644 index 74ecf88a34..0000000000 --- a/src/parser/syntax.ml +++ /dev/null @@ -1,1504 +0,0 @@ -(* This module is a recursive descent parser for the ocamldoc syntax. The parser - consumes a token stream of type [Token.t Stream.t], provided by the lexer, - and produces a comment AST of the type defined in [Parser_.Ast]. - - The AST has two main levels: inline elements, which can appear inside - paragraphs, and are spaced horizontally when presented, and block elements, - such as paragraphs and lists, which are spaced vertically when presented. - Block elements contain inline elements, but not vice versa. - - Corresponding to this, the parser has three "main" functions: - - - [delimited_inline_element_list] parses a run of inline elements that is - delimited by curly brace markup ([{...}]). 
- - [paragraph] parses a run of inline elements that make up a paragraph, and - is not explicitly delimited with curly braces. - - [block_element_list] parses a sequence of block elements. A comment is a - sequence of block elements, so [block_element_list] is the top-level - parser. It is also used for list item and tag content. *) - -open! Compat - -type 'a with_location = 'a Loc.with_location - -(* {2 Input} *) - -type input = { - tokens : Token.t Loc.with_location Stream.t; - warnings : Warning.t list ref; -} - -(* {2 Output} *) - -let add_warning input warning = input.warnings := warning :: !(input.warnings) -let junk input = Stream.junk input.tokens - -let peek input = - match Stream.peek input.tokens with - | Some token -> token - | None -> assert false - -module Table = struct - module Light_syntax = struct - let valid_align = function - | [ { Loc.value = `Word w; _ } ] -> ( - match String.length w with - | 0 -> `Valid None - | 1 -> ( - match w with - | "-" -> `Valid None - | ":" -> `Valid (Some `Center) - | _ -> `Invalid) - | len -> - if String.for_all (Char.equal '-') (String.sub w 1 (len - 2)) then - match (String.get w 0, String.get w (len - 1)) with - | ':', ':' -> `Valid (Some `Center) - | ':', '-' -> `Valid (Some `Left) - | '-', ':' -> `Valid (Some `Right) - | '-', '-' -> `Valid None - | _ -> `Invalid - else `Invalid) - | _ -> `Invalid - - let valid_align_row lx = - let rec loop acc = function - | [] -> Some (List.rev acc) - | x :: q -> ( - match valid_align x with - | `Invalid -> None - | `Valid alignment -> loop (alignment :: acc) q) - in - loop [] lx - - let create ~grid ~align : Ast.table = - let cell_to_block (x, k) = - let whole_loc = Loc.span (List.map (fun x -> x.Loc.location) x) in - match x with - | [] -> ([], k) - | _ -> ([ Loc.at whole_loc (`Paragraph x) ], k) - in - let row_to_block = List.map cell_to_block in - let grid_to_block = List.map row_to_block in - ((grid_to_block grid, align), `Light) - - let with_kind kind : 'a with_location list list -> 'a Ast.row = - List.map (fun c -> (c, kind)) - - let from_raw_data grid : Ast.table = - match grid with - | [] -> create ~grid:[] ~align:None - | row1 :: rows2_N -> ( - match valid_align_row row1 with - (* If the first line is the align row, everything else is data. *) - | Some _ as align -> - create ~grid:(List.map (with_kind `Data) rows2_N) ~align - | None -> ( - match rows2_N with - (* Only 1 line, if this is not the align row this is data. *) - | [] -> create ~grid:[ with_kind `Data row1 ] ~align:None - | row2 :: rows3_N -> ( - match valid_align_row row2 with - (* If the second line is the align row, the first one is the - header and the rest is data. *) - | Some _ as align -> - let header = with_kind `Header row1 in - let data = List.map (with_kind `Data) rows3_N in - create ~grid:(header :: data) ~align - (* No align row in the first 2 lines, everything is considered - data. 
*) - | None -> - create ~grid:(List.map (with_kind `Data) grid) ~align:None - ))) - end - - module Heavy_syntax = struct - let create ~grid : Ast.table = ((grid, None), `Heavy) - let from_grid grid : Ast.table = create ~grid - end -end - -module Reader = struct - let until_rbrace_or_eof input acc = - let rec consume () = - let next_token = peek input in - match next_token.value with - | `Right_brace -> - junk input; - `End (acc, next_token.location) - | `End -> - Parse_error.end_not_allowed next_token.location ~in_what:"table" - |> add_warning input; - junk input; - `End (acc, next_token.location) - | `Space _ | `Single_newline _ | `Blank_line _ -> - junk input; - consume () - | _ -> `Token next_token - in - consume () - - module Infix = struct - let ( >>> ) consume if_token = - match consume with - | `End (ret, loc) -> (ret, loc) - | `Token t -> if_token t - end -end - -open Reader.Infix - -(* The last token in the stream is always [`End], and it is never consumed by - the parser, so the [None] case is impossible. *) - -let npeek n input = Stream.npeek n input.tokens - -(* {2 Non-link inline elements} *) -type style = [ `Bold | `Italic | `Emphasis | `Superscript | `Subscript ] - -(* Convenient abbreviation for use in patterns. *) -type token_that_always_begins_an_inline_element = - [ `Word of string - | `Code_span of string - | `Raw_markup of string option * string - | `Begin_style of style - | `Simple_reference of string - | `Begin_reference_with_replacement_text of string - | `Simple_link of string - | `Begin_link_with_replacement_text of string - | `Math_span of string ] - -(* Check that the token constructors above actually are all in [Token.t]. *) -let _check_subset : token_that_always_begins_an_inline_element -> Token.t = - fun t -> (t :> Token.t) - -(* Consumes tokens that make up a single non-link inline element: - - - a horizontal space ([`Space], significant in inline elements), - - a word (see [word]), - - a code span ([...], [`Code_span _]), or - - styled text ({e ...}). - - The latter requires a recursive call to [delimited_inline_element_list], - defined below. - - This should be part of [delimited_inline_element_list]; however, it is also - called by function [paragraph]. As a result, it is factored out, and made - mutually-recursive with [delimited_inline_element_list]. - - This is called only when it is known that the first token in the list is the - beginning of an inline element. In the case of [`Minus] and [`Plus], that - means the caller has determined that they are not a list bullet (i.e., not - the first non-whitespace tokens on their line). - - This function consumes exactly the tokens that make up the element. *) -let rec inline_element : - input -> Loc.span -> _ -> Ast.inline_element with_location = - fun input location next_token -> - match next_token with - | `Space _ as token -> - junk input; - Loc.at location token - | `Word _ as token -> - junk input; - Loc.at location token - (* This is actually the same memory representation as the token, complete - with location, and is probably the most common case. Perhaps the token - can be reused somehow. The same is true of [`Space], [`Code_span]. 
*) - | `Minus -> - junk input; - Loc.at location (`Word "-") - | `Plus -> - junk input; - Loc.at location (`Word "+") - | `Bar -> - junk input; - Loc.at location (`Word "|") - | (`Code_span _ | `Math_span _ | `Raw_markup _) as token -> - junk input; - Loc.at location token - | `Begin_style s as parent_markup -> - junk input; - - let requires_leading_whitespace = - match s with - | `Bold | `Italic | `Emphasis -> true - | `Superscript | `Subscript -> false - in - let content, brace_location = - delimited_inline_element_list ~parent_markup - ~parent_markup_location:location ~requires_leading_whitespace input - in - - let location = Loc.span [ location; brace_location ] in - - if content = [] then - Parse_error.should_not_be_empty - ~what:(Token.describe parent_markup) - location - |> add_warning input; - - Loc.at location (`Styled (s, content)) - | `Simple_reference r -> - junk input; - - let r_location = Loc.nudge_start (String.length "{!") location in - let r = Loc.at r_location r in - - Loc.at location (`Reference (`Simple, r, [])) - | `Begin_reference_with_replacement_text r as parent_markup -> - junk input; - - let r_location = Loc.nudge_start (String.length "{{!") location in - let r = Loc.at r_location r in - - let content, brace_location = - delimited_inline_element_list ~parent_markup - ~parent_markup_location:location ~requires_leading_whitespace:false - input - in - - let location = Loc.span [ location; brace_location ] in - - if content = [] then - Parse_error.should_not_be_empty - ~what:(Token.describe parent_markup) - location - |> add_warning input; - - Loc.at location (`Reference (`With_text, r, content)) - | `Simple_link u -> - junk input; - - let u = String.trim u in - - if u = "" then - Parse_error.should_not_be_empty - ~what:(Token.describe next_token) - location - |> add_warning input; - - Loc.at location (`Link (u, [])) - | `Begin_link_with_replacement_text u as parent_markup -> - junk input; - - let u = String.trim u in - - if u = "" then - Parse_error.should_not_be_empty - ~what:(Token.describe parent_markup) - location - |> add_warning input; - - let content, brace_location = - delimited_inline_element_list ~parent_markup - ~parent_markup_location:location ~requires_leading_whitespace:false - input - in - - `Link (u, content) |> Loc.at (Loc.span [ location; brace_location ]) - -(* Consumes tokens that make up a sequence of inline elements that is ended by - a '}', a [`Right_brace] token. The brace token is also consumed. - - The sequences are also preceded by some markup like '{b'. Some of these - markup tokens require whitespace immediately after the token, and others not. - The caller indicates which way that is through the - [~requires_leading_whitespace] argument. - - Whitespace is significant in inline element lists. In particular, "foo [bar]" - is represented as [`Word "foo"; `Space; `Code_span "bar"], while "foo[bar]" - is [`Word "foo"; `Code_span "bar"]. It doesn't matter how much whitespace is - there, just whether it is present or not. Single newlines and horizontal - space in any amount are allowed. Blank lines are not, as these are separators - for {e block} elements. - - In correct input, the first and last elements emitted will not be [`Space], - i.e. [`Space] appears only between other non-link inline elements. In - incorrect input, there might be [`Space] followed immediately by something - like an @author tag. - - The [~parent_markup] and [~parent_markup_location] arguments are used for - generating error messages. 
*) -and delimited_inline_element_list : - parent_markup:[< Token.t ] -> - parent_markup_location:Loc.span -> - requires_leading_whitespace:bool -> - input -> - Ast.inline_element with_location list * Loc.span = - fun ~parent_markup ~parent_markup_location ~requires_leading_whitespace input -> - (* [~at_start_of_line] is used to interpret [`Minus] and [`Plus]. These are - word tokens if not the first non-whitespace tokens on their line. Then, - they are allowed in a non-link element list. *) - let rec consume_elements : - at_start_of_line:bool -> - Ast.inline_element with_location list -> - Ast.inline_element with_location list * Loc.span = - fun ~at_start_of_line acc -> - let next_token = peek input in - match next_token.value with - | `Right_brace -> - junk input; - (List.rev acc, next_token.location) - (* The [`Space] token is not space at the beginning or end of line, because - that is combined into [`Single_newline] or [`Blank_line] tokens. It is - also not at the beginning of markup (after e.g. '{b'), because that is - handled separately before calling - [consume_non_link_inline_elements], and not immediately before '}', - because that is combined into the [`Right_brace] token by the lexer. So, - it is an internal space, and we want to add it to the non-link inline - element list. *) - | (`Space _ | #token_that_always_begins_an_inline_element) as token -> - let acc = inline_element input next_token.location token :: acc in - consume_elements ~at_start_of_line:false acc - | `Single_newline ws -> - junk input; - let element = Loc.same next_token (`Space ws) in - consume_elements ~at_start_of_line:true (element :: acc) - | `Blank_line ws as blank -> - Parse_error.not_allowed ~what:(Token.describe blank) - ~in_what:(Token.describe parent_markup) - next_token.location - |> add_warning input; - - junk input; - let element = Loc.same next_token (`Space ws) in - consume_elements ~at_start_of_line:true (element :: acc) - | `Bar as token -> - let acc = inline_element input next_token.location token :: acc in - consume_elements ~at_start_of_line:false acc - | (`Minus | `Plus) as bullet -> - (if at_start_of_line then - let suggestion = - Printf.sprintf "move %s so it isn't the first thing on the line." - (Token.print bullet) - in - Parse_error.not_allowed ~what:(Token.describe bullet) - ~in_what:(Token.describe parent_markup) - ~suggestion next_token.location - |> add_warning input); - - let acc = inline_element input next_token.location bullet :: acc in - consume_elements ~at_start_of_line:false acc - | other_token -> - Parse_error.not_allowed - ~what:(Token.describe other_token) - ~in_what:(Token.describe parent_markup) - next_token.location - |> add_warning input; - - let last_location = - match acc with - | last_token :: _ -> last_token.location - | [] -> parent_markup_location - in - - (List.rev acc, last_location) - in - - let first_token = peek input in - match first_token.value with - | `Space _ -> - junk input; - consume_elements ~at_start_of_line:false [] - (* [~at_start_of_line] is [false] here because the preceding token was some - some markup like '{b', and we didn't move to the next line, so the next - token will not be the first non-whitespace token on its line. 
*) - | `Single_newline _ -> - junk input; - consume_elements ~at_start_of_line:true [] - | `Blank_line _ as blank -> - (* In case the markup is immediately followed by a blank line, the error - message printed by the catch-all case below can be confusing, as it will - suggest that the markup must be followed by a newline (which it is). It - just must not be followed by two newlines. To explain that clearly, - handle that case specifically. *) - Parse_error.not_allowed ~what:(Token.describe blank) - ~in_what:(Token.describe parent_markup) - first_token.location - |> add_warning input; - - junk input; - consume_elements ~at_start_of_line:true [] - | `Right_brace -> - junk input; - ([], first_token.location) - | _ -> - if requires_leading_whitespace then - Parse_error.should_be_followed_by_whitespace - ~what:(Token.print parent_markup) - parent_markup_location - |> add_warning input; - consume_elements ~at_start_of_line:false [] - -(* {2 Paragraphs} *) - -(* Consumes tokens that make up a paragraph. - - A paragraph is a sequence of inline elements that ends on a blank line, or - explicit block markup such as a verbatim block on a new line. - - Because of the significance of newlines, paragraphs are parsed line-by-line. - The function [paragraph] is called only when the current token is the first - non-whitespace token on its line, and begins an inline element. [paragraph] - then parses a line of inline elements. Afterwards, it looks ahead to the next - line. If that line also begins with an inline element, it parses that line, - and so on. *) -let paragraph : input -> Ast.nestable_block_element with_location = - fun input -> - (* Parses a single line of a paragraph, consisting of inline elements. The - only valid ways to end a paragraph line are with [`End], [`Single_newline], - [`Blank_line], and [`Right_brace]. Everything else either belongs in the - paragraph, or signifies an attempt to begin a block element inside a - paragraph line, which is an error. These errors are caught elsewhere; the - paragraph parser just stops. *) - let rec paragraph_line : - Ast.inline_element with_location list -> - Ast.inline_element with_location list = - fun acc -> - let next_token = peek input in - match next_token.value with - | ( `Space _ | `Minus | `Plus | `Bar - | #token_that_always_begins_an_inline_element ) as token -> - let element = inline_element input next_token.location token in - paragraph_line (element :: acc) - | _ -> acc - in - - (* After each line is parsed, decides whether to parse more lines. *) - let rec additional_lines : - Ast.inline_element with_location list -> - Ast.inline_element with_location list = - fun acc -> - match npeek 2 input with - | { value = `Single_newline ws; location } - :: { value = #token_that_always_begins_an_inline_element | `Bar; _ } - :: _ -> - junk input; - let acc = Loc.at location (`Space ws) :: acc in - let acc = paragraph_line acc in - additional_lines acc - | _ -> List.rev acc - in - - let elements = paragraph_line [] |> additional_lines in - `Paragraph elements |> Loc.at (Loc.span (List.map Loc.location elements)) - -(* {2 Block elements} *) - -(* {3 Helper types} *) - -(* The interpretation of tokens in the block parser depends on where on a line - each token appears. The six possible "locations" are: - - - [`At_start_of_line], when only whitespace has been read on the current - line. - - [`After_tag], when a valid tag token, such as [@deprecated], has been read, - and only whitespace has been read since. 
- - [`After_shorthand_bullet], when a valid shorthand list item bullet, such as - [-], has been read, and only whitespace has been read since. - - [`After_explicit_list_bullet], when a valid explicit bullet, such as [{li], - has been read, and only whitespace has been read since. - - [`After_table_cell], when a table cell opening markup ('{th' or '{td') has been read. - - [`After_text], when any other valid non-whitespace token has already been - read on the current line. - - Here are some examples of how this affects the interpretation of tokens: - - - A paragraph can start anywhere except [`After_text] (two paragraphs cannot - be on the same line, but paragraphs can be nested in just about anything). - - [`Minus] is interpreted as a list item bullet [`At_start_of_line], - [`After_tag], and [`After_explicit_list_bullet]. - - Tags are only allowed [`At_start_of_line]. - - To track the location accurately, the functions that make up the block parser - pass explicit [where_in_line] values around and return them. - - In a few cases, [where_in_line] can be inferred from what helper was called. - For example, the [paragraph] parser always stops on the same line as the last - significant token that is in the paragraph it consumed, so the location must - be [`After_text]. *) -type where_in_line = - [ `At_start_of_line - | `After_tag - | `After_shorthand_bullet - | `After_explicit_list_bullet - | `After_table_cell - | `After_text ] - -(* The block parsing loop, function [block_element_list], stops when it - encounters certain tokens. - - When it is called for the whole comment, or for in explicit list item - ([{li foo}]), it can only stop on end of input or a right brace. - - When it is called inside a shorthand list item ([- foo]), it stops on end of - input, right brace, a blank line (indicating end of shorthand list), plus or - minus (indicating the start of the next list item), or a section heading or - tag, which cannot be nested in list markup. - - The block parser [block_element_list] explicitly returns the token that - stopped it, with a type more precise than [Token.t stream_head]: if it was - called for the whole comment or an explicit list item, the stop token will - have type [stops_at_delimiters stream_head], and if it was called for a - shorthand list item, the stop token will have type - [implicit_stop stream_head]. This allows the calling parsers to write precise - cases for exactly the tokens that might be at the front of the stream after - the block parser returns. *) -type stops_at_delimiters = [ `End | `Right_brace ] -type code_stop = [ `End | `Right_code_delimiter ] - -type stopped_implicitly = - [ `End - | `Blank_line of string - | `Right_brace - | `Minus - | `Plus - | Token.section_heading - | Token.media_markup - | Token.tag ] - -(* Ensure that the above two types are really subsets of [Token.t]. *) -let _check_subset : stops_at_delimiters -> Token.t = fun t -> (t :> Token.t) -let _check_subset : stopped_implicitly -> Token.t = fun t -> (t :> Token.t) - -(* The different contexts in which the block parser [block_element_list] can be - called. The block parser's behavior depends somewhat on the context. For - example, while paragraphs are allowed anywhere, shorthand lists are not - allowed immediately inside other shorthand lists, while tags are not allowed - anywhere except at the comment top level. 
- - Besides telling the block parser how to behave, each context also carries two - types, which determine the return type of the block parser: - - - The type of blocks the parser returns. Note that [nestable_block_element] - is included in [block_element]. However, the extra block kinds in - [block_element] are only allowed at the comment top level. - - The type of token that the block parser stops at. See discussion above. *) -type ('block, 'stops_at_which_tokens) context = - | Top_level : (Ast.block_element, stops_at_delimiters) context - | In_implicitly_ended : - [ `Tag | `Shorthand_list ] - -> (Ast.nestable_block_element, stopped_implicitly) context - | In_explicit_list : (Ast.nestable_block_element, stops_at_delimiters) context - | In_table_cell : (Ast.nestable_block_element, stops_at_delimiters) context - | In_code_results : (Ast.nestable_block_element, code_stop) context - -(* This is a no-op. It is needed to prove to the type system that nestable block - elements are acceptable block elements in all contexts. *) -let accepted_in_all_contexts : - type block stops_at_which_tokens. - (block, stops_at_which_tokens) context -> - Ast.nestable_block_element -> - block = - fun context block -> - match context with - | Top_level -> (block :> Ast.block_element) - | In_implicitly_ended (`Tag | `Shorthand_list) -> block - | In_explicit_list -> block - | In_table_cell -> block - | In_code_results -> block - -(* Converts a tag to a series of words. This is used in error recovery, when a - tag cannot be generated. *) -let tag_to_words = function - | `Author s -> [ `Word "@author"; `Space " "; `Word s ] - | `Before s -> [ `Word "@before"; `Space " "; `Word s ] - | `Canonical s -> [ `Word "@canonical"; `Space " "; `Word s ] - | `Deprecated -> [ `Word "@deprecated" ] - | `Inline -> [ `Word "@inline" ] - | `Open -> [ `Word "@open" ] - | `Closed -> [ `Word "@closed" ] - | `Hidden -> [ `Word "@hidden" ] - | `Param s -> [ `Word "@param"; `Space " "; `Word s ] - | `Raise s -> [ `Word "@raise"; `Space " "; `Word s ] - | `Return -> [ `Word "@return" ] - | `See (`Document, s) -> [ `Word "@see"; `Space " "; `Word ("\"" ^ s ^ "\"") ] - | `See (`File, s) -> [ `Word "@see"; `Space " "; `Word ("'" ^ s ^ "'") ] - | `See (`Url, s) -> [ `Word "@see"; `Space " "; `Word ("<" ^ s ^ ">") ] - | `Since s -> [ `Word "@since"; `Space " "; `Word s ] - | `Version s -> [ `Word "@version"; `Space " "; `Word s ] - | `Children_order -> [ `Word "@children_order" ] - | `Toc_status -> [ `Word "@toc_status" ] - | `Order_category -> [ `Word "@order_category" ] - | `Short_title -> [ `Word "@short_title" ] - -(* {3 Block element lists} *) - -(* Consumes tokens making up a sequence of block elements. These are: - - - paragraphs, - - code blocks, - - verbatim text blocks, - - tables, - - lists, and - - section headings. *) -let rec block_element_list : - type block stops_at_which_tokens. 
- (block, stops_at_which_tokens) context -> - parent_markup:[< Token.t | `Comment ] -> - input -> - block with_location list - * stops_at_which_tokens with_location - * where_in_line = - fun context ~parent_markup input -> - let rec consume_block_elements : - where_in_line -> - block with_location list -> - block with_location list - * stops_at_which_tokens with_location - * where_in_line = - fun where_in_line acc -> - let describe token = - match token with - | #token_that_always_begins_an_inline_element -> "paragraph" - | _ -> Token.describe token - in - - let warn_if_after_text { Loc.location; value = token } = - if where_in_line = `After_text then - Parse_error.should_begin_on_its_own_line ~what:(describe token) location - |> add_warning input - in - - let warn_because_not_at_top_level { Loc.location; value = token } = - let suggestion = - Printf.sprintf "move %s outside of any other markup." - (Token.print token) - in - Parse_error.not_allowed ~what:(Token.describe token) - ~in_what:(Token.describe parent_markup) - ~suggestion location - |> add_warning input - in - - match peek input with - (* Terminators: the two tokens that terminate anything. *) - | { value = `End; _ } as next_token -> ( - match context with - | Top_level -> (List.rev acc, next_token, where_in_line) - | In_implicitly_ended (`Tag | `Shorthand_list) -> - (List.rev acc, next_token, where_in_line) - | In_explicit_list -> (List.rev acc, next_token, where_in_line) - | In_table_cell -> (List.rev acc, next_token, where_in_line) - | In_code_results -> (List.rev acc, next_token, where_in_line)) - | { value = `Right_brace; _ } as next_token -> ( - (* This little absurdity is needed to satisfy the type system. Without it, - OCaml is unable to prove that [stream_head] has the right type for all - possible values of [context]. *) - match context with - | Top_level -> (List.rev acc, next_token, where_in_line) - | In_implicitly_ended (`Tag | `Shorthand_list) -> - (List.rev acc, next_token, where_in_line) - | In_explicit_list -> (List.rev acc, next_token, where_in_line) - | In_table_cell -> (List.rev acc, next_token, where_in_line) - | In_code_results -> - junk input; - consume_block_elements where_in_line acc) - | { value = `Right_code_delimiter; _ } as next_token -> ( - match context with - | In_code_results -> (List.rev acc, next_token, where_in_line) - | _ -> - junk input; - consume_block_elements where_in_line acc) - (* Whitespace. This can terminate some kinds of block elements. It is also - necessary to track it to interpret [`Minus] and [`Plus] correctly, as - well as to ensure that all block elements begin on their own line. *) - | { value = `Space _; _ } -> - junk input; - consume_block_elements where_in_line acc - | { value = `Single_newline _; _ } -> - junk input; - consume_block_elements `At_start_of_line acc - | { value = `Blank_line _; _ } as next_token -> ( - match context with - (* Blank lines terminate shorthand lists ([- foo]) and tags. They also - terminate paragraphs, but the paragraph parser is aware of that - internally. *) - | In_implicitly_ended (`Tag | `Shorthand_list) -> - (List.rev acc, next_token, where_in_line) - (* Otherwise, blank lines are pretty much like single newlines. *) - | _ -> - junk input; - consume_block_elements `At_start_of_line acc) - (* Explicit list items ([{li ...}] and [{- ...}]) can never appear directly - in block content. They can only appear inside [{ul ...}] and [{ol ...}]. - So, catch those. 
*) - | { value = `Begin_list_item _ as token; location } -> - let suggestion = - Printf.sprintf "move %s into %s, or use %s." (Token.print token) - (Token.describe (`Begin_list `Unordered)) - (Token.describe `Minus) - in - Parse_error.not_allowed ~what:(Token.describe token) - ~in_what:(Token.describe parent_markup) - ~suggestion location - |> add_warning input; - - junk input; - consume_block_elements where_in_line acc - (* Table rows ([{tr ...}]) can never appear directly - in block content. They can only appear inside [{table ...}]. *) - | { value = `Begin_table_row as token; location } -> - let suggestion = - Printf.sprintf "move %s into %s." (Token.print token) - (Token.describe `Begin_table_heavy) - in - Parse_error.not_allowed ~what:(Token.describe token) - ~in_what:(Token.describe parent_markup) - ~suggestion location - |> add_warning input; - junk input; - consume_block_elements where_in_line acc - (* Table cells ([{th ...}] and [{td ...}]) can never appear directly - in block content. They can only appear inside [{tr ...}]. *) - | { value = `Begin_table_cell _ as token; location } -> - let suggestion = - Printf.sprintf "move %s into %s." (Token.print token) - (Token.describe `Begin_table_row) - in - Parse_error.not_allowed ~what:(Token.describe token) - ~in_what:(Token.describe parent_markup) - ~suggestion location - |> add_warning input; - junk input; - consume_block_elements where_in_line acc - (* Tags. These can appear at the top level only. *) - | { value = `Tag tag as token; location } as next_token -> ( - let recover_when_not_at_top_level context = - warn_because_not_at_top_level next_token; - junk input; - let words = List.map (Loc.at location) (tag_to_words tag) in - let paragraph = - `Paragraph words - |> accepted_in_all_contexts context - |> Loc.at location - in - consume_block_elements `At_start_of_line (paragraph :: acc) - in - - match context with - (* Tags cannot make sense in an explicit list ([{ul {li ...}}]). *) - | In_explicit_list -> recover_when_not_at_top_level context - (* If a tag starts at the beginning of a line, it terminates the preceding - tag and/or the current shorthand list. In this case, return to the - caller, and let the caller decide how to interpret the tag token. *) - | In_implicitly_ended (`Tag | `Shorthand_list) -> - if where_in_line = `At_start_of_line then - (List.rev acc, next_token, where_in_line) - else recover_when_not_at_top_level context - | In_table_cell -> recover_when_not_at_top_level context - | In_code_results -> recover_when_not_at_top_level context - (* If this is the top-level call to [block_element_list], parse the - tag. *) - | Top_level -> ( - if where_in_line <> `At_start_of_line then - Parse_error.should_begin_on_its_own_line - ~what:(Token.describe token) location - |> add_warning input; - - junk input; - - match tag with - | (`Author s | `Since s | `Version s | `Canonical s) as tag -> - let s = String.trim s in - if s = "" then - Parse_error.should_not_be_empty ~what:(Token.describe token) - location - |> add_warning input; - let tag = - match tag with - | `Author _ -> `Author s - | `Since _ -> `Since s - | `Version _ -> `Version s - | `Canonical _ -> - (* TODO The location is only approximate, as we need lexer - cooperation to get the real location. 
*) - let r_location = - Loc.nudge_start (String.length "@canonical ") location - in - `Canonical (Loc.at r_location s) - in - - let tag = Loc.at location (`Tag tag) in - consume_block_elements `After_text (tag :: acc) - | ( `Deprecated | `Return | `Children_order | `Short_title - | `Toc_status | `Order_category ) as tag -> - let content, _stream_head, where_in_line = - block_element_list (In_implicitly_ended `Tag) - ~parent_markup:token input - in - let tag = - match tag with - | `Deprecated -> `Deprecated content - | `Toc_status -> `Toc_status content - | `Return -> `Return content - | `Children_order -> `Children_order content - | `Short_title -> `Short_title content - | `Order_category -> `Order_category content - in - let location = - location :: List.map Loc.location content |> Loc.span - in - let tag = Loc.at location (`Tag tag) in - consume_block_elements where_in_line (tag :: acc) - | (`Param _ | `Raise _ | `Before _) as tag -> - let content, _stream_head, where_in_line = - block_element_list (In_implicitly_ended `Tag) - ~parent_markup:token input - in - let tag = - match tag with - | `Param s -> `Param (s, content) - | `Raise s -> `Raise (s, content) - | `Before s -> `Before (s, content) - in - let location = - location :: List.map Loc.location content |> Loc.span - in - let tag = Loc.at location (`Tag tag) in - consume_block_elements where_in_line (tag :: acc) - | `See (kind, target) -> - let content, _next_token, where_in_line = - block_element_list (In_implicitly_ended `Tag) - ~parent_markup:token input - in - let location = - location :: List.map Loc.location content |> Loc.span - in - let tag = `Tag (`See (kind, target, content)) in - let tag = Loc.at location tag in - consume_block_elements where_in_line (tag :: acc) - | (`Inline | `Open | `Closed | `Hidden) as tag -> - let tag = Loc.at location (`Tag tag) in - consume_block_elements `After_text (tag :: acc))) - | ( { value = #token_that_always_begins_an_inline_element; _ } - | { value = `Bar; _ } ) as next_token -> - warn_if_after_text next_token; - - let block = paragraph input in - let block = Loc.map (accepted_in_all_contexts context) block in - let acc = block :: acc in - consume_block_elements `After_text acc - | { value = `Verbatim s as token; location } as next_token -> - warn_if_after_text next_token; - if s = "" then - Parse_error.should_not_be_empty ~what:(Token.describe token) location - |> add_warning input; - - junk input; - let block = accepted_in_all_contexts context token in - let block = Loc.at location block in - let acc = block :: acc in - consume_block_elements `After_text acc - | { value = `Math_block s as token; location } as next_token -> - warn_if_after_text next_token; - if s = "" then - Parse_error.should_not_be_empty ~what:(Token.describe token) location - |> add_warning input; - - junk input; - let block = accepted_in_all_contexts context token in - let block = Loc.at location block in - let acc = block :: acc in - consume_block_elements `After_text acc - | { - value = - `Code_block (meta, delim, { value = s; location = v_loc }, has_outputs) - as token; - location; - } as next_token -> - warn_if_after_text next_token; - junk input; - let delimiter = if delim = "" then None else Some delim in - let output, location = - if not has_outputs then (None, location) - else - let content, next_token, _where_in_line = - block_element_list In_code_results ~parent_markup:token input - in - junk input; - let locations = - location :: List.map (fun content -> content.Loc.location) content - in - let location = 
Loc.span locations in - let location = { location with end_ = next_token.location.end_ } in - (Some content, location) - in - - if s = "" then - Parse_error.should_not_be_empty ~what:(Token.describe token) location - |> add_warning input; - - let meta = - match meta with - | None -> None - | Some (language, tags) -> Some { Ast.language; tags } - in - let block = - accepted_in_all_contexts context - (`Code_block - { - Ast.meta; - delimiter; - content = { value = s; location = v_loc }; - output; - }) - in - let block = Loc.at location block in - let acc = block :: acc in - consume_block_elements `After_text acc - | { value = `Modules s as token; location } as next_token -> - warn_if_after_text next_token; - - junk input; - - (* TODO Use some library for a splitting function, or move this out into a - Util module. *) - let split_string delimiters s = - let rec scan_delimiters acc index = - if index >= String.length s then List.rev acc - else if String.contains delimiters s.[index] then - scan_delimiters acc (index + 1) - else scan_word acc index (index + 1) - and scan_word acc start_index index = - if index >= String.length s then - let word = String.sub s start_index (index - start_index) in - List.rev (word :: acc) - else if String.contains delimiters s.[index] then - let word = String.sub s start_index (index - start_index) in - scan_delimiters (word :: acc) (index + 1) - else scan_word acc start_index (index + 1) - in - - scan_delimiters [] 0 - in - - (* TODO Correct locations await a full implementation of {!modules} - parsing. *) - let modules = - split_string " \t\r\n" s |> List.map (fun r -> Loc.at location r) - in - - if modules = [] then - Parse_error.should_not_be_empty ~what:(Token.describe token) location - |> add_warning input; - - let block = accepted_in_all_contexts context (`Modules modules) in - let block = Loc.at location block in - let acc = block :: acc in - consume_block_elements `After_text acc - | { value = `Begin_list kind as token; location } as next_token -> - warn_if_after_text next_token; - - junk input; - - let items, brace_location = - explicit_list_items ~parent_markup:token input - in - if items = [] then - Parse_error.should_not_be_empty ~what:(Token.describe token) location - |> add_warning input; - - let location = Loc.span [ location; brace_location ] in - let block = `List (kind, `Heavy, items) in - let block = accepted_in_all_contexts context block in - let block = Loc.at location block in - let acc = block :: acc in - consume_block_elements `After_text acc - | { value = (`Begin_table_light | `Begin_table_heavy) as token; location } - as next_token -> - warn_if_after_text next_token; - junk input; - let block, brace_location = - let parent_markup = token in - let parent_markup_location = location in - match token with - | `Begin_table_light -> - light_table input ~parent_markup ~parent_markup_location - | `Begin_table_heavy -> - heavy_table input ~parent_markup ~parent_markup_location - in - let location = Loc.span [ location; brace_location ] in - let block = accepted_in_all_contexts context (`Table block) in - let block = Loc.at location block in - let acc = block :: acc in - consume_block_elements `After_text acc - | { value = (`Minus | `Plus) as token; location } as next_token -> ( - (match where_in_line with - | `After_text | `After_shorthand_bullet -> - Parse_error.should_begin_on_its_own_line - ~what:(Token.describe token) location - |> add_warning input - | _ -> ()); - - match context with - | In_implicitly_ended `Shorthand_list -> - (List.rev 
acc, next_token, where_in_line) - | _ -> - let items, where_in_line = - shorthand_list_items next_token where_in_line input - in - let kind = - match token with `Minus -> `Unordered | `Plus -> `Ordered - in - let location = - location :: List.map Loc.location (List.flatten items) |> Loc.span - in - let block = `List (kind, `Light, items) in - let block = accepted_in_all_contexts context block in - let block = Loc.at location block in - let acc = block :: acc in - consume_block_elements where_in_line acc) - | { value = `Begin_section_heading (level, label) as token; location } as - next_token -> ( - let recover_when_not_at_top_level context = - warn_because_not_at_top_level next_token; - junk input; - let content, brace_location = - delimited_inline_element_list ~parent_markup:token - ~parent_markup_location:location ~requires_leading_whitespace:true - input - in - let location = Loc.span [ location; brace_location ] in - let paragraph = - `Paragraph content - |> accepted_in_all_contexts context - |> Loc.at location - in - consume_block_elements `At_start_of_line (paragraph :: acc) - in - - match context with - | In_implicitly_ended (`Tag | `Shorthand_list) -> - if where_in_line = `At_start_of_line then - (List.rev acc, next_token, where_in_line) - else recover_when_not_at_top_level context - | In_explicit_list -> recover_when_not_at_top_level context - | In_table_cell -> recover_when_not_at_top_level context - | In_code_results -> recover_when_not_at_top_level context - | Top_level -> - if where_in_line <> `At_start_of_line then - Parse_error.should_begin_on_its_own_line - ~what:(Token.describe token) location - |> add_warning input; - - let label = - match label with - | Some "" -> - Parse_error.should_not_be_empty ~what:"heading label" location - |> add_warning input; - None - | _ -> label - in - - junk input; - - let content, brace_location = - delimited_inline_element_list ~parent_markup:token - ~parent_markup_location:location - ~requires_leading_whitespace:true input - in - if content = [] then - Parse_error.should_not_be_empty ~what:(Token.describe token) - location - |> add_warning input; - - let location = Loc.span [ location; brace_location ] in - let heading = `Heading (level, label, content) in - let heading = Loc.at location heading in - let acc = heading :: acc in - consume_block_elements `After_text acc) - | { value = `Begin_paragraph_style _ as token; location } -> - junk input; - let content, brace_location = - delimited_inline_element_list ~parent_markup:token - ~parent_markup_location:location ~requires_leading_whitespace:true - input - in - let location = Loc.span [ location; brace_location ] in - - Parse_error.markup_should_not_be_used ~what:(Token.describe token) - location - |> add_warning input; - - let paragraph = - `Paragraph content - |> accepted_in_all_contexts context - |> Loc.at location - in - consume_block_elements `At_start_of_line (paragraph :: acc) - | { - location; - value = `Media_with_replacement_text (href, media, content) as token; - } -> - junk input; - - let r_location = - Loc.nudge_start - (String.length @@ Token.s_of_media `Replaced media) - location - |> Loc.nudge_end (String.length content + 1) - (* +1 for closing character *) - in - let c_location = - Loc.nudge_start - (String.length (Token.s_of_media `Replaced media) - + String.length (match href with `Reference s | `Link s -> s)) - location - |> Loc.nudge_end 1 - in - let content = String.trim content in - let href = href |> Loc.at r_location in - - if content = "" then - 
Parse_error.should_not_be_empty ~what:(Token.describe token) - c_location - |> add_warning input; - - let block = `Media (`Simple, href, content, media) in - let block = accepted_in_all_contexts context block in - let block = Loc.at location block in - let acc = block :: acc in - consume_block_elements `After_text acc - | { location; value = `Simple_media (href, media) } -> - junk input; - - let r_location = - Loc.nudge_start - (String.length @@ Token.s_of_media `Simple media) - location - |> Loc.nudge_end 1 - in - let href = href |> Loc.at r_location in - let block = `Media (`Simple, href, "", media) in - let block = accepted_in_all_contexts context block in - let block = Loc.at location block in - let acc = block :: acc in - consume_block_elements `After_text acc - in - - let where_in_line = - match context with - | Top_level -> `At_start_of_line - | In_implicitly_ended `Shorthand_list -> `After_shorthand_bullet - | In_explicit_list -> `After_explicit_list_bullet - | In_table_cell -> `After_table_cell - | In_code_results -> `After_tag - | In_implicitly_ended `Tag -> `After_tag - in - - consume_block_elements where_in_line [] - -(* {3 Lists} *) - -(* Consumes a sequence of implicit list items. Each one consists of a [`Minus] - or [`Plus] token, followed by block elements until: - - - a blank line, or - - a list bullet of the opposite kind (e.g. [`Plus] for a [`Minus] list). - - This function is called when the next token is known to be [`Minus] or - [`Plus]. It consumes that token, and calls the block element parser (see - above). That parser returns to [implicit_list_items] only on [`Blank_line], - [`End], [`Minus] or [`Plus] at the start of a line, or [`Right_brace]. *) -and shorthand_list_items : - [ `Minus | `Plus ] with_location -> - where_in_line -> - input -> - Ast.nestable_block_element with_location list list * where_in_line = - fun first_token where_in_line input -> - let bullet_token = first_token.value in - - let rec consume_list_items : - [> ] with_location -> - where_in_line -> - Ast.nestable_block_element with_location list list -> - Ast.nestable_block_element with_location list list * where_in_line = - fun next_token where_in_line acc -> - match next_token.value with - | `End | `Right_brace | `Blank_line _ | `Tag _ | `Begin_section_heading _ - | `Simple_media _ | `Media_with_replacement_text _ -> - (List.rev acc, where_in_line) - | (`Minus | `Plus) as bullet -> - if bullet = bullet_token then ( - junk input; - - let content, stream_head, where_in_line = - block_element_list (In_implicitly_ended `Shorthand_list) - ~parent_markup:bullet input - in - if content = [] then - Parse_error.should_not_be_empty ~what:(Token.describe bullet) - next_token.location - |> add_warning input; - - let acc = content :: acc in - consume_list_items stream_head where_in_line acc) - else (List.rev acc, where_in_line) - in - - consume_list_items - (first_token :> stopped_implicitly with_location) - where_in_line [] - -(* Consumes a sequence of explicit list items (starting with '{li ...}' and - '{-...}', which are represented by [`Begin_list_item _] tokens). - - This function is called immediately after '{ul' or '{ol' ([`Begin_list _]) is - read. The only "valid" way to exit is by reading a [`Right_brace] token, - which is consumed. - - Whitespace inside the list, but outside list items, is not significant – this - parsing function consumes all of it. Otherwise, only list item start tokens - are accepted. Everything else is an error. 
*) -and explicit_list_items : - parent_markup:[< Token.t ] -> - input -> - Ast.nestable_block_element with_location list list * Loc.span = - fun ~parent_markup input -> - let rec consume_list_items : - Ast.nestable_block_element with_location list list -> - Ast.nestable_block_element with_location list list * Loc.span = - fun acc -> - let next_token = peek input in - match next_token.value with - | `End -> - Parse_error.end_not_allowed next_token.location - ~in_what:(Token.describe parent_markup) - |> add_warning input; - (List.rev acc, next_token.location) - | `Right_brace -> - junk input; - (List.rev acc, next_token.location) - | `Space _ | `Single_newline _ | `Blank_line _ -> - junk input; - consume_list_items acc - | `Begin_list_item kind as token -> - junk input; - - (* '{li', represented by [`Begin_list_item `Li], must be followed by - whitespace. *) - (if kind = `Li then - match (peek input).value with - | `Space _ | `Single_newline _ | `Blank_line _ | `Right_brace -> - () - (* The presence of [`Right_brace] above requires some explanation: - - - It is better to be silent about missing whitespace if the next - token is [`Right_brace], because the error about an empty list - item will be generated below, and that error is more important to - the user. - - The [`Right_brace] token also happens to include all whitespace - before it, as a convenience for the rest of the parser. As a - result, not ignoring it could be wrong: there could in fact be - whitespace in the concrete syntax immediately after '{li', just - it is not represented as [`Space], [`Single_newline], or - [`Blank_line]. *) - | _ -> - Parse_error.should_be_followed_by_whitespace next_token.location - ~what:(Token.print token) - |> add_warning input); - - let content, token_after_list_item, _where_in_line = - block_element_list In_explicit_list ~parent_markup:token input - in - - if content = [] then - Parse_error.should_not_be_empty next_token.location - ~what:(Token.describe token) - |> add_warning input; - - (match token_after_list_item.value with - | `Right_brace -> junk input - | `End -> - Parse_error.end_not_allowed token_after_list_item.location - ~in_what:(Token.describe token) - |> add_warning input); - - let acc = content :: acc in - consume_list_items acc - | token -> - let suggestion = - match token with - | `Begin_section_heading _ | `Tag _ -> - Printf.sprintf "move %s outside the list." (Token.describe token) - | _ -> - Printf.sprintf "move %s into a list item, %s or %s." - (Token.describe token) - (Token.print (`Begin_list_item `Li)) - (Token.print (`Begin_list_item `Dash)) - in - Parse_error.not_allowed next_token.location ~what:(Token.describe token) - ~in_what:(Token.describe parent_markup) - ~suggestion - |> add_warning input; - - junk input; - consume_list_items acc - in - - consume_list_items [] - -(* Consumes a sequence of table rows that might start with [`Bar]. - - This function is called immediately after '{t' ([`Begin_table `Light]) is - read. The only "valid" way to exit is by reading a [`Right_brace] token, - which is consumed. 
*) -and light_table ~parent_markup ~parent_markup_location input = - let rec consume_rows acc ~last_loc = - Reader.until_rbrace_or_eof input acc >>> fun next_token -> - match next_token.Loc.value with - | `Bar | #token_that_always_begins_an_inline_element -> ( - let next, row, last_loc = - light_table_row ~parent_markup ~last_loc input - in - match next with - | `Continue -> consume_rows (row :: acc) ~last_loc - | `Stop -> (row :: acc, last_loc)) - | other_token -> - Parse_error.not_allowed next_token.location - ~what:(Token.describe other_token) - ~in_what:(Token.describe parent_markup) - |> add_warning input; - junk input; - consume_rows acc ~last_loc - in - let rows, brace_location = consume_rows [] ~last_loc:parent_markup_location in - let grid = List.rev rows in - (Table.Light_syntax.from_raw_data grid, brace_location) - -(* Consumes a table row that might start with [`Bar]. *) -and light_table_row ~parent_markup ~last_loc input = - let rec consume_row acc_row acc_cell acc_space ~new_line ~last_loc = - let push_cells row cell = - match cell with [] -> row | _ -> List.rev cell :: row - in - let return row cell = List.rev (push_cells row cell) in - let next_token = peek input in - match next_token.value with - | `End -> - Parse_error.end_not_allowed next_token.location ~in_what:"table" - |> add_warning input; - junk input; - (`Stop, return acc_row acc_cell, next_token.location) - | `Right_brace -> - junk input; - (`Stop, return acc_row acc_cell, next_token.location) - | `Space _ as token -> - junk input; - let i = Loc.at next_token.location token in - consume_row acc_row acc_cell (i :: acc_space) ~new_line ~last_loc - | `Single_newline _ | `Blank_line _ -> - junk input; - (`Continue, return acc_row acc_cell, last_loc) - | `Bar -> - junk input; - let acc_row = if new_line then [] else List.rev acc_cell :: acc_row in - consume_row acc_row [] [] ~new_line:false ~last_loc - | #token_that_always_begins_an_inline_element as token -> - let i = inline_element input next_token.location token in - if Loc.spans_multiple_lines i then - Parse_error.not_allowed - ~what:(Token.describe (`Single_newline "")) - ~in_what:(Token.describe `Begin_table_light) - i.location - |> add_warning input; - let acc_cell = - if acc_cell = [] then [ i ] else (i :: acc_space) @ acc_cell - in - consume_row acc_row acc_cell [] ~new_line:false - ~last_loc:next_token.location - | other_token -> - Parse_error.not_allowed next_token.location - ~what:(Token.describe other_token) - ~in_what:(Token.describe parent_markup) - |> add_warning input; - junk input; - consume_row acc_row acc_cell acc_space ~new_line ~last_loc - in - consume_row [] [] [] ~new_line:true ~last_loc - -(* Consumes a sequence of table rows (starting with '{tr ...}', which are - represented by [`Begin_table_row] tokens). - - This function is called immediately after '{table' ([`Begin_table `Heavy]) is - read. The only "valid" way to exit is by reading a [`Right_brace] token, - which is consumed. 
*) -and heavy_table ~parent_markup ~parent_markup_location input = - let rec consume_rows acc ~last_loc = - Reader.until_rbrace_or_eof input acc >>> fun next_token -> - match next_token.Loc.value with - | `Begin_table_row as token -> - junk input; - let items, last_loc = heavy_table_row ~parent_markup:token input in - consume_rows (List.rev items :: acc) ~last_loc - | token -> - Parse_error.not_allowed next_token.location ~what:(Token.describe token) - ~in_what:(Token.describe parent_markup) - ~suggestion:"Move outside of {table ...}, or inside {tr ...}" - |> add_warning input; - junk input; - consume_rows acc ~last_loc - in - let rows, brace_location = consume_rows [] ~last_loc:parent_markup_location in - let grid = List.rev rows in - (Table.Heavy_syntax.from_grid grid, brace_location) - -(* Consumes a sequence of table cells (starting with '{th ...}' or '{td ... }', - which are represented by [`Begin_table_cell] tokens). - - This function is called immediately after '{tr' ([`Begin_table_row]) is - read. The only "valid" way to exit is by reading a [`Right_brace] token, - which is consumed. *) -and heavy_table_row ~parent_markup input = - let rec consume_cell_items acc = - Reader.until_rbrace_or_eof input acc >>> fun next_token -> - match next_token.Loc.value with - | `Begin_table_cell kind as token -> - junk input; - let content, token_after_list_item, _where_in_line = - block_element_list In_table_cell ~parent_markup:token input - in - (match token_after_list_item.value with - | `Right_brace -> junk input - | `End -> - Parse_error.not_allowed token_after_list_item.location - ~what:(Token.describe `End) ~in_what:(Token.describe token) - |> add_warning input); - consume_cell_items ((content, kind) :: acc) - | token -> - Parse_error.not_allowed next_token.location ~what:(Token.describe token) - ~in_what:(Token.describe parent_markup) - ~suggestion: - "Move outside of {table ...}, or inside {td ...} or {th ...}" - |> add_warning input; - junk input; - consume_cell_items acc - in - consume_cell_items [] - -(* {2 Entry point} *) - -let parse warnings tokens = - let input : input = { tokens; warnings } in - - let rec parse_block_elements () = - let elements, last_token, _where_in_line = - block_element_list Top_level ~parent_markup:`Comment input - in - - match last_token.value with - | `End -> elements - | `Right_brace -> - Parse_error.unpaired_right_brace last_token.location - |> add_warning input; - - let block = - Loc.same last_token (`Paragraph [ Loc.same last_token (`Word "}") ]) - in - - junk input; - elements @ (block :: parse_block_elements ()) - in - let ast = parse_block_elements () in - (ast, List.rev !(input.warnings)) diff --git a/src/parser/syntax.mli b/src/parser/syntax.mli deleted file mode 100644 index a40b698410..0000000000 --- a/src/parser/syntax.mli +++ /dev/null @@ -1,6 +0,0 @@ -(* Internal module, not exposed *) - -val parse : - Warning.t list ref -> - Token.t Loc.with_location Stream.t -> - Ast.t * Warning.t list diff --git a/src/parser/test/serialize.ml b/src/parser/test/serialize.ml new file mode 100644 index 0000000000..7449d1545c --- /dev/null +++ b/src/parser/test/serialize.ml @@ -0,0 +1,180 @@ +open Odoc_parser + +type sexp = Sexplib0.Sexp.t = Atom of string | List of sexp list + +module Location_to_sexp = struct + let point : Loc.point -> sexp = + fun { line; column } -> + List [ Atom (string_of_int line); Atom (string_of_int column) ] + + let span : Loc.span -> sexp = + fun { file; start; end_ } -> List [ Atom file; point start; point end_ ] + + let at : ('a -> 
sexp) -> 'a Loc.with_location -> sexp = + fun f { location; value } -> List [ span location; f value ] +end + +module Ast_to_sexp = struct + (* let at = Location_to_sexp.at *) + type at = { at : 'a. ('a -> sexp) -> 'a Loc.with_location -> sexp } + + let loc_at = { at = Location_to_sexp.at } + let str s = Atom s + let opt f s = match s with Some s -> List [ f s ] | None -> List [] + + let style : Ast.style -> sexp = function + | `Bold -> Atom "bold" + | `Italic -> Atom "italic" + | `Emphasis -> Atom "emphasis" + | `Superscript -> Atom "superscript" + | `Subscript -> Atom "subscript" + + let alignment : Ast.alignment option -> sexp = function + | Some `Left -> Atom "left" + | Some `Center -> Atom "center" + | Some `Right -> Atom "right" + | None -> Atom "default" + + let reference_kind : Ast.reference_kind -> sexp = function + | `Simple -> Atom "simple" + | `With_text -> Atom "with_text" + + let rec inline_element at : Ast.inline_element -> sexp = function + | `Space _ -> Atom "space" + | `Word w -> List [ Atom "word"; Atom w ] + | `Code_span c -> List [ Atom "code_span"; Atom c ] + | `Raw_markup (target, s) -> + List [ Atom "raw_markup"; opt str target; Atom s ] + | `Math_span s -> List [ Atom "math_span"; Atom s ] + | `Styled (s, es) -> + List [ style s; List (List.map (at.at (inline_element at)) es) ] + | `Reference (kind, r, es) -> + List + [ + reference_kind kind; + at.at str r; + List (List.map (at.at (inline_element at)) es); + ] + | `Link (u, es) -> + List [ str u; List (List.map (at.at (inline_element at)) es) ] + + let code_block_lang at { Ast.language; tags } = + List [ at.at str language; opt (at.at str) tags ] + + let media : Ast.media -> sexp = function + | `Image -> Atom "image" + | `Video -> Atom "video" + | `Audio -> Atom "audio" + + let media_href = function + | `Reference href -> List [ Atom "Reference"; Atom href ] + | `Link href -> List [ Atom "Link"; Atom href ] + + let rec nestable_block_element at : Ast.nestable_block_element -> sexp = + function + | `Paragraph es -> + List + [ Atom "paragraph"; List (List.map (at.at (inline_element at)) es) ] + | `Math_block s -> List [ Atom "math_block"; Atom s ] + | `Code_block { Ast.meta = None; content; output = None; _ } -> + List [ Atom "code_block"; at.at str content ] + | `Code_block { meta = Some meta; content; output = None; _ } -> + List [ Atom "code_block"; code_block_lang at meta; at.at str content ] + | `Code_block { meta = Some meta; content; output = Some output; _ } -> + List + [ + Atom "code_block"; + code_block_lang at meta; + at.at str content; + List + (List.map (fun v -> nestable_block_element at v.Loc.value) output); + ] + | `Code_block { meta = None; content = _; output = Some _output; _ } -> + List [ Atom "code_block_err" ] + | `Verbatim t -> List [ Atom "verbatim"; Atom t ] + | `Modules ps -> List [ Atom "modules"; List (List.map (at.at str) ps) ] + | `List (kind, weight, items) -> + let kind = + match kind with `Unordered -> "unordered" | `Ordered -> "ordered" + in + let weight = + match weight with `Light -> "light" | `Heavy -> "heavy" + in + let items = + items + |> List.map (fun item -> + List (List.map (at.at (nestable_block_element at)) item)) + |> fun items -> List items + in + List [ Atom kind; Atom weight; items ] + | `Table ((grid, align), s) -> + let syntax = function `Light -> "light" | `Heavy -> "heavy" in + let kind = function `Header -> "header" | `Data -> "data" in + let map name x f = List [ Atom name; List (List.map f x) ] in + let alignment = + match align with + | None -> List [ Atom 
"align"; Atom "no alignment" ] + | Some align -> map "align" align @@ alignment + in + List + [ + Atom "table"; + List [ Atom "syntax"; Atom (syntax s) ]; + ( map "grid" grid @@ fun row -> + map "row" row @@ fun (cell, k) -> + map (kind k) cell @@ at.at (nestable_block_element at) ); + alignment; + ] + | `Media (kind, href, c, m) -> + List [ reference_kind kind; at.at media_href href; Atom c; media m ] + + let tag at : Ast.tag -> sexp = function + | `Author s -> List [ Atom "@author"; Atom s ] + | `Deprecated es -> + List + (Atom "@deprecated" :: List.map (at.at (nestable_block_element at)) es) + | `Param (s, es) -> + List + ([ Atom "@param"; Atom s ] + @ List.map (at.at (nestable_block_element at)) es) + | `Raise (s, es) -> + List + ([ Atom "@raise"; Atom s ] + @ List.map (at.at (nestable_block_element at)) es) + | `Return es -> + List (Atom "@return" :: List.map (at.at (nestable_block_element at)) es) + | `See (kind, s, es) -> + let kind = + match kind with + | `Url -> "url" + | `File -> "file" + | `Document -> "document" + in + List + ([ Atom "@see"; Atom kind; Atom s ] + @ List.map (at.at (nestable_block_element at)) es) + | `Since s -> List [ Atom "@since"; Atom s ] + | `Before (s, es) -> + List + ([ Atom "@before"; Atom s ] + @ List.map (at.at (nestable_block_element at)) es) + | `Version s -> List [ Atom "@version"; Atom s ] + | `Canonical p -> List [ Atom "@canonical"; at.at str p ] + | `Inline -> Atom "@inline" + | `Open -> Atom "@open" + | `Closed -> Atom "@closed" + | `Hidden -> Atom "@hidden" + | _ -> failwith "TODO" + + let block_element at : Ast.block_element -> sexp = function + | #Ast.nestable_block_element as e -> nestable_block_element at e + | `Heading (level, label, es) -> + let label = List [ Atom "label"; opt str label ] in + let level = string_of_int level in + List + [ Atom level; label; List (List.map (at.at (inline_element at)) es) ] + | `Tag t -> tag at t + + let docs at : Ast.t -> sexp = + fun f -> List (List.map (at.at (block_element at)) f) +end diff --git a/src/parser/test/test.ml b/src/parser/test/test.ml index fcda47cc80..0b9b2b56f9 100644 --- a/src/parser/test/test.ml +++ b/src/parser/test/test.ml @@ -1,197 +1,7 @@ open Odoc_parser +open Serialize -type sexp = Sexplib0.Sexp.t = Atom of string | List of sexp list - -module Location_to_sexp = struct - let point : Loc.point -> sexp = - fun { line; column } -> - List [ Atom (string_of_int line); Atom (string_of_int column) ] - - let span : Loc.span -> sexp = - fun { file; start; end_ } -> List [ Atom file; point start; point end_ ] - - let at : ('a -> sexp) -> 'a Loc.with_location -> sexp = - fun f { location; value } -> List [ span location; f value ] -end - -module Ast_to_sexp = struct - (* let at = Location_to_sexp.at *) - type at = { at : 'a. 
('a -> sexp) -> 'a Loc.with_location -> sexp } - - let loc_at = { at = Location_to_sexp.at } - let str s = Atom s - let opt f s = match s with Some s -> List [ f s ] | None -> List [] - - let style : Ast.style -> sexp = function - | `Bold -> Atom "bold" - | `Italic -> Atom "italic" - | `Emphasis -> Atom "emphasis" - | `Superscript -> Atom "superscript" - | `Subscript -> Atom "subscript" - - let alignment : Ast.alignment option -> sexp = function - | Some `Left -> Atom "left" - | Some `Center -> Atom "center" - | Some `Right -> Atom "right" - | None -> Atom "default" - - let reference_kind : Ast.reference_kind -> sexp = function - | `Simple -> Atom "simple" - | `With_text -> Atom "with_text" - - let media : Ast.media -> sexp = function - | `Image -> Atom "image" - | `Video -> Atom "video" - | `Audio -> Atom "audio" - - let rec inline_element at : Ast.inline_element -> sexp = function - | `Space _ -> Atom "space" - | `Word w -> List [ Atom "word"; Atom w ] - | `Code_span c -> List [ Atom "code_span"; Atom c ] - | `Raw_markup (target, s) -> - List [ Atom "raw_markup"; opt str target; Atom s ] - | `Math_span s -> List [ Atom "math_span"; Atom s ] - | `Styled (s, es) -> - List [ style s; List (List.map (at.at (inline_element at)) es) ] - | `Reference (kind, r, es) -> - List - [ - reference_kind kind; - at.at str r; - List (List.map (at.at (inline_element at)) es); - ] - | `Link (u, es) -> - List [ str u; List (List.map (at.at (inline_element at)) es) ] - - let code_block_lang at { Ast.language; tags } = - List [ at.at str language; opt (at.at str) tags ] - - let media_href = function - | `Reference href -> List [ Atom "Reference"; Atom href ] - | `Link href -> List [ Atom "Link"; Atom href ] - - let rec nestable_block_element at : Ast.nestable_block_element -> sexp = - function - | `Paragraph es -> - List - [ Atom "paragraph"; List (List.map (at.at (inline_element at)) es) ] - | `Math_block s -> List [ Atom "math_block"; Atom s ] - | `Code_block { Ast.meta = None; content; output = None; _ } -> - List [ Atom "code_block"; at.at str content ] - | `Code_block { meta = Some meta; content; output = None; _ } -> - List [ Atom "code_block"; code_block_lang at meta; at.at str content ] - | `Code_block { meta = Some meta; content; output = Some output; _ } -> - List - [ - Atom "code_block"; - code_block_lang at meta; - at.at str content; - List - (List.map (fun v -> nestable_block_element at v.Loc.value) output); - ] - | `Code_block { meta = None; content = _; output = Some _output; _ } -> - List [ Atom "code_block_err" ] - | `Verbatim t -> List [ Atom "verbatim"; Atom t ] - | `Modules ps -> List [ Atom "modules"; List (List.map (at.at str) ps) ] - | `List (kind, weight, items) -> - let kind = - match kind with `Unordered -> "unordered" | `Ordered -> "ordered" - in - let weight = - match weight with `Light -> "light" | `Heavy -> "heavy" - in - let items = - items - |> List.map (fun item -> - List (List.map (at.at (nestable_block_element at)) item)) - |> fun items -> List items - in - List [ Atom kind; Atom weight; items ] - | `Table ((grid, align), s) -> - let syntax = function `Light -> "light" | `Heavy -> "heavy" in - let kind = function `Header -> "header" | `Data -> "data" in - let map name x f = List [ Atom name; List (List.map f x) ] in - let alignment = - match align with - | None -> List [ Atom "align"; Atom "no alignment" ] - | Some align -> map "align" align @@ alignment - in - List - [ - Atom "table"; - List [ Atom "syntax"; Atom (syntax s) ]; - ( map "grid" grid @@ fun row -> - map "row" 
row @@ fun (cell, k) -> - map (kind k) cell @@ at.at (nestable_block_element at) ); - alignment; - ] - | `Media (kind, href, c, m) -> - List [ reference_kind kind; at.at media_href href; Atom c; media m ] - - let tag at : Ast.tag -> sexp = function - | `Author s -> List [ Atom "@author"; Atom s ] - | `Deprecated es -> - List - (Atom "@deprecated" :: List.map (at.at (nestable_block_element at)) es) - | `Param (s, es) -> - List - ([ Atom "@param"; Atom s ] - @ List.map (at.at (nestable_block_element at)) es) - | `Raise (s, es) -> - List - ([ Atom "@raise"; Atom s ] - @ List.map (at.at (nestable_block_element at)) es) - | `Return es -> - List (Atom "@return" :: List.map (at.at (nestable_block_element at)) es) - | `Children_order es -> - List - (Atom "@children_order" - :: List.map (at.at (nestable_block_element at)) es) - | `Toc_status es -> - List - (Atom "@toc_status" :: List.map (at.at (nestable_block_element at)) es) - | `Order_category es -> - List - (Atom "@order_category" - :: List.map (at.at (nestable_block_element at)) es) - | `Short_title es -> - List - (Atom "@short_title" - :: List.map (at.at (nestable_block_element at)) es) - | `See (kind, s, es) -> - let kind = - match kind with - | `Url -> "url" - | `File -> "file" - | `Document -> "document" - in - List - ([ Atom "@see"; Atom kind; Atom s ] - @ List.map (at.at (nestable_block_element at)) es) - | `Since s -> List [ Atom "@since"; Atom s ] - | `Before (s, es) -> - List - ([ Atom "@before"; Atom s ] - @ List.map (at.at (nestable_block_element at)) es) - | `Version s -> List [ Atom "@version"; Atom s ] - | `Canonical p -> List [ Atom "@canonical"; at.at str p ] - | `Inline -> Atom "@inline" - | `Open -> Atom "@open" - | `Closed -> Atom "@closed" - | `Hidden -> Atom "@hidden" - - let block_element at : Ast.block_element -> sexp = function - | #Ast.nestable_block_element as e -> nestable_block_element at e - | `Heading (level, label, es) -> - let label = List [ Atom "label"; opt str label ] in - let level = string_of_int level in - List - [ Atom level; label; List (List.map (at.at (inline_element at)) es) ] - | `Tag t -> tag at t - - let docs at : Ast.t -> sexp = - fun f -> List (List.map (at.at (block_element at)) f) -end +module Serialize = Serialize let error err = Atom (Odoc_parser.Warning.to_string err) diff --git a/src/parser/test/test.mli b/src/parser/test/test.mli index 4f83ff600f..bad1943eff 100644 --- a/src/parser/test/test.mli +++ b/src/parser/test/test.mli @@ -1 +1,2 @@ val test : ?location:Odoc_parser.Loc.point -> string -> unit +module Serialize = Serialize diff --git a/src/parser/test_driver/dune b/src/parser/test_driver/dune new file mode 100644 index 0000000000..b567c975bf --- /dev/null +++ b/src/parser/test_driver/dune @@ -0,0 +1,5 @@ +(executable + (name tester) + (public_name tester) + (package odoc) + (libraries odoc_parser odoc_parser_test)) diff --git a/src/parser/test_driver/tester.ml b/src/parser/test_driver/tester.ml new file mode 100644 index 0000000000..dd65fe1c69 --- /dev/null +++ b/src/parser/test_driver/tester.ml @@ -0,0 +1,454 @@ +open! Odoc_parser_test + +module Loc = Odoc_parser.Loc +module Parser = Odoc_parser.Tester + +(* + NOTE (@FayCarsons): for anyone working on this parser - this is probably + the easiest way to check if something is working. The tests are numerous + and difficult to read. A test suite will fail in its entirety if one case + throws an exception. 
+ + If you need to test a specific parser rule or test case, add it here and + `dune exec tester -- {test cases you want to run}` +*) + +let code_cases = + [ + ("basic", "{[foo]}"); + ("empty", "{[]}"); + ("whitespace only", "{[ ]}"); + ("blank line only", "{[\n \n]}"); + ("whitespace", "{[foo bar]}"); + ("newline", "{[foo\nbar]}"); + ("carriage return", "{[foo\r\nbar]}"); + ("contains blank line", "{[foo\n\nbar]}"); + ("leading whitespace", "{[ foo]}"); + ] + +let error_recovery = + [ + ("Empty @author", "@author"); + ("Empty @version", "@version"); + ("Code block w/ blank line", "[\n\n]"); + ("Empty style", "{i}"); + ("Empty ref", "{{!www.google.com}}"); + ("Empty link", "{{:www.google.com}}"); + ("List item not at beginning of line", "- foo\n - bar\n- baz"); + ("Empty list item", "{ol {li} }"); + ("'{li' not followed by whitespace", "{ol {lifoo bar baz} }"); + ("End not allowed in table", "{t \n| -- | :--: |\n| a | b"); + ("Empty modules", "{!modules: }"); + ("EOI in modules", "{!modules: Foo Bar"); + ] + +let open_t = + [ + ("basic", "@open"); + ("prefix", "@openfoo"); + ("extra_whitespace", "@open"); + ("followed_by_junk", "@open foo"); + ("followed_by_paragraph", "@open\nfoo"); + ("followed_by_tag", "@open\n@deprecated"); + ("with_list", "@open - foo"); + ] + +let tags = + let raise_tests = + [ + ("raise basic", "@raise Foo"); + ("raise bare", "@raise"); + ("raise words", "@raise foo bar baz"); + ("raise prefix", "@raisefoo"); + ] + and return_tests = + [ + ("return basic", "@return"); + ("return words", "@return foo bar"); + ("return prefix", "@returnfoo"); + ] + and before_tests = + [ + ("before basic", "@before Foo"); + ("before bare", "@before"); + ("before words", "@before foo bar baz"); + ("before prefix", "@beforefoo"); + ] + and param_tests = + [ + ("param basic", "@param foo"); + ("param bare", "@param"); + ("param bare_with_whitespace", "@param"); + ("param immediate_newline", "@param\nfoo"); + ("param followed_by_whitespace", "@param foo"); + ("param extra_whitespace", "@param foo"); + ("param words", "@param foo bar baz"); + ("param multiline", "@param foo\nbar\nbaz"); + ("param paragraphs", "@param foo bar\n\nbaz"); + ("param two", "@param foo\n@param bar"); + ("param nested", "@param foo @param bar"); + ("param preceded_by_paragraph", "foo\n@param bar"); + ("param prefix", "@paramfoo"); + ("param after_code_block", "{[foo]} @param foo"); + ] + and deprecated_tests = + [ + ("basic", "@deprecated"); + ("words", "@deprecated foo bar"); + ("multiline", "@deprecated foo\nbar"); + ("paragraphs", "@deprecated foo\n\nbar"); + ("whitespace_only", "@deprecated"); + ("immediate_newline", "@deprecated\nfoo"); + ("immediate_cr_lf", "@deprecated\r\nfoo"); + ("immediate_blank_line", "@deprecated\n\nfoo"); + ("extra_whitespace", "@deprecated foo"); + ("followed_by_deprecated", "@deprecated foo\n@deprecated bar"); + ("followed_by_deprecated_cr_lf", "@deprecated foo\r\n@deprecated bar"); + ("nested_in_self", "@deprecated foo @deprecated bar"); + ("nested_in_self_at_start", "@deprecated @deprecated foo"); + ("preceded_by_paragraph", "foo\n@deprecated"); + ("preceded_by_shorthand_list", "- foo\n@deprecated"); + ("with_shorthand_list", "@deprecated - foo"); + ("with_shorthand_list_after_newline", "@deprecated\n- foo"); + ("prefix", "@deprecatedfoo"); + ("after_code_block", "{[foo]} @deprecated"); + ("followed_by_section", "@deprecated foo\n{2 Bar}"); + ] + in + raise_tests @ return_tests @ before_tests @ param_tests @ deprecated_tests + +let utf8 = + [ + ("lambda", "\xce\xbb"); + ("words", 
"\xce\xbb \xce\xbb"); + ("no_validation", "Î"); + ("escapes", "\xce\xbb\\}"); + ("newline", "\xce\xbb \n \xce\xbb"); + ("paragraphs", "\xce\xbb \n\n \xce\xbb"); + ("code_span", "[\xce\xbb]"); + ("minus", "\xce\xbb-\xce\xbb"); + ("shorthand_list", "- \xce\xbb"); + ("styled", "{b \xce\xbb}"); + ("reference_target", "{!\xce\xbb}"); + ("code_block", "{[\xce\xbb]}"); + ("verbatim", "{v \xce\xbb v}"); + ("label", "{2:\xce\xbb Bar}"); + ("author", "@author \xce\xbb"); + ("param", "@param \xce\xbb"); + ("raise", "@raise \xce\xbb"); + ("see", "@see <\xce\xbb>"); + ("since", "@since \xce\xbb"); + ("before", "@before \xce\xbb"); + ("version", "@version \xce\xbb"); + ("right_brace", "\xce\xbb}"); + ] + +let bad_markup = + [ + ("left_brace", "{"); + ("left_brace_with_letter", "{g"); + ("left_brace_with_letters", "{gg"); + ("empty_braces", "{}"); + ("left_space", "{ foo}"); + ("left_spaces", "{ foo}"); + ("left_space_eof", "{"); + ("braces_instead_of_brackets", "{foo}"); + ("right_brace", "}"); + ("right_brace_in_paragraph", "foo}"); + ("multiple_right_brace", "foo } bar } baz"); + ("right_brace_in_list_item", "- foo}"); + ("right_brace_in_code_span", "[foo}]"); + ("right_brace_in_code_block", "{[foo}]}"); + ("right_brace_in_verbatim_text", "{v foo} v}"); + ("right_brace_in_author", "@author Foo}"); + ("right_brace_in_deprecated", "@deprecated }"); + ("right_bracket", "]"); + ("right_bracket_in_paragraph", "foo]"); + ("right_bracket_in_shorthand_list", "- foo]"); + ("right_bracket_in_code_span", "[]]"); + ("right_bracket_in_style", "{b]}"); + ("right_bracket_in_verbatim", "{v ] v}"); + ("right_bracket_in_list", "{ul ]}"); + ("right_bracket_in_list_item", "{ul {li ]}}"); + ("right_bracket_in_heading", "{2 ]}"); + ("right_bracket_in_author", "@author Foo]"); + ("at", "@"); + ("cr", ""); + ] + +let dashes = + [ + ("minus_in_word", "foo-bar"); + ("minus_as_word", "foo -"); + ("plus_in_word", "foo+bar"); + ("plus_as_word", "foo +"); + ("bar_in_word", "foo|bar"); + ("escaped_bar_in_word", "foo\\|bar"); + ("bar_as_word", "foo |"); + ("negative_number", "-3.14 -1337"); + ("n_em_dash", "-- ---"); + ("minus_at", "-@"); + ("at_minus", "-@-"); + ("option", "--option"); + ] + +let light_table_tests = + [ + ("empty_table_light", "{t }"); + ("unclosed_table", "{t "); + ("simple", {| + {t + | a | + } + |}); + ("stars", "{t\n |a| *b*|\n |*c| d* |\n}"); + ("backquotes", "{t\n | `a |`\n}"); + ("no_header", "{t\n |---|---|\n | x | y |\n}"); + ("no_align", "{t\n | x | y |\n | x | y |\n}"); + ("only_align", "{t\n |--|--|\n}"); + ("no_data", "{t\n | x | y |\n |---|---|\n}"); + ("alignment", "{t\n | a | b | c | d |\n |---|:--|--:|:-:|\n}"); + ("no_bars", "{t\n a | b | c | d\n ---|:--|--:|:-:\n a | b | c | d\n}"); + ( "light_table_new_lines", + "{t\n\n\ + \ | a | b | c | d |\n\n\ + \ |---|---|---|---|\n\n\ + \ | a | b | c | d |\n\n\ + }" ); + ( "light_table_markup", + "{t\n\ + \ | {i a} {:google.com} \\t | | {m b} {e c} {% xyz %} | {b d} [foo] |\n\ + \ |---|---|---|---|\n\ + }" ); + ( "light_table_markup_with_newlines", + "{t | h1 | h2 |\n\ + \ |--------------|-------------|\n\ + \ | {e with\n\ + \ newlines} | {b d} [foo] |\n\ + }" ); + ("no_space", "{t\n | a | b |c| d |\n |---|--:|:--|:-:|\n}"); + ( "multiple_headers", + "{t\n||a|b|\n|:-|---:|\n|c|d|\n|cc|dd|\n|-:|:-:|\n|e|f|\n|g|h||\n}" ); + ("block_element_in_cell", "{t\n| {[ a ]} | b |\n|---|---|\n}"); + ("block_element_in_row", "{t\n{[ a ]}\n| a | b |\n|---|---|\n}"); + ("more_cells_later", "{t\n | x | y |\n |---|---|\n | x | y | z |\n}"); + ("less_cells_later", "{t\n | x 
| y |\n |---|---|\n x \n}"); + ( "multiple_word", + "{t\n\ + | Header and other word |\n\ + |-----------------------|\n\ + | cell and other words |\n\ + }" ); + ( "multiple_word_header", + "{t\n\ + | Header other word |\n\ + |-------------------|\n\ + | Header other word |\n\ + }" ); + ] + +let explicit_list_tests = + [ + ("basic", "{ul {li foo}}"); + ("ordered", "{ol {li foo}}"); + ("two_items", "{ul {li foo} {li bar}}"); + ("items_on_separate_lines", "{ul {li foo}\n{li bar}}"); + ("blank_line", "{ul {li foo}\n\n{li bar}}"); + ("blank_line_in_item", "{ul {li foo\n\nbar}}"); + ("junk", "{ul foo}"); + ("junk_with_no_whitespace", "{ulfoo}"); + ("empty", "{ul}"); + ("unterminated_list", "{ul"); + ("no_whitespace", "{ul{li foo}}"); + ("whitespace_at_end_of_item", "{ul {li foo\n\n\n}}"); + ("unterminated_li_syntax", "{ul {li foo"); + ("unterminated_left_curly_brace", "{ul {- foo"); + ("empty_li_styntax", "{ul {li }}"); + ("empty_left_curly_brace", "{ul {- }}"); + ("li_syntax_without_whitespace", "{ul {lifoo}}"); + ("li_syntax_followed_by_newline", "{ul {li\nfoo}}"); + ("li_syntax_followed_by_cr_lf", "{ul {li\r\nfoo}}"); + ("li_syntax_followed_by_blank_line", "{ul {li\n\nfoo}}"); + ("left_curly_brace_without_whitespace", "{ul {-foo}}"); + ("mixed_list_items", "{ul {li foo} {- bar}}"); + ("nested", "{ul {li {ul {li foo}}}}"); + ("shorthand_in_explicit", "{ul {li - foo\n- bar}}"); + ("explicit_in_shorthand", "- {ul {li foo}}"); + ("bare_li_syntax", "{li foo}"); + ("bare_left_curly_brace", "{- foo"); + ("after_code_block", "{[foo]} {ul {li bar}}"); + ] + +let isolated = + [ + ("tag in list", "- @deprecated foo bar baz"); + ("section in list", "- {2 Foo}"); + ("Paragraph after section", "{2 Foo} bar"); + ] + +(* Cases (mostly) taken from the 'odoc for library authors' document *) +let documentation_cases = + [ + ("Light list", "- foo\n- bar"); + ("Heavy list", "{ul {li foo} {li bar} {li baz}}"); + ("Simple ref", "{!Stdlib.Buffer}"); + ("Ref w/ replacement", "{{: https://ocaml.org/ }the OCaml website}"); + ("Modules", "{!modules: Foo Bar Baz}"); + ("Block tag", "@see bar baz quux\n"); + ("Inline tag", "@author Fay Carsons"); + ("Simple tag", "@open"); + ("Math", "{math \\sum_{i=0}^n x^i%}"); + ("Inline_math", "{m x + 4}"); + ( "Heavy table", + "{table\n\ + \ {tr\n\ + \ {th Header 1}\n\ + \ {th Header 2}\n\ + \ {th Header 3}\n\ + \ }\n\ + \ {tr\n\ + \ {td Cell 1}\n\ + \ {td Cell with {e emphasized content}}\n\ + \ {td {v a block v} }\n\ + \ }\n\ + \ }" ); + ( "Light table", + {|{t + | Header 1 | Header 2 | Header 3 | Header 4| + | :------: | --------:|:---------|---------| + | centered | right | left | default | + omitted | bar at | start and| finish + | {e emph} | and | unaligned | bars | + }|} + ); + ("Styled", "{i italicized}"); + ("Inline code", "[fmap Fun.id None]"); + ("Code block", "{[\n let foo = 0 \n]}"); + ] + +let all_tests = + code_cases @ error_recovery @ open_t @ tags @ utf8 @ bad_markup + @ documentation_cases @ explicit_list_tests + +open Test.Serialize + +let error err = Atom (Odoc_parser.Warning.to_string err) +let parser_output formatter (ast, warnings) = + let value = Ast_to_sexp.(docs loc_at ast) in + let warnings = List (List.map error warnings) in + let output = + List [ List [ Atom "output"; value ]; List [ Atom "warnings"; warnings ] ] + in + Sexplib0.Sexp.pp_hum formatter output; + Format.pp_print_flush formatter () + +type failure = { + exn : string; + label : string; + offending_token : Parser.token; + failure_index : int; + tokens : Parser.token list; +} + +let mkfailure 
label exn last failure_index tokens = + { offending_token = last; failure_index; tokens; exn; label } + +let run_test (label, case) = + let open Either in + let reversed_newlines = Parser.reversed_newlines ~input:case in + let lexbuf = Lexing.from_string case in + let file = "Tester" in + Lexing.set_filename lexbuf file; + let tokens = ref [] in + let input = + Parser.Lexer. + { + warnings = []; + offset_to_location = + Parser.offset_to_location ~reversed_newlines + ~comment_location:lexbuf.lex_curr_p; + file; + } + in + let failure_index = ref (-1) in + let offending_token = ref None in + let get_tok _ = + incr failure_index; + let tok = Parser.Lexer.token input lexbuf in + tokens := tok :: !tokens; + offending_token := Some tok; + tok + in + try + let ast, warnings = Parser.run ~input:case @@ Parser.main get_tok lexbuf in + let warnings = warnings @ input.warnings in + let output = Format.asprintf "%a" parser_output (ast, warnings) in + Left (label, output) + with e -> + let exns = Printexc.to_string e in + (* + let offending_token = List.hd @@ tokbuf.cache in + let tokens = List.map Loc.value @@ TokBuf.cache_rest tokbuf in + *) + Right + (mkfailure label exns + (Option.get !offending_token) + !failure_index (List.rev !tokens)) + +let sep = String.init 80 @@ Fun.const '-' + +let format_successes = + let go acc (label, output) = + Printf.sprintf "%s%s:\n%s\n%s\n" acc label output sep + in + List.fold_left go "" + +let failure_string { exn; label; offending_token; tokens; failure_index } = + Printf.sprintf + {|>>> Case '%s' failed with exn: <<< +%s +Offending token: +%d: '%s' +Tokens: +%s|} + label exn failure_index + (Parser.string_of_token offending_token) + (List.fold_left + (fun acc t -> acc ^ "\n" ^ Parser.string_of_token t) + "" tokens) + +let format_failures = + let go acc x = acc ^ "\n" ^ sep ^ "\n" ^ x in + List.fold_left go "" + +let () = + let cases = + if Array.length Sys.argv > 1 then ( + match Sys.argv.(1) with + | "code" -> code_cases + | "recovery" | "r" -> error_recovery + | "docs" | "d" -> documentation_cases + | "open" | "o" -> open_t + | "utf8" | "u" -> utf8 + | "isolated" | "i" -> isolated + | "tags" | "t" -> tags + | "light" | "lt" -> light_table_tests + | "all" -> all_tests + | "dashes" | "ds" -> dashes + | "list" | "l" -> explicit_list_tests + | _ -> + print_endline "unrecognized argument - running documentation_cases"; + documentation_cases) + else bad_markup + in + let sucesses, failures = List.partition_map run_test cases in + let sucesses = format_successes sucesses in + let failures = format_failures @@ List.map failure_string failures in + Printf.printf + {|<<<<----SUCCESS---->>>> +%s + +<<<<----FAILURE---->>>> + %s + |} + sucesses failures diff --git a/src/parser/token.ml b/src/parser/token.ml deleted file mode 100644 index ac23cd76ba..0000000000 --- a/src/parser/token.ml +++ /dev/null @@ -1,256 +0,0 @@ -(* This module contains the token type, emitted by the lexer, and consumed by - the comment syntax parser. It also contains two functions that format tokens - for error messages. 
*) - -type section_heading = [ `Begin_section_heading of int * string option ] -type style = [ `Bold | `Italic | `Emphasis | `Superscript | `Subscript ] -type paragraph_style = [ `Left | `Center | `Right ] - -type tag = - [ `Tag of - [ `Author of string - | `Deprecated - | `Param of string - | `Raise of string - | `Return - | `See of [ `Url | `File | `Document ] * string - | `Since of string - | `Before of string - | `Version of string - | `Canonical of string - | `Children_order - | `Toc_status - | `Order_category - | `Short_title - | `Inline - | `Open - | `Closed - | `Hidden ] ] - -type media = [ `Audio | `Video | `Image ] -type media_href = [ `Reference of string | `Link of string ] - -type media_markup = - [ `Simple_media of media_href * media - | `Media_with_replacement_text of media_href * media * string ] - -let s_of_media kind media = - match (kind, media) with - | `Simple, `Audio -> "{audio!" - | `Simple, `Video -> "{video!" - | `Simple, `Image -> "{image!" - | `Replaced, `Audio -> "{{audio!" - | `Replaced, `Video -> "{{video!" - | `Replaced, `Image -> "{{image!" - -type t = - [ (* End of input. *) - `End - | (* Runs of whitespace. [Blank_line] is any run of whitespace that contains two - or more newline characters. [Single_newline] is any run of whitespace that - contains exactly one newline character. [Space] is any run of whitespace - that contains no newline characters. - - It is an important invariant in the parser that no adjacent whitespace - tokens are emitted by the lexer. Otherwise, there would be the need for - unbounded lookahead, a (co-?)ambiguity between - [Single_newline Single_newline] and [Blank_line], and other problems. *) - `Space of - string - | `Single_newline of string - | `Blank_line of string - | (* A right curly brace ([}]), i.e. end of markup. *) - `Right_brace - | `Right_code_delimiter - | (* Words are anything that is not whitespace or markup. Markup symbols can be - be part of words if escaped. - - Words can contain plus and minus symbols, but those are emitted as [Plus] - and [Minus] tokens. The parser combines plus and minus into words, except - when they appear first on a line, in which case the tokens are list item - bullets. *) - `Word of - string - | `Code_span of string - | `Raw_markup of string option * string - | `Math_span of string - | `Math_block of string - | `Begin_style of style - | `Begin_paragraph_style of paragraph_style - | (* Other inline element markup. *) - `Simple_reference of string - | `Begin_reference_with_replacement_text of string - | `Simple_link of string - | `Begin_link_with_replacement_text of string - | media_markup - | (* Leaf block element markup. *) - `Code_block of - (string Loc.with_location * string Loc.with_location option) option - * string - * string Loc.with_location - * bool - | `Verbatim of string - | `Modules of string - | (* List markup. *) - `Begin_list of [ `Unordered | `Ordered ] - | `Begin_list_item of [ `Li | `Dash ] - | (* Table markup. 
*) - `Begin_table_light - | `Begin_table_heavy - | `Begin_table_row - | `Begin_table_cell of [ `Header | `Data ] - | `Minus - | `Plus - | `Bar - | section_heading - | tag ] - -let print : [< t ] -> string = function - | `Begin_paragraph_style `Left -> "'{L'" - | `Begin_paragraph_style `Center -> "'{C'" - | `Begin_paragraph_style `Right -> "'{R'" - | `Begin_style `Bold -> "'{b'" - | `Begin_style `Italic -> "'{i'" - | `Begin_style `Emphasis -> "'{e'" - | `Begin_style `Superscript -> "'{^'" - | `Begin_style `Subscript -> "'{_'" - | `Begin_reference_with_replacement_text _ -> "'{{!'" - | `Begin_link_with_replacement_text _ -> "'{{:'" - | `Begin_list_item `Li -> "'{li ...}'" - | `Begin_list_item `Dash -> "'{- ...}'" - | `Begin_table_light -> "{t" - | `Begin_table_heavy -> "{table" - | `Begin_table_row -> "'{tr'" - | `Begin_table_cell `Header -> "'{th'" - | `Begin_table_cell `Data -> "'{td'" - | `Minus -> "'-'" - | `Plus -> "'+'" - | `Bar -> "'|'" - | `Begin_section_heading (level, label) -> - let label = match label with None -> "" | Some label -> ":" ^ label in - Printf.sprintf "'{%i%s'" level label - | `Tag (`Author _) -> "'@author'" - | `Tag `Deprecated -> "'@deprecated'" - | `Tag (`Param _) -> "'@param'" - | `Tag (`Raise _) -> "'@raise'" - | `Tag `Return -> "'@return'" - | `Tag `Children_order -> "'@children_order'" - | `Tag `Order_category -> "'@order_category'" - | `Tag `Toc_status -> "'@toc_status'" - | `Tag `Short_title -> "'@short_title'" - | `Tag (`See _) -> "'@see'" - | `Tag (`Since _) -> "'@since'" - | `Tag (`Before _) -> "'@before'" - | `Tag (`Version _) -> "'@version'" - | `Tag (`Canonical _) -> "'@canonical'" - | `Tag `Inline -> "'@inline'" - | `Tag `Open -> "'@open'" - | `Tag `Closed -> "'@closed'" - | `Tag `Hidden -> "'@hidden" - | `Raw_markup (None, _) -> "'{%...%}'" - | `Raw_markup (Some target, _) -> "'{%" ^ target ^ ":...%}'" - | `Simple_media (`Reference _, `Image) -> "{image!...}" - | `Simple_media (`Reference _, `Audio) -> "{audio!...}" - | `Simple_media (`Reference _, `Video) -> "{video!...}" - | `Simple_media (`Link _, `Image) -> "{image:...}" - | `Simple_media (`Link _, `Audio) -> "{audio:...}" - | `Simple_media (`Link _, `Video) -> "{video:...}" - | `Media_with_replacement_text (`Reference _, `Image, _) -> - "{{image!...} ...}" - | `Media_with_replacement_text (`Reference _, `Audio, _) -> - "{{audio!...} ...}" - | `Media_with_replacement_text (`Reference _, `Video, _) -> - "{{video!...} ...}" - | `Media_with_replacement_text (`Link _, `Image, _) -> "{{image:...} ...}" - | `Media_with_replacement_text (`Link _, `Audio, _) -> "{{audio:...} ...}" - | `Media_with_replacement_text (`Link _, `Video, _) -> "{{video:...} ...}" - -(* [`Minus] and [`Plus] are interpreted as if they start list items. Therefore, - for error messages based on [Token.describe] to be accurate, formatted - [`Minus] and [`Plus] should always be plausibly list item bullets. 
*) -let describe : [< t | `Comment ] -> string = function - | `Word w -> Printf.sprintf "'%s'" w - | `Code_span _ -> "'[...]' (code)" - | `Raw_markup _ -> "'{%...%}' (raw markup)" - | `Begin_paragraph_style `Left -> "'{L ...}' (left alignment)" - | `Begin_paragraph_style `Center -> "'{C ...}' (center alignment)" - | `Begin_paragraph_style `Right -> "'{R ...}' (right alignment)" - | `Begin_style `Bold -> "'{b ...}' (boldface text)" - | `Begin_style `Italic -> "'{i ...}' (italic text)" - | `Begin_style `Emphasis -> "'{e ...}' (emphasized text)" - | `Begin_style `Superscript -> "'{^...}' (superscript)" - | `Begin_style `Subscript -> "'{_...}' (subscript)" - | `Math_span _ -> "'{m ...}' (math span)" - | `Math_block _ -> "'{math ...}' (math block)" - | `Simple_reference _ -> "'{!...}' (cross-reference)" - | `Begin_reference_with_replacement_text _ -> - "'{{!...} ...}' (cross-reference)" - | `Simple_media (`Reference _, `Image) -> "'{image!...}' (image-reference)" - | `Simple_media (`Reference _, `Audio) -> "'{audio!...}' (audio-reference)" - | `Simple_media (`Reference _, `Video) -> "'{video!...}' (video-reference)" - | `Simple_media (`Link _, `Image) -> "'{image:...}' (image-link)" - | `Simple_media (`Link _, `Audio) -> "'{audio:...}' (audio-link)" - | `Simple_media (`Link _, `Video) -> "'{video:...}' (video-link)" - | `Media_with_replacement_text (`Reference _, `Image, _) -> - "'{{image!...} ...}' (image-reference)" - | `Media_with_replacement_text (`Reference _, `Audio, _) -> - "'{{audio!...} ...}' (audio-reference)" - | `Media_with_replacement_text (`Reference _, `Video, _) -> - "'{{video!...} ...}' (video-reference)" - | `Media_with_replacement_text (`Link _, `Image, _) -> - "'{{image:...} ...}' (image-link)" - | `Media_with_replacement_text (`Link _, `Audio, _) -> - "'{{audio:...} ...}' (audio-link)" - | `Media_with_replacement_text (`Link _, `Video, _) -> - "'{{video:...} ...}' (video-link)" - | `Simple_link _ -> "'{:...} (external link)'" - | `Begin_link_with_replacement_text _ -> "'{{:...} ...}' (external link)" - | `End -> "end of text" - | `Space _ -> "whitespace" - | `Single_newline _ -> "line break" - | `Blank_line _ -> "blank line" - | `Right_brace -> "'}'" - | `Right_code_delimiter -> "']}'" - | `Code_block _ -> "'{[...]}' (code block)" - | `Verbatim _ -> "'{v ... v}' (verbatim text)" - | `Modules _ -> "'{!modules ...}'" - | `Begin_list `Unordered -> "'{ul ...}' (bulleted list)" - | `Begin_list `Ordered -> "'{ol ...}' (numbered list)" - | `Begin_list_item `Li -> "'{li ...}' (list item)" - | `Begin_list_item `Dash -> "'{- ...}' (list item)" - | `Begin_table_light -> "'{t ...}' (table)" - | `Begin_table_heavy -> "'{table ...}' (table)" - | `Begin_table_row -> "'{tr ...}' (table row)" - | `Begin_table_cell `Header -> "'{th ... }' (table header cell)" - | `Begin_table_cell `Data -> "'{td ... 
}' (table data cell)" - | `Minus -> "'-' (bulleted list item)" - | `Plus -> "'+' (numbered list item)" - | `Bar -> "'|'" - | `Begin_section_heading (level, _) -> - Printf.sprintf "'{%i ...}' (section heading)" level - | `Tag (`Author _) -> "'@author'" - | `Tag `Deprecated -> "'@deprecated'" - | `Tag (`Param _) -> "'@param'" - | `Tag (`Raise _) -> "'@raise'" - | `Tag `Return -> "'@return'" - | `Tag (`See _) -> "'@see'" - | `Tag (`Since _) -> "'@since'" - | `Tag (`Before _) -> "'@before'" - | `Tag (`Version _) -> "'@version'" - | `Tag (`Canonical _) -> "'@canonical'" - | `Tag `Inline -> "'@inline'" - | `Tag `Open -> "'@open'" - | `Tag `Closed -> "'@closed'" - | `Tag `Hidden -> "'@hidden" - | `Tag `Children_order -> "'@children_order" - | `Tag `Toc_status -> "'@toc_status" - | `Tag `Order_category -> "'@order_category" - | `Tag `Short_title -> "'@short_title" - | `Comment -> "top-level text" - -let describe_element = function - | `Reference (`Simple, _, _) -> describe (`Simple_reference "") - | `Reference (`With_text, _, _) -> - describe (`Begin_reference_with_replacement_text "") - | `Link _ -> describe (`Begin_link_with_replacement_text "") - | `Heading (level, _, _) -> describe (`Begin_section_heading (level, None)) diff --git a/src/parser/tokens.ml b/src/parser/tokens.ml new file mode 100644 index 0000000000..9bf4d6acff --- /dev/null +++ b/src/parser/tokens.ml @@ -0,0 +1,378 @@ +type ref_kind = Simple | With_replacement + +type alignment = Left | Center | Right + +type style = Bold | Italic | Emphasis | Superscript | Subscript +type table_cell_kind = Header | Data + +type list_kind = Ordered | Unordered + +type uri_kind = URL | File | Document +type math = { start : Loc.point; content : string } + +let ast_list_kind : list_kind -> Ast.list_kind = function + | Ordered -> `Ordered + | Unordered -> `Unordered + +type 'a with_start_point = { start : Loc.point; inner : 'a } +let with_start_point : Lexing.position -> 'a -> 'a with_start_point = + fun start inner -> { start = Loc.make_point start; inner } + +type code_block = { + metadata : meta option; + delimiter : string option; + content : string Loc.with_location; +} +and meta = { + language_tag : string Loc.with_location; + tags : string Loc.with_location option; +} + +(* Token names follow Menhir conventions where ALL_CAPS denote a unit variant, + in this case generally representing a delimiter *) +type token = + | Space of string (* ' ' *) + | Single_newline of string (* '\n' with optional leading horizontal space *) + | Blank_line of string + (* '\n\n' with optional leading horizontal space for both newlines *) + | Simple_ref of string Loc.with_location with_start_point (* '{!Foo}' *) + | Ref_with_replacement of + string Loc.with_location with_start_point (* '{{!Foo} bar}' *) + | Simple_link of string with_start_point (* '{:janestreet.com}' *) + | Link_with_replacement of + string with_start_point (* '{{:janestreet.com}Jane Street}' *) + | MODULES (* {!modules *) + | Media of + (media * media_target) with_start_point (* i.e. '{audio!foo.wav}' *) + | Media_with_replacement of (media * media_target * string) with_start_point + (* i.e. 
'{{audio!foo.wav} presentation on monadic fixpoints}' *) + | Math_span of string with_start_point (* '{m 2^32-1 }' *) + | Math_block of string with_start_point (* '{math \\sum_{i=0}^n x^i%}}' *) + | Code_span of string with_start_point (* '[bind m f]' *) + | Code_block of code_block with_start_point (* '{@haskell[fix f = f f]}' *) + | Code_block_with_output of code_block with_start_point + (* '{@haskell[fix f = f f][Haskell can be cool]}' *) + | Word of string (* Any whitespace separated word *) + | Verbatim of string with_start_point (* '{v let foo = bar v}' *) + | RIGHT_CODE_DELIMITER (* ']}' *) + | RIGHT_BRACE (* '}' *) + | Paragraph_style of alignment with_start_point (* i.e. '{L' *) + | Style of style (* i.e. '{i italic}' or '{_ subscript}' *) + | List of list_kind (* '{ul' or '{ol' *) + | LI (* '{li' *) + | DASH (* '{-' *) + | TABLE_LIGHT (* '{t' *) + | TABLE_HEAVY (* '{table' *) + | TABLE_ROW (* '{tr' *) + | Table_cell of Ast.table_cell_kind (* '{td' *) + | MINUS (* '-' *) + | PLUS (* '+' *) + | BAR (* '|' *) + | Section_heading of (int * string option) with_start_point (* '{2 Foo}' *) + | Tag of tag (* '@author Fay Carsons' *) + | Tag_with_content of tag_with_content (* i.e. '@raises Foo' *) + | Raw_markup of (string option * string) with_start_point + (* '{%

inline html!<\p> %}' *) + | END (* End of input *) +and tag = + | Author of string with_start_point (* '@author' *) + | Since of string with_start_point (* '@since' *) + | Version of string with_start_point (* '@version' *) + | Canonical of string with_start_point (* '@canonical' *) + | INLINE (* '@inline' *) + | OPEN (* '@open' *) + | CLOSED (* '@closed' *) + | HIDDEN (* '@hidden' *) + +(* A tag with content is a tag which may be followed by some number of block + elements delimited by a blank line *) +and tag_with_content = + | DEPRECATED (* '@deprecated with the release of 1.0.1' *) + | Before of + string with_start_point (* '@before 1.0.1 this did something else' *) + | Raise of string with_start_point (* '@raises Foo' *) + | Param of string with_start_point (* '@param foo is a monad' *) + | See of (uri_kind * string) with_start_point + (* '@see for more info' *) + | RETURN (* '@return a monad' *) + | CHILDREN_ORDER (* '@children_order' *) + | TOC_STATUS (* '@toc_status' *) + | ORDER_CATEGORY (* '@order_category' *) + | SHORT_TITLE (* '@short_title' *) +and media = + | Reference of string Loc.with_location + | Link of string Loc.with_location +and media_target = Audio | Video | Image + +let media_description ref_kind media_kind = + let media_kind = + match media_kind with + | Audio -> "audio" + | Video -> "video" + | Image -> "image" + in + let ref_kind = match ref_kind with Reference _ -> "!" | Link _ -> ":" in + (ref_kind, media_kind) + +let print : token -> string = function + | Space _ -> "\t" + | Single_newline _ -> "\n" + | Blank_line _ -> "\n\n" + | Simple_ref _ -> "{!" + | Ref_with_replacement _ -> "{{!" + | Simple_link _ -> "{:" + | Link_with_replacement _ -> "{{:" + | MODULES -> "{!modules:" + | Media { inner = ref_kind, media_kind; _ } -> + let ref_kind, media_kind = media_description ref_kind media_kind in + Printf.sprintf "{%s%s" media_kind ref_kind + | Media_with_replacement { inner = ref_kind, media_kind, _; _ } -> + let ref_kind, media_kind = media_description ref_kind media_kind in + Printf.sprintf "{{%s%s" media_kind ref_kind + | Math_span _ -> "{m" + | Math_block _ -> "{math" + | Code_span _ -> "[" + | Code_block _ | Code_block_with_output _ -> "{[" + | Word w -> w + | Verbatim _ -> "{v" + | RIGHT_CODE_DELIMITER -> "]}" + | RIGHT_BRACE -> "}" + | Paragraph_style { inner = Left; _ } -> "'{L'" + | Paragraph_style { inner = Center; _ } -> "'{C'" + | Paragraph_style { inner = Right; _ } -> "'{R'" + | Style Bold -> "'{b'" + | Style Italic -> "'{i'" + | Style Emphasis -> "'{e'" + | Style Superscript -> "'{^'" + | Style Subscript -> "'{_'" + | List Ordered -> "{ol" + | List Unordered -> "{ul" + | LI -> "'{li ...}'" + | DASH -> "'{- ...}'" + | TABLE_LIGHT -> "{t" + | TABLE_HEAVY -> "{table" + | TABLE_ROW -> "'{tr'" + | Table_cell `Header -> "'{th'" + | Table_cell `Data -> "'{td'" + | MINUS -> "'-'" + | PLUS -> "'+'" + | BAR -> "'|'" + | Section_heading { inner = level, label; _ } -> + let label = match label with None -> "" | Some label -> ":" ^ label in + Printf.sprintf "'{%i%s'" level label + | Tag (Author _) -> "'@author'" + | Tag_with_content DEPRECATED -> "'@deprecated'" + | Tag_with_content (Param _) -> "'@param'" + | Tag_with_content (Raise _) -> "'@raise'" + | Tag_with_content RETURN -> "'@return'" + | Tag_with_content (See _) -> "'@see'" + | Tag_with_content (Before _) -> "'@before'" + | Tag_with_content CHILDREN_ORDER -> "'@children_order" + | Tag_with_content TOC_STATUS -> "'@toc_status'" + | Tag_with_content ORDER_CATEGORY -> "'@order_category'" + | 
Tag_with_content SHORT_TITLE -> "@'short_title'" + | Tag (Since _) -> "'@since'" + | Tag (Version _) -> "'@version'" + | Tag (Canonical _) -> "'@canonical'" + | Tag INLINE -> "'@inline'" + | Tag OPEN -> "'@open'" + | Tag CLOSED -> "'@closed'" + | Tag HIDDEN -> "'@hidden'" + | Raw_markup { inner = None, _; _ } -> "'{%...%}'" + | Raw_markup { inner = Some target, _; _ } -> "'{%" ^ target ^ ":...%}'" + | END -> "EOI" + +(* [`Minus] and [`Plus] are interpreted as if they start list items. Therefore, + for error messages based on [Token.describe] to be accurate, formatted + [`Minus] and [`Plus] should always be plausibly list item bullets. *) +let describe : token -> string = function + | Space _ -> "(horizontal space)" + | Media { inner = ref_kind, media_kind; _ } -> + let ref_kind, media_kind = media_description ref_kind media_kind in + Printf.sprintf "{%s%s" media_kind ref_kind + | Media_with_replacement { inner = ref_kind, media_kind, _; _ } -> + let ref_kind, media_kind = media_description ref_kind media_kind in + Printf.sprintf "{{%s%s" media_kind ref_kind + | Word w -> Printf.sprintf "'%s'" w + | Code_span _ -> "'[...]' (code)" + | Raw_markup _ -> "'{%...%}' (raw markup)" + | Paragraph_style { inner = Left; _ } -> "'{L ...}' (left alignment)" + | Paragraph_style { inner = Center; _ } -> "'{C ...}' (center alignment)" + | Paragraph_style { inner = Right; _ } -> "'{R ...}' (right alignment)" + | Style Bold -> "'{b ...}' (boldface text)" + | Style Italic -> "'{i ...}' (italic text)" + | Style Emphasis -> "'{e ...}' (emphasized text)" + | Style Superscript -> "'{^...}' (superscript)" + | Style Subscript -> "'{_...}' (subscript)" + | Math_span _ -> "'{m ...}' (math span)" + | Math_block _ -> "'{math ...}' (math block)" + | Simple_ref _ -> "'{!...}' (cross-reference)" + | Ref_with_replacement _ -> "'{{!...} ...}' (cross-reference)" + | Simple_link _ -> "'{:...} (external link)'" + | Link_with_replacement _ -> "'{{:...} ...}' (external link)" + | END -> "end of text" + | Single_newline _ -> "newline" + | Blank_line _ -> "blank line" + | RIGHT_BRACE -> "'}'" + | RIGHT_CODE_DELIMITER -> "']}'" + | Code_block _ | Code_block_with_output _ -> "'{[...]}' (code block)" + | Verbatim _ -> "'{v ... v}' (verbatim text)" + | MODULES -> "'{!modules ...}'" + | List Unordered -> "'{ul ...}' (bulleted list)" + | List Ordered -> "'{ol ...}' (numbered list)" + | LI -> "'{li ...}' (list item)" + | DASH -> "'{- ...}' (list item)" + | TABLE_LIGHT -> "'{t ...}' (table)" + | TABLE_HEAVY -> "'{table ...}' (table)" + | TABLE_ROW -> "'{tr ...}' (table row)" + | Table_cell `Header -> "'{th ... }' (table header cell)" + | Table_cell `Data -> "'{td ... 
}' (table data cell)" + | MINUS -> "'-' (bulleted list item)" + | PLUS -> "'+' (numbered list item)" + | BAR -> "'|'" + | Section_heading { inner = level, _; _ } -> + Printf.sprintf "'{%i ...}' (section heading)" level + | Tag (Author _) -> "'@author'" + | Tag_with_content DEPRECATED -> "'@deprecated'" + | Tag_with_content (Param _) -> "'@param'" + | Tag_with_content (Raise _) -> "'@raise'" + | Tag_with_content RETURN -> "'@return'" + | Tag_with_content (See _) -> "'@see'" + | Tag_with_content (Before _) -> "'@before'" + | Tag_with_content CHILDREN_ORDER -> "'@children_order" + | Tag_with_content TOC_STATUS -> "'@toc_status'" + | Tag_with_content ORDER_CATEGORY -> "'@order_category'" + | Tag_with_content SHORT_TITLE -> "@'short_title'" + | Tag (Since _) -> "'@since'" + | Tag (Version _) -> "'@version'" + | Tag (Canonical _) -> "'@canonical'" + | Tag INLINE -> "'@inline'" + | Tag OPEN -> "'@open'" + | Tag CLOSED -> "'@closed'" + | Tag HIDDEN -> "'@hidden'" + +(* Functions and helpers for describing a parsed AST node + This is useful in error handling inside the Menhir parser *) + +let empty_code_block = + { + inner = + { + metadata = None; + delimiter = None; + content = + Loc.at + Loc.{ start = Loc.dummy_pos; end_ = Loc.dummy_pos; file = "" } + ""; + }; + start = Loc.dummy_pos; + } + +let of_ast_style : Ast.style -> style = function + | `Bold -> Bold + | `Italic -> Italic + | `Emphasis -> Emphasis + | `Superscript -> Superscript + | `Subscript -> Subscript + +let dummy_ref inner = + { + inner = + Loc. + { + value = inner; + location = { start = Loc.dummy_pos; end_ = Loc.dummy_pos; file = "" }; + }; + start = Loc.dummy_pos; + } + +let describe_inline : Ast.inline_element -> string = function + | `Word w -> describe @@ Word w + | `Space _ -> describe @@ Space "" + | `Styled (style, _) -> describe @@ Style (of_ast_style style) + | `Code_span _ -> describe @@ Code_span { inner = ""; start = Loc.dummy_pos } + | `Math_span _ -> describe @@ Math_span { start = Loc.dummy_pos; inner = "" } + | `Raw_markup inner -> describe @@ Raw_markup { inner; start = Loc.dummy_pos } + | `Link (inner, []) -> + describe @@ Simple_link { inner; start = Loc.dummy_pos } + | `Link (inner, _ :: _) -> + describe @@ Link_with_replacement { inner; start = Loc.dummy_pos } + | `Reference (`Simple, { value; _ }, _) -> + describe @@ Simple_ref (dummy_ref value) + | `Reference (`With_text, { value; _ }, _) -> + describe @@ Ref_with_replacement (dummy_ref value) + +let of_href = + let open Loc in + function + | { value = `Reference value; location } -> Reference { value; location } + | { value = `Link value; location } -> Link { value; location } + +let of_media_kind = function + | `Audio -> Audio + | `Image -> Image + | `Video -> Video + +let of_media : + [> `Media of + Ast.reference_kind + * Ast.media_href Loc.with_location + * string + * Ast.media ] -> + token = function + | `Media (_, href, _, media_kind) -> + Media + { + inner = (of_href href, of_media_kind media_kind); + start = Loc.dummy_pos; + } + +let describe_nestable_block : Ast.nestable_block_element -> string = function + | `Paragraph ws -> ( + match ws with + | Loc.{ value; _ } :: _ -> describe_inline value + | [] -> describe @@ Word "") + | `Code_block _ -> describe @@ Code_block empty_code_block + | `Verbatim _ -> describe @@ Verbatim { inner = ""; start = Loc.dummy_pos } + | `Modules _ -> describe MODULES + | `List (ordering, `Light, _) -> + describe @@ if ordering = `Ordered then PLUS else MINUS + | `List (ordering, `Heavy, _) -> + describe @@ List (if 
ordering = `Ordered then Ordered else Unordered) + | `Table (_, kind) -> + describe @@ if kind = `Light then TABLE_LIGHT else TABLE_HEAVY + | `Math_block _ -> + describe @@ Math_block { start = Loc.dummy_pos; inner = "" } + | `Media _ as media -> describe @@ of_media media + +let of_ast_ref : [ `Document | `File | `Url ] -> uri_kind = function + | `Document -> Document + | `File -> File + | `Url -> URL + +let describe_tag : Ast.tag -> string = function + | `See (kind, _, _) -> + describe + @@ Tag_with_content + (See { inner = (of_ast_ref kind, ""); start = Loc.dummy_pos }) + | `Author inner -> describe @@ Tag (Author { inner; start = Loc.dummy_pos }) + | `Deprecated _ -> describe @@ Tag_with_content DEPRECATED + | `Param (inner, _) -> + describe @@ Tag_with_content (Param { inner; start = Loc.dummy_pos }) + | `Raise (inner, _) -> + describe @@ Tag_with_content (Raise { inner; start = Loc.dummy_pos }) + | `Return _ -> describe @@ Tag_with_content RETURN + | `Since inner -> describe @@ Tag (Since { inner; start = Loc.dummy_pos }) + | `Before (inner, _) -> + describe @@ Tag_with_content (Before { inner; start = Loc.dummy_pos }) + | `Version inner -> describe @@ Tag (Version { inner; start = Loc.dummy_pos }) + | `Closed -> describe @@ Tag CLOSED + | `Open -> describe @@ Tag OPEN + | `Canonical Loc.{ value = inner; _ } -> + describe @@ Tag (Canonical { inner; start = Loc.dummy_pos }) + | `Hidden -> describe @@ Tag HIDDEN + | `Inline -> describe @@ Tag INLINE + | `Children_order _ -> describe @@ Tag_with_content CHILDREN_ORDER + | `Toc_status _ -> describe @@ Tag_with_content TOC_STATUS + | `Order_category _ -> describe @@ Tag_with_content ORDER_CATEGORY + | `Short_title _ -> describe @@ Tag_with_content SHORT_TITLE diff --git a/src/parser/writer.ml b/src/parser/writer.ml new file mode 100644 index 0000000000..847ba565ab --- /dev/null +++ b/src/parser/writer.ml @@ -0,0 +1,60 @@ +(** An implementation of the Writer monad for parser error reporting *) + +type +'a t = Writer of ('a * warning list) + +(** A warning can either be totally self-contained, or a function requiring + input. 
This is so that we can pass the input string in later in order + to extract spans where errors occur *) +and warning = InputNeeded of (string -> Warning.t) | Warning of Warning.t + +let return : 'a -> 'a t = fun x -> Writer (x, []) +let bind : 'a t -> f:('a -> 'b t) -> 'b t = + fun (Writer (node, warnings)) ~f -> + let (Writer (next, next_warnings)) = f node in + Writer (next, warnings @ next_warnings) + +let map : f:('a -> 'b) -> 'a t -> 'b t = + fun ~f (Writer (x, ws)) -> Writer (f x, ws) + +let map2 : f:('a -> 'b -> 'c) -> 'a t -> 'b t -> 'c t = + fun ~f (Writer (a, aws)) (Writer (b, bws)) -> Writer (f a b, aws @ bws) + +let ( <$> ) f w = map ~f w + +let seq : 'a t -> 'b t -> 'b t = + fun (Writer (_, ws)) (Writer (x, ws2)) -> Writer (x, ws @ ws2) +let ( *> ) = seq + +(** Useful functions for working with Writer.t *) +module Prelude = struct + let return = return + let ( let* ) w f = bind w ~f + let ( <$> ) = ( <$> ) + let ( *> ) = ( *> ) +end + +let sequence : 'a t list -> 'a list t = + fun xs -> + let xs, ws = List.map (fun (Writer (x, ws)) -> (x, ws)) xs |> List.split in + Writer (xs, List.flatten ws) + +let sequence_loc : 'a t Loc.with_location -> 'a Loc.with_location t = + fun { value; location } -> Loc.at location <$> value + +(** [warning Warning.t Writer.t] is equivalent to Haskell's [tell] *) +let warning warning (Writer (n, ws)) = Writer (n, warning :: ws) + +let return_warning node warning = Writer (node, [ warning ]) + +(** [ensure pred warning Writer] Logs a warning if the predicate returns true + for the contained value *) +let ensure : ('a -> bool) -> warning -> 'a t -> 'a t = + fun pred warning (Writer (x, ws) as self) -> + if pred x then self else Writer (x, warning :: ws) + +let run : input:string -> 'a t -> 'a * Warning.t list = + fun ~input (Writer (tree, warnings)) -> + let go input = function InputNeeded f -> f input | Warning w -> w in + (tree, List.map (go input) warnings) + +let get : 'a t -> 'a = fun (Writer (x, _)) -> x
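A minimal usage sketch of the Writer combinators above, not part of the diff. The helper
[make_warning] and the literal input string are hypothetical stand-ins for what a real
parser production and Parser.run would supply; the point is only to show how a deferred
[InputNeeded] warning is attached with the Prelude operators and later resolved by [run]
against the original comment text.

(* Sketch only: [make_warning] is a hypothetical constructor for a [Warning.t];
   an actual parser rule would build one from lexer location information. *)
let check_word ~(make_warning : string -> Warning.t) (w : string) :
    string Writer.t =
  let open Writer.Prelude in
  let* word = return w in
  if String.length word > 0 then return word
  else
    (* Defer warning construction until [run] supplies the full input,
       so an error span could be computed from it if needed. *)
    Writer.return_warning word
      (Writer.InputNeeded (fun _input -> make_warning "empty word"))

(* Resolving the deferred warnings against the original input: *)
let _example ~make_warning =
  Writer.run ~input:"some comment text" (check_word ~make_warning "")

The [InputNeeded] case is what makes [run] take [~input]: warnings can be recorded while
parsing without the whole string in hand and turned into concrete [Warning.t] values only
at the end.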