(** Lexer using sedlex *) open Tokens open StdLabels exception UnclosedQuote exception LexError of string exception EOF (* Extract the location name from the pattern *) let location_name = Str.regexp {|# *\(.*\)|} (** Remove all the expression state when we are leaving the expression itself. *) let rec leave_expression buffer = match Lexbuf.state buffer with | Some Lexbuf.Expression -> Lexbuf.leave_state buffer; leave_expression buffer | _ -> () (** Try to read the identifier and check if this is a function, a keyword, or just a variable. See the tests [Syntax.Operator2] and [Syntax.Call Nl] for two cases. *) let build_ident buffer = let id = Lexbuf.content buffer |> String.uppercase_ascii in try let value = Hashtbl.find Idents.keyword_table id in let _ = match value with | IF | ELIF -> Lexbuf.enter_state buffer Lexbuf.Expression | _ -> () in value with Not_found -> (* If the identifier does not match a keyword and start with [*], then try it as a '*' operator. *) if Char.equal '*' id.[0] then ( Lexbuf.rollback buffer; let lexbuf = Lexbuf.buffer buffer in match%sedlex lexbuf with '*' -> STAR | _ -> IDENT id) else IDENT id let wait_balance : (Buffer.t -> Lexbuf.t -> 'a) -> Lexbuf.t -> 'a = fun rule lexbuf -> let _, position = Lexbuf.positions lexbuf in Lexbuf.set_start_position lexbuf position; try let token = rule (Buffer.create 256) lexbuf in token with Not_found -> raise UnclosedQuote let space = [%sedlex.regexp? ' ' | '\t'] let eol = [%sedlex.regexp? '\r' | '\n' | "\r\n"] let coma = [%sedlex.regexp? ','] let digit = [%sedlex.regexp? '0' .. '9'] let letters = [%sedlex.regexp? 'a' .. 'z' | 'A' .. 'Z' | '_'] let ident = [%sedlex.regexp? Opt ('$' | '*'), letters, Star (digit | letters)] let location_ident = [%sedlex.regexp? letters | digit] let location_prefix = [%sedlex.regexp? '!' | '$' | '#' | '^'] let location = [%sedlex.regexp? Opt location_prefix, Plus location_ident] (** Read a quote started with '{' The function return the parsed string, but the closing token has been rollbacked, leaving the state in [Lexbuf.EndString _]. The next call to [main] will call the associated function, effectively leaving the string mode in the parser. @param nested tell with started another block of string inside this one *) let rec read_long_string : ?nested:bool -> int -> Buffer.t -> Lexbuf.t -> token = fun ?(nested = false) level buf buffer -> let lexbuf = Lexbuf.buffer buffer in match%sedlex lexbuf with | "<<" -> if not nested then ( match Buffer.length buf with | 0 -> Lexbuf.enter_state buffer Lexbuf.Token; ENTER_EMBED | _ -> let result = Tokens.LITERAL (Buffer.contents buf) in Buffer.reset buf; Lexbuf.rollback buffer; result) else ( Buffer.add_string buf (Lexbuf.content buffer); read_long_string ~nested level buf buffer) | '{' -> Buffer.add_string buf (Sedlexing.Utf8.lexeme lexbuf); read_long_string ~nested (level + 1) buf buffer | '}' -> ( match level with | 0 -> Lexbuf.leave_state buffer; Lexbuf.enter_state buffer (Lexbuf.EndString Lex_state.readLongStringWraper); (* rollback the latest token *) Lexbuf.rollback buffer; LITERAL (Buffer.contents buf) | _ -> (* We have nested strings. Do not terminate end *) Buffer.add_string buf (Sedlexing.Utf8.lexeme lexbuf); read_long_string ~nested (level - 1) buf buffer) | '\'' | '"' -> Buffer.add_string buf (Lexbuf.content buffer); read_long_string ~nested:true level buf buffer | eol -> Buffer.add_string buf (Lexbuf.content buffer); read_long_string ~nested level buf buffer | any -> Buffer.add_string buf (Lexbuf.content buffer); read_long_string ~nested level buf buffer | _ -> raise Not_found (** Read the text inside a ['] or ['"'] The function return the parsed string, but the closing token has been rollbacked, leaving the state in [Lexbuf.EndString _]. The next call to [main] will call the associated function, effectively leaving the string mode in the parser. *) let rec read_quoted_string : Lexbuf.stringWraper -> Lexbuf.buffer_builder = fun f ?(nested = false) buf buffer -> let lexbuf = Lexbuf.buffer buffer in match%sedlex lexbuf with | "<<" -> (* Enter into embed code. We enter here into a new state until the matching >>. If we already got some text before, report the literal token, then rollback to read the embeded code again. *) if not nested then ( match Buffer.length buf with | 0 -> Lexbuf.enter_state buffer Lexbuf.Token; ENTER_EMBED | _ -> let result = Tokens.LITERAL (Buffer.contents buf) in Buffer.reset buf; Lexbuf.rollback buffer; result) else ( Buffer.add_string buf (Lexbuf.content buffer); (f.wrap (read_quoted_string f)) buf buffer) | eol | any -> Buffer.add_string buf (Lexbuf.content buffer); (f.wrap (read_quoted_string f)) buf buffer | _ -> raise Not_found (** Track the kind of nested string inside a multiline string inside a comment. Some constructions are not allowed in this specific case (see later) *) type commentedString = None | Quote | DQuote let rec skip_comment buffer = (* Simplified way to skip the content of a string until the end marker. (expect the string to be well formed) *) let rec parse_until_end : (Buffer.t -> Lexbuf.t -> token) -> token = fun f -> let token = wait_balance f buffer in match Lexbuf.state buffer with | Some Lexbuf.Token -> Lexbuf.leave_state buffer; parse_until_end f | Some (Lexbuf.EndString _) -> token | _ -> parse_until_end f in let lexbuf = Lexbuf.buffer buffer in match%sedlex lexbuf with | '{' -> (* There are illegal constructions inside a comment containing {} block. Every opening text marker shall have the corresponding closing token (but inside a nested block we can have unmatched double quote). [! { ' }] this gives an error [! { ' ' }] is ok *) let token = parse_until_end (read_long_string 0) in let () = match token with | Tokens.LITERAL (content : string) -> ( (* Ensure every opening quote is closed before the end of the comment *) let string_state = String.fold_left content ~init:None ~f:(fun state c -> match (state, c) with | None, '\'' -> Quote | None, '"' -> DQuote | Quote, '\'' -> None | DQuote, '"' -> None | _ -> state) in match string_state with | None -> () | _ -> Lexbuf.leave_state buffer; raise UnclosedQuote) | _ -> () in let () = try ignore (Lex_state.readLongStringWraper.end_string buffer) with _ -> () in skip_comment buffer | '\'' -> let _ = parse_until_end (Lex_state.quotedStringWraper.wrap (read_quoted_string Lex_state.quotedStringWraper)) in let _ = Lex_state.quotedStringWraper.end_string buffer in skip_comment buffer | '"' -> let _ = parse_until_end (Lex_state.dQuotedStringWraper.wrap (read_quoted_string Lex_state.dQuotedStringWraper)) in let _ = Lex_state.dQuotedStringWraper.end_string buffer in skip_comment buffer | eol -> (* Ugly hack used in order to put the eol in the front of the next parsing. This is required because the eol is also a part of the syntax, and do cannot be discard with the end of the comment. *) Lexbuf.rollback buffer; COMMENT | any -> skip_comment buffer | _ -> raise Not_found (** Main lexer *) let rec parse_token : Lexbuf.t -> token = fun buffer -> let lexbuf = Lexbuf.buffer buffer in match%sedlex lexbuf with | 0Xfeff -> (* Ignore the BOM *) parse_token buffer | '#', Star space, location -> (* Extract the location name *) let ident = Lexbuf.content buffer in let ident_name = match Str.string_match location_name ident 0 with | false -> ident | true -> Str.matched_group 1 ident in (* Restart the line number (new location here) *) Lexbuf.start buffer; LOCATION_START (fun () -> Sedlexing.set_filename lexbuf ident_name; (* Restart the line number (new location here) *) ident_name) | '_', Star space, eol, Star space -> (* The _ character can be used to break lines *) parse_token buffer | '-', Plus '-', Star (Sub (any, ('\r' | '\n'))) -> leave_expression buffer; LOCATION_END | Plus digit -> INTEGER (Lexbuf.content buffer) | '+' -> PLUS | '-' -> MINUS | "+=" -> INCR | "-=" -> DECR | "*=" -> MULT_EQUAL | "/=" -> DIV_EQUAL | '/' -> DIV | '*' -> STAR | ':' -> (* We are leaving the block, the comment will be handled again *) Lexbuf.leave_state buffer; COLUMN | '[' -> L_BRACKET | ']' -> R_BRACKET | '(' -> Lexbuf.enter_state buffer Lexbuf.Expression; L_PAREN | ')' -> Lexbuf.leave_state buffer; R_PAREN | ">>" -> (* Leave the expression if we have any*) leave_expression buffer; (* Now leave the token mode and return to the string *) Lexbuf.leave_state buffer; LEAVE_EMBED | '<' -> LT | '>' -> GT | coma -> COMA | '=' -> Lexbuf.enter_state buffer Lexbuf.Expression; EQUAL | ident -> build_ident buffer | eol -> leave_expression buffer; EOL | '&' -> leave_expression buffer; AMPERSAND | '!' -> ( match Lexbuf.state buffer with | Some Lexbuf.Expression -> EXCLAMATION | _ -> skip_comment buffer) | '}' -> TEXT_MARKER | eof -> raise EOF | _ -> let tok = Lexbuf.content buffer in let msg = Format.asprintf "Unexpected character %S" tok in raise @@ LexError msg let main buffer = match Lexbuf.state buffer with | Some (Lexbuf.String w) -> wait_balance (w.wrap @@ read_quoted_string w) buffer | Some (Lexbuf.MString level) -> wait_balance (read_long_string level) buffer | Some (Lexbuf.EndString w) -> w.end_string buffer | _ -> let parser = parse_token |> Lex_state.defaultWraper.start_string |> Lexbuf.overlay buffer in parser buffer let rec discard buffer = let () = Lexbuf.start_recovery buffer in let lexbuf = Lexbuf.buffer buffer in match%sedlex lexbuf with | '-', Plus '-', Star (Sub (any, ('\r' | '\n'))) -> (* If something looks like the end of a location, get out the discard mode. We can’t really be sure if it is effectively the end of a location (because we can be in a text), but trying to figure if it is or not just don’t make sense. We are here because an error was raised, so can have any situation (for example a missing quote). *) leave_expression buffer; () | any -> discard buffer | _ -> raise EOF