123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353 |
- (**
- Lexer using sedlex
- *)
- open Tokens
- open StdLabels
- exception UnclosedQuote
- exception LexError of string
- exception EOF
- (* Extract the location name from the pattern *)
- let location_name = Str.regexp {|# *\(.*\)|}
- (** Remove all the expression state when we are leaving the expression itself. *)
- let rec leave_expression buffer =
- match Lexbuf.state buffer with
- | Some Lexbuf.Expression ->
- Lexbuf.leave_state buffer;
- leave_expression buffer
- | _ -> ()
- (** Try to read the identifier and check if this is a function, a keyword, or
- just a variable.
- See the tests [Syntax.Operator2] and [Syntax.Call Nl] for two cases. *)
- let build_ident buffer =
- let id = Lexbuf.content buffer |> String.uppercase_ascii in
- try
- let value = Hashtbl.find Idents.keyword_table id in
- let _ =
- match value with
- | IF | ELIF -> Lexbuf.enter_state buffer Lexbuf.Expression
- | _ -> ()
- in
- value
- with Not_found ->
- (* If the identifier does not match a keyword and start with [*], then
- try it as a '*' operator. *)
- if Char.equal '*' id.[0] then (
- Lexbuf.rollback buffer;
- let lexbuf = Lexbuf.buffer buffer in
- match%sedlex lexbuf with '*' -> STAR | _ -> IDENT id)
- else IDENT id
- let wait_balance : (Buffer.t -> Lexbuf.t -> 'a) -> Lexbuf.t -> 'a =
- fun rule lexbuf ->
- let _, position = Lexbuf.positions lexbuf in
- Lexbuf.set_start_position lexbuf position;
- try
- let token = rule (Buffer.create 256) lexbuf in
- token
- with Not_found -> raise UnclosedQuote
- let space = [%sedlex.regexp? ' ' | '\t']
- let eol = [%sedlex.regexp? '\r' | '\n' | "\r\n"]
- let coma = [%sedlex.regexp? ',']
- let digit = [%sedlex.regexp? '0' .. '9']
- let letters = [%sedlex.regexp? 'a' .. 'z' | 'A' .. 'Z' | '_']
- let ident = [%sedlex.regexp? Opt ('$' | '*'), letters, Star (digit | letters)]
- let location_ident = [%sedlex.regexp? letters | digit]
- let location_prefix = [%sedlex.regexp? '!' | '$' | '#' | '^']
- let location = [%sedlex.regexp? Opt location_prefix, Plus location_ident]
- (** Read a quote started with '{'
- The function return the parsed string, but the closing token has been
- rollbacked, leaving the state in [Lexbuf.EndString _].
- The next call to [main] will call the associated function, effectively
- leaving the string mode in the parser.
- @param nested tell with started another block of string inside this one *)
- let rec read_long_string : ?nested:bool -> int -> Buffer.t -> Lexbuf.t -> token
- =
- fun ?(nested = false) level buf buffer ->
- let lexbuf = Lexbuf.buffer buffer in
- match%sedlex lexbuf with
- | "<<" ->
- if not nested then (
- match Buffer.length buf with
- | 0 ->
- Lexbuf.enter_state buffer Lexbuf.Token;
- ENTER_EMBED
- | _ ->
- let result = Tokens.LITERAL (Buffer.contents buf) in
- Buffer.reset buf;
- Lexbuf.rollback buffer;
- result)
- else (
- Buffer.add_string buf (Lexbuf.content buffer);
- read_long_string ~nested level buf buffer)
- | '{' ->
- Buffer.add_string buf (Sedlexing.Utf8.lexeme lexbuf);
- read_long_string ~nested (level + 1) buf buffer
- | '}' -> (
- match level with
- | 0 ->
- Lexbuf.leave_state buffer;
- Lexbuf.enter_state buffer
- (Lexbuf.EndString Lex_state.readLongStringWraper);
- (* rollback the latest token *)
- Lexbuf.rollback buffer;
- LITERAL (Buffer.contents buf)
- | _ ->
- (* We have nested strings. Do not terminate end *)
- Buffer.add_string buf (Sedlexing.Utf8.lexeme lexbuf);
- read_long_string ~nested (level - 1) buf buffer)
- | '\'' | '"' ->
- Buffer.add_string buf (Lexbuf.content buffer);
- read_long_string ~nested:true level buf buffer
- | eol ->
- Buffer.add_string buf (Lexbuf.content buffer);
- read_long_string ~nested level buf buffer
- | any ->
- Buffer.add_string buf (Lexbuf.content buffer);
- read_long_string ~nested level buf buffer
- | _ -> raise Not_found
- (** Read the text inside a ['] or ['"']
- The function return the parsed string, but the closing token has been
- rollbacked, leaving the state in [Lexbuf.EndString _].
- The next call to [main] will call the associated function, effectively
- leaving the string mode in the parser.
- *)
- let rec read_quoted_string : Lexbuf.stringWraper -> Lexbuf.buffer_builder =
- fun f ?(nested = false) buf buffer ->
- let lexbuf = Lexbuf.buffer buffer in
- match%sedlex lexbuf with
- | "<<" ->
- (* Enter into embed code. We enter here into a new state until the
- matching >>.
- If we already got some text before, report the literal token, then
- rollback to read the embeded code again. *)
- if not nested then (
- match Buffer.length buf with
- | 0 ->
- Lexbuf.enter_state buffer Lexbuf.Token;
- ENTER_EMBED
- | _ ->
- let result = Tokens.LITERAL (Buffer.contents buf) in
- Buffer.reset buf;
- Lexbuf.rollback buffer;
- result)
- else (
- Buffer.add_string buf (Lexbuf.content buffer);
- (f.wrap (read_quoted_string f)) buf buffer)
- | eol | any ->
- Buffer.add_string buf (Lexbuf.content buffer);
- (f.wrap (read_quoted_string f)) buf buffer
- | _ -> raise Not_found
- (** Track the kind of nested string inside a multiline string inside a
- comment.
- Some constructions are not allowed in this specific case (see later)
- *)
- type commentedString = None | Quote | DQuote
- let rec skip_comment buffer =
- (* Simplified way to skip the content of a string until the end marker.
- (expect the string to be well formed) *)
- let rec parse_until_end : (Buffer.t -> Lexbuf.t -> token) -> token =
- fun f ->
- let token = wait_balance f buffer in
- match Lexbuf.state buffer with
- | Some Lexbuf.Token ->
- Lexbuf.leave_state buffer;
- parse_until_end f
- | Some (Lexbuf.EndString _) -> token
- | _ -> parse_until_end f
- in
- let lexbuf = Lexbuf.buffer buffer in
- match%sedlex lexbuf with
- | '{' ->
- (* There are illegal constructions inside a comment containing {}
- block.
- Every opening text marker shall have the corresponding closing token
- (but inside a nested block we can have unmatched double quote).
- [! { ' }] this gives an error
- [! { ' ' }] is ok
- *)
- let token = parse_until_end (read_long_string 0) in
- let () =
- match token with
- | Tokens.LITERAL (content : string) -> (
- (* Ensure every opening quote is closed before the end of the
- comment *)
- let string_state =
- String.fold_left content ~init:None ~f:(fun state c ->
- match (state, c) with
- | None, '\'' -> Quote
- | None, '"' -> DQuote
- | Quote, '\'' -> None
- | DQuote, '"' -> None
- | _ -> state)
- in
- match string_state with
- | None -> ()
- | _ ->
- Lexbuf.leave_state buffer;
- raise UnclosedQuote)
- | _ -> ()
- in
- let () =
- try ignore (Lex_state.readLongStringWraper.end_string buffer)
- with _ -> ()
- in
- skip_comment buffer
- | '\'' ->
- let _ =
- parse_until_end
- (Lex_state.quotedStringWraper.wrap
- (read_quoted_string Lex_state.quotedStringWraper))
- in
- let _ = Lex_state.quotedStringWraper.end_string buffer in
- skip_comment buffer
- | '"' ->
- let _ =
- parse_until_end
- (Lex_state.dQuotedStringWraper.wrap
- (read_quoted_string Lex_state.dQuotedStringWraper))
- in
- let _ = Lex_state.dQuotedStringWraper.end_string buffer in
- skip_comment buffer
- | eol ->
- (* Ugly hack used in order to put the eol in the front of the next
- parsing.
- This is required because the eol is also a part of the syntax, and do
- cannot be discard with the end of the comment. *)
- Lexbuf.rollback buffer;
- COMMENT
- | any -> skip_comment buffer
- | _ -> raise Not_found
- (** Main lexer *)
- let rec parse_token : Lexbuf.t -> token =
- fun buffer ->
- let lexbuf = Lexbuf.buffer buffer in
- match%sedlex lexbuf with
- | 0Xfeff ->
- (* Ignore the BOM *)
- parse_token buffer
- | '#', Star space, location ->
- (* Extract the location name *)
- let ident = Lexbuf.content buffer in
- let ident_name =
- match Str.string_match location_name ident 0 with
- | false -> ident
- | true -> Str.matched_group 1 ident
- in
- (* Restart the line number (new location here) *)
- Lexbuf.start buffer;
- LOCATION_START
- (fun () ->
- Sedlexing.set_filename lexbuf ident_name;
- (* Restart the line number (new location here) *)
- ident_name)
- | '_', Star space, eol, Star space ->
- (* The _ character can be used to break lines *)
- parse_token buffer
- | '-', Plus '-', Star (Sub (any, ('\r' | '\n'))) ->
- leave_expression buffer;
- LOCATION_END
- | Plus digit -> INTEGER (Lexbuf.content buffer)
- | '+' -> PLUS
- | '-' -> MINUS
- | "+=" -> INCR
- | "-=" -> DECR
- | "*=" -> MULT_EQUAL
- | "/=" -> DIV_EQUAL
- | '/' -> DIV
- | '*' -> STAR
- | ':' ->
- (* We are leaving the block, the comment will be handled again *)
- Lexbuf.leave_state buffer;
- COLUMN
- | '[' -> L_BRACKET
- | ']' -> R_BRACKET
- | '(' ->
- Lexbuf.enter_state buffer Lexbuf.Expression;
- L_PAREN
- | ')' ->
- Lexbuf.leave_state buffer;
- R_PAREN
- | ">>" ->
- (* Leave the expression if we have any*)
- leave_expression buffer;
- (* Now leave the token mode and return to the string *)
- Lexbuf.leave_state buffer;
- LEAVE_EMBED
- | '<' -> LT
- | '>' -> GT
- | coma -> COMA
- | '=' ->
- Lexbuf.enter_state buffer Lexbuf.Expression;
- EQUAL
- | ident -> build_ident buffer
- | eol ->
- leave_expression buffer;
- EOL
- | '&' ->
- leave_expression buffer;
- AMPERSAND
- | '!' -> (
- match Lexbuf.state buffer with
- | Some Lexbuf.Expression -> EXCLAMATION
- | _ -> skip_comment buffer)
- | '}' -> TEXT_MARKER
- | eof -> raise EOF
- | _ ->
- let tok = Lexbuf.content buffer in
- let msg = Format.asprintf "Unexpected character %S" tok in
- raise @@ LexError msg
- let main buffer =
- match Lexbuf.state buffer with
- | Some (Lexbuf.String w) ->
- wait_balance (w.wrap @@ read_quoted_string w) buffer
- | Some (Lexbuf.MString level) -> wait_balance (read_long_string level) buffer
- | Some (Lexbuf.EndString w) -> w.end_string buffer
- | _ ->
- let parser =
- parse_token |> Lex_state.defaultWraper.start_string
- |> Lexbuf.overlay buffer
- in
- parser buffer
- let rec discard buffer =
- let () = Lexbuf.start_recovery buffer in
- let lexbuf = Lexbuf.buffer buffer in
- match%sedlex lexbuf with
- | '-', Plus '-', Star (Sub (any, ('\r' | '\n'))) ->
- (* If something looks like the end of a location, get out the discard
- mode.
- We can’t really be sure if it is effectively the end of a
- location (because we can be in a text), but trying to figure if it is
- or not just don’t make sense.
- We are here because an error was raised, so can have any situation
- (for example a missing quote). *)
- leave_expression buffer;
- ()
- | any -> discard buffer
- | _ -> raise EOF
|