123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213 |
- (**
- Lexer using sedlex
- *)
- open Tokens
- exception UnclosedQuote
- exception LexError of string
- exception EOF
- (* Extract the location name from the pattern *)
- let location_name = Str.regexp {|.* \(.*\)|}
- (** Try to read the identifier and check if this is a function, a keyword, or
- just a variable.
- See the tests [Syntax.Operator2] and [Syntax.Call Nl] for two cases. *)
- let build_ident buffer =
- let id = Lexbuf.content buffer |> String.uppercase_ascii in
- try
- let value = Hashtbl.find Idents.keyword_table id in
- let _ =
- match value with IF | ELIF -> Lexbuf.incr_level buffer | _ -> ()
- in
- value
- with Not_found ->
- (* If the identifier does not match a keyword and start with [*], then
- try it as a '*' operator. *)
- if Char.equal '*' id.[0] then (
- Lexbuf.rollback buffer;
- let lexbuf = Lexbuf.buffer buffer in
- match%sedlex lexbuf with '*' -> STAR | _ -> IDENT id)
- else IDENT id
- let wait_balance : (Buffer.t -> Lexbuf.t -> 'a) -> Lexbuf.t -> 'a =
- fun rule lexbuf ->
- let _, position = Lexbuf.positions lexbuf in
- Lexbuf.set_start_position lexbuf position;
- try
- let token = rule (Buffer.create 256) lexbuf in
- token
- with Not_found -> raise UnclosedQuote
- let space = [%sedlex.regexp? ' ' | '\t']
- let eol = [%sedlex.regexp? '\r' | '\n' | "\r\n"]
- let coma = [%sedlex.regexp? ',']
- let digit = [%sedlex.regexp? '0' .. '9']
- let letters = [%sedlex.regexp? 'a' .. 'z' | 'A' .. 'Z' | '_']
- let spaces = [%sedlex.regexp? Plus space]
- let ident = [%sedlex.regexp? Opt ('$' | '*'), letters, Star (digit | letters)]
- let location_ident = [%sedlex.regexp? letters | digit]
- let location_prefix = [%sedlex.regexp? '!' | '$' | '#' | '^']
- let location = [%sedlex.regexp? Opt location_prefix, Plus location_ident]
- let rec read_long_string level buf buffer =
- let lexbuf = Lexbuf.buffer buffer in
- match%sedlex lexbuf with
- | '{' ->
- Buffer.add_string buf (Sedlexing.Utf8.lexeme lexbuf);
- read_long_string (level + 1) buf buffer
- | '}' -> (
- match level with
- | 0 -> Buffer.contents buf
- | _ ->
- Buffer.add_string buf (Sedlexing.Utf8.lexeme lexbuf);
- read_long_string (level - 1) buf buffer)
- | eol ->
- Buffer.add_string buf (Lexbuf.content buffer);
- read_long_string level buf buffer
- | any ->
- Buffer.add_string buf (Lexbuf.content buffer);
- read_long_string level buf buffer
- | _ -> raise Not_found
- let rec read_dquoted_string buf buffer =
- let lexbuf = Lexbuf.buffer buffer in
- match%sedlex lexbuf with
- | "\"\"" ->
- Buffer.add_char buf '"';
- read_dquoted_string buf buffer
- | '"' -> Buffer.contents buf
- | any ->
- Buffer.add_string buf (Sedlexing.Utf8.lexeme lexbuf);
- read_dquoted_string buf buffer
- | _ -> raise Not_found
- let rec read_quoted_string buf buffer =
- let lexbuf = Lexbuf.buffer buffer in
- match%sedlex lexbuf with
- | "''" ->
- Buffer.add_char buf '\'';
- read_quoted_string buf buffer
- | '\'' -> Buffer.contents buf
- | eol ->
- Buffer.add_string buf (Lexbuf.content buffer);
- read_quoted_string buf buffer
- | any ->
- Buffer.add_string buf (Lexbuf.content buffer);
- read_quoted_string buf buffer
- | _ -> raise Not_found
- let rec skip_comment buffer =
- let lexbuf = Lexbuf.buffer buffer in
- match%sedlex lexbuf with
- | '{' ->
- let _ = wait_balance (read_long_string 0) buffer in
- skip_comment buffer
- | '\'' ->
- let _ = wait_balance read_quoted_string buffer in
- skip_comment buffer
- | '"' ->
- let _ = wait_balance read_dquoted_string buffer in
- skip_comment buffer
- | eol ->
- (* Ugly hack used in order to put the eol in the front of the next
- parsing. *)
- Lexbuf.rollback buffer;
- COMMENT
- | any -> skip_comment buffer
- | _ -> raise Not_found
- (** Main lexer *)
- let rec token : Lexbuf.t -> token =
- fun buffer ->
- let lexbuf = Lexbuf.buffer buffer in
- match%sedlex lexbuf with
- | 0Xfeff ->
- (* Ignore the BOM *)
- token buffer
- | '#', Star space, location ->
- (* Extract the location name *)
- let ident = Lexbuf.content buffer in
- let () =
- match Str.string_match location_name ident 0 with
- | false -> ()
- | true -> Sedlexing.set_filename lexbuf (Str.matched_group 1 ident)
- in
- (* Restart the line number (new location here) *)
- Lexbuf.start buffer;
- LOCATION_START ident
- | '-', Plus '-', Star (Sub (any, ('\r' | '\n'))) ->
- Lexbuf.reset_level buffer;
- LOCATION_END
- | Plus digit -> INTEGER (Lexbuf.content buffer)
- | '+' -> PLUS
- | '-' -> MINUS
- | "+=" -> INCR
- | "-=" -> DECR
- | "*=" -> MULT_EQUAL
- | '/' -> DIV
- | '*' -> STAR
- | ':' ->
- (* We are leaving the block, the comment will be handled again *)
- Lexbuf.decr_level buffer;
- COLUMN
- | '[' -> L_BRACKET
- | ']' -> R_BRACKET
- | '(' ->
- Lexbuf.incr_level buffer;
- L_PAREN
- | ')' ->
- Lexbuf.decr_level buffer;
- R_PAREN
- | '<' -> LT
- | '>' -> GT
- | coma -> COMA
- | '=' ->
- Lexbuf.incr_level buffer;
- EQUAL
- | ident -> build_ident buffer
- | eol ->
- Lexbuf.reset_level buffer;
- EOL
- | '&' ->
- Lexbuf.reset_level buffer;
- AMPERSAND
- | '!' -> if Lexbuf.level buffer > 0 then EXCLAMATION else skip_comment buffer
- | spaces -> token buffer
- | '\'' -> LITERAL (wait_balance read_quoted_string buffer)
- | '"' -> LITERAL (wait_balance read_dquoted_string buffer)
- | '{' -> LITERAL (wait_balance (read_long_string 0) buffer)
- | eof -> raise EOF
- | _ ->
- let tok = Lexbuf.content buffer in
- let msg = Format.asprintf "Unexpected character %S" tok in
- raise @@ LexError msg
- let rec discard buffer =
- let lexbuf = Lexbuf.buffer buffer in
- match%sedlex lexbuf with
- | '\'' ->
- ignore (wait_balance read_quoted_string buffer);
- discard buffer
- | '"' ->
- ignore (wait_balance read_dquoted_string buffer);
- discard buffer
- | '{' ->
- ignore (wait_balance (read_long_string 0) buffer);
- discard buffer
- | eof -> raise EOF
- | '-', Plus '-', Star (Sub (any, ('\r' | '\n'))) ->
- Lexbuf.reset_level buffer;
- ()
- | '!' ->
- ignore @@ skip_comment buffer;
- discard buffer
- | any -> discard buffer
- | _ -> raise EOF
|