lexer.ml 6.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213
  1. (**
  2. Lexer using sedlex
  3. *)
  4. open Tokens
  5. exception UnclosedQuote
  6. exception LexError of string
  7. exception EOF
  8. (* Extract the location name from the pattern *)
  9. let location_name = Str.regexp {|.* \(.*\)|}
  10. (** Try to read the identifier and check if this is a function, a keyword, or
  11. just a variable.
  12. See the tests [Syntax.Operator2] and [Syntax.Call Nl] for two cases. *)
  13. let build_ident buffer =
  14. let id = Lexbuf.content buffer |> String.uppercase_ascii in
  15. try
  16. let value = Hashtbl.find Idents.keyword_table id in
  17. let _ =
  18. match value with IF | ELIF -> Lexbuf.incr_level buffer | _ -> ()
  19. in
  20. value
  21. with Not_found ->
  22. (* If the identifier does not match a keyword and start with [*], then
  23. try it as a '*' operator. *)
  24. if Char.equal '*' id.[0] then (
  25. Lexbuf.rollback buffer;
  26. let lexbuf = Lexbuf.buffer buffer in
  27. match%sedlex lexbuf with '*' -> STAR | _ -> IDENT id)
  28. else IDENT id
  29. let wait_balance : (Buffer.t -> Lexbuf.t -> 'a) -> Lexbuf.t -> 'a =
  30. fun rule lexbuf ->
  31. let _, position = Lexbuf.positions lexbuf in
  32. Lexbuf.set_start_position lexbuf position;
  33. try
  34. let token = rule (Buffer.create 256) lexbuf in
  35. token
  36. with Not_found -> raise UnclosedQuote
  37. let space = [%sedlex.regexp? ' ' | '\t']
  38. let eol = [%sedlex.regexp? '\r' | '\n' | "\r\n"]
  39. let coma = [%sedlex.regexp? ',']
  40. let digit = [%sedlex.regexp? '0' .. '9']
  41. let letters = [%sedlex.regexp? 'a' .. 'z' | 'A' .. 'Z' | '_']
  42. let spaces = [%sedlex.regexp? Plus space]
  43. let ident = [%sedlex.regexp? Opt ('$' | '*'), letters, Star (digit | letters)]
  44. let location_ident = [%sedlex.regexp? letters | digit]
  45. let location_prefix = [%sedlex.regexp? '!' | '$' | '#' | '^']
  46. let location = [%sedlex.regexp? Opt location_prefix, Plus location_ident]
  47. let rec read_long_string level buf buffer =
  48. let lexbuf = Lexbuf.buffer buffer in
  49. match%sedlex lexbuf with
  50. | '{' ->
  51. Buffer.add_string buf (Sedlexing.Utf8.lexeme lexbuf);
  52. read_long_string (level + 1) buf buffer
  53. | '}' -> (
  54. match level with
  55. | 0 -> Buffer.contents buf
  56. | _ ->
  57. Buffer.add_string buf (Sedlexing.Utf8.lexeme lexbuf);
  58. read_long_string (level - 1) buf buffer)
  59. | eol ->
  60. Buffer.add_string buf (Lexbuf.content buffer);
  61. read_long_string level buf buffer
  62. | any ->
  63. Buffer.add_string buf (Lexbuf.content buffer);
  64. read_long_string level buf buffer
  65. | _ -> raise Not_found
  66. let rec read_dquoted_string buf buffer =
  67. let lexbuf = Lexbuf.buffer buffer in
  68. match%sedlex lexbuf with
  69. | "\"\"" ->
  70. Buffer.add_char buf '"';
  71. read_dquoted_string buf buffer
  72. | '"' -> Buffer.contents buf
  73. | any ->
  74. Buffer.add_string buf (Sedlexing.Utf8.lexeme lexbuf);
  75. read_dquoted_string buf buffer
  76. | _ -> raise Not_found
  77. let rec read_quoted_string buf buffer =
  78. let lexbuf = Lexbuf.buffer buffer in
  79. match%sedlex lexbuf with
  80. | "''" ->
  81. Buffer.add_char buf '\'';
  82. read_quoted_string buf buffer
  83. | '\'' -> Buffer.contents buf
  84. | eol ->
  85. Buffer.add_string buf (Lexbuf.content buffer);
  86. read_quoted_string buf buffer
  87. | any ->
  88. Buffer.add_string buf (Lexbuf.content buffer);
  89. read_quoted_string buf buffer
  90. | _ -> raise Not_found
  91. let rec skip_comment buffer =
  92. let lexbuf = Lexbuf.buffer buffer in
  93. match%sedlex lexbuf with
  94. | '{' ->
  95. let _ = wait_balance (read_long_string 0) buffer in
  96. skip_comment buffer
  97. | '\'' ->
  98. let _ = wait_balance read_quoted_string buffer in
  99. skip_comment buffer
  100. | '"' ->
  101. let _ = wait_balance read_dquoted_string buffer in
  102. skip_comment buffer
  103. | eol ->
  104. (* Ugly hack used in order to put the eol in the front of the next
  105. parsing. *)
  106. Lexbuf.rollback buffer;
  107. COMMENT
  108. | any -> skip_comment buffer
  109. | _ -> raise Not_found
  110. (** Main lexer *)
  111. let rec token : Lexbuf.t -> token =
  112. fun buffer ->
  113. let lexbuf = Lexbuf.buffer buffer in
  114. match%sedlex lexbuf with
  115. | 0Xfeff ->
  116. (* Ignore the BOM *)
  117. token buffer
  118. | '#', Star space, location ->
  119. (* Extract the location name *)
  120. let ident = Lexbuf.content buffer in
  121. let () =
  122. match Str.string_match location_name ident 0 with
  123. | false -> ()
  124. | true -> Sedlexing.set_filename lexbuf (Str.matched_group 1 ident)
  125. in
  126. (* Restart the line number (new location here) *)
  127. Lexbuf.start buffer;
  128. LOCATION_START ident
  129. | '-', Plus '-', Star (Sub (any, ('\r' | '\n'))) ->
  130. Lexbuf.reset_level buffer;
  131. LOCATION_END
  132. | Plus digit -> INTEGER (Lexbuf.content buffer)
  133. | '+' -> PLUS
  134. | '-' -> MINUS
  135. | "+=" -> INCR
  136. | "-=" -> DECR
  137. | "*=" -> MULT_EQUAL
  138. | '/' -> DIV
  139. | '*' -> STAR
  140. | ':' ->
  141. (* We are leaving the block, the comment will be handled again *)
  142. Lexbuf.decr_level buffer;
  143. COLUMN
  144. | '[' -> L_BRACKET
  145. | ']' -> R_BRACKET
  146. | '(' ->
  147. Lexbuf.incr_level buffer;
  148. L_PAREN
  149. | ')' ->
  150. Lexbuf.decr_level buffer;
  151. R_PAREN
  152. | '<' -> LT
  153. | '>' -> GT
  154. | coma -> COMA
  155. | '=' ->
  156. Lexbuf.incr_level buffer;
  157. EQUAL
  158. | ident -> build_ident buffer
  159. | eol ->
  160. Lexbuf.reset_level buffer;
  161. EOL
  162. | '&' ->
  163. Lexbuf.reset_level buffer;
  164. AMPERSAND
  165. | '!' -> if Lexbuf.level buffer > 0 then EXCLAMATION else skip_comment buffer
  166. | spaces -> token buffer
  167. | '\'' -> LITERAL (wait_balance read_quoted_string buffer)
  168. | '"' -> LITERAL (wait_balance read_dquoted_string buffer)
  169. | '{' -> LITERAL (wait_balance (read_long_string 0) buffer)
  170. | eof -> raise EOF
  171. | _ ->
  172. let tok = Lexbuf.content buffer in
  173. let msg = Format.asprintf "Unexpected character %S" tok in
  174. raise @@ LexError msg
  175. let rec discard buffer =
  176. let lexbuf = Lexbuf.buffer buffer in
  177. match%sedlex lexbuf with
  178. | '\'' ->
  179. ignore (wait_balance read_quoted_string buffer);
  180. discard buffer
  181. | '"' ->
  182. ignore (wait_balance read_dquoted_string buffer);
  183. discard buffer
  184. | '{' ->
  185. ignore (wait_balance (read_long_string 0) buffer);
  186. discard buffer
  187. | eof -> raise EOF
  188. | '-', Plus '-', Star (Sub (any, ('\r' | '\n'))) ->
  189. Lexbuf.reset_level buffer;
  190. ()
  191. | '!' ->
  192. ignore @@ skip_comment buffer;
  193. discard buffer
  194. | any -> discard buffer
  195. | _ -> raise EOF