lexer.ml 9.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296
  1. (**
  2. Lexer using sedlex
  3. *)
  4. open Tokens
  5. open StdLabels
  6. exception UnclosedQuote
  7. exception LexError of string
  8. exception EOF
  9. (* Extract the location name from the pattern *)
  10. let location_name = Str.regexp {|# *\(.*\)|}
  11. (** Remove all the expression state when we are leaving the expression itself. *)
  12. let rec leave_expression buffer =
  13. match Lexbuf.state buffer with
  14. | Some Lexbuf.Expression ->
  15. Lexbuf.leave_state buffer;
  16. leave_expression buffer
  17. | _ -> ()
  18. (** Try to read the identifier and check if this is a function, a keyword, or
  19. just a variable.
  20. See the tests [Syntax.Operator2] and [Syntax.Call Nl] for two cases. *)
  21. let build_ident buffer =
  22. let id = Lexbuf.content buffer |> String.uppercase_ascii in
  23. try
  24. let value = Hashtbl.find Idents.keyword_table id in
  25. let _ =
  26. match value with
  27. | IF | ELIF -> Lexbuf.enter_state buffer Lexbuf.Expression
  28. | _ -> ()
  29. in
  30. value
  31. with Not_found ->
  32. (* If the identifier does not match a keyword and start with [*], then
  33. try it as a '*' operator. *)
  34. if Char.equal '*' id.[0] then (
  35. Lexbuf.rollback buffer;
  36. let lexbuf = Lexbuf.buffer buffer in
  37. match%sedlex lexbuf with '*' -> STAR | _ -> IDENT id)
  38. else IDENT id
  39. let wait_balance : (Buffer.t -> Lexbuf.t -> 'a) -> Lexbuf.t -> 'a =
  40. fun rule lexbuf ->
  41. let _, position = Lexbuf.positions lexbuf in
  42. Lexbuf.set_start_position lexbuf position;
  43. try
  44. let token = rule (Buffer.create 256) lexbuf in
  45. token
  46. with Not_found -> raise UnclosedQuote
  47. let space = [%sedlex.regexp? ' ' | '\t']
  48. let eol = [%sedlex.regexp? '\r' | '\n' | "\r\n"]
  49. let coma = [%sedlex.regexp? ',']
  50. let digit = [%sedlex.regexp? '0' .. '9']
  51. let letters = [%sedlex.regexp? 'a' .. 'z' | 'A' .. 'Z' | '_']
  52. let ident = [%sedlex.regexp? Opt ('$' | '*'), letters, Star (digit | letters)]
  53. let location_ident = [%sedlex.regexp? letters | digit]
  54. let location_prefix = [%sedlex.regexp? '!' | '$' | '#' | '^']
  55. let location = [%sedlex.regexp? Opt location_prefix, Plus location_ident]
  56. let rec read_long_string ?(nested = false) level buf buffer =
  57. let lexbuf = Lexbuf.buffer buffer in
  58. match%sedlex lexbuf with
  59. | "<<" ->
  60. if not nested then (
  61. match Buffer.length buf with
  62. | 0 ->
  63. Lexbuf.enter_state buffer Lexbuf.Token;
  64. ENTER_EMBED
  65. | _ ->
  66. let result = Tokens.LITERAL (Buffer.contents buf) in
  67. Buffer.reset buf;
  68. Lexbuf.rollback buffer;
  69. result)
  70. else (
  71. Buffer.add_string buf (Lexbuf.content buffer);
  72. read_long_string ~nested level buf buffer)
  73. | '{' ->
  74. Buffer.add_string buf (Sedlexing.Utf8.lexeme lexbuf);
  75. read_long_string ~nested (level + 1) buf buffer
  76. | '}' -> (
  77. match level with
  78. | 0 ->
  79. Lexbuf.leave_state buffer;
  80. Lexbuf.enter_state buffer
  81. (Lexbuf.EndString Lex_state.readLongStringWraper);
  82. Lexbuf.rollback buffer;
  83. LITERAL (Buffer.contents buf)
  84. | _ ->
  85. (* We have nested strings. Do not terminate end *)
  86. Buffer.add_string buf (Sedlexing.Utf8.lexeme lexbuf);
  87. read_long_string ~nested (level - 1) buf buffer)
  88. | '\'' | '"' ->
  89. Buffer.add_string buf (Lexbuf.content buffer);
  90. read_long_string ~nested:true level buf buffer
  91. | eol ->
  92. Buffer.add_string buf (Lexbuf.content buffer);
  93. read_long_string ~nested level buf buffer
  94. | any ->
  95. Buffer.add_string buf (Lexbuf.content buffer);
  96. read_long_string ~nested level buf buffer
  97. | _ -> raise Not_found
  98. (** Read the text inside a ['] or ['"']
  99. The function return the parsed string, but the closing token has been
  100. rollbacked, leaving the state in [Lexbuf.EndString _].
  101. The next call to [main] will call the associated function, effectively
  102. leaving the string mode in the parser.
  103. *)
  104. let rec read_quoted_string : Lexbuf.stringWraper -> Lexbuf.buffer_builder =
  105. fun f ?(nested = false) buf buffer ->
  106. let lexbuf = Lexbuf.buffer buffer in
  107. match%sedlex lexbuf with
  108. | "<<" ->
  109. (* Enter into embed code. We enter here into a new state until the
  110. matching >>.
  111. If we already got some text before, report the literal token, then
  112. rollback to read the embeded code again. *)
  113. if not nested then (
  114. match Buffer.length buf with
  115. | 0 ->
  116. Lexbuf.enter_state buffer Lexbuf.Token;
  117. ENTER_EMBED
  118. | _ ->
  119. let result = Tokens.LITERAL (Buffer.contents buf) in
  120. Buffer.reset buf;
  121. Lexbuf.rollback buffer;
  122. result)
  123. else (
  124. Buffer.add_string buf (Lexbuf.content buffer);
  125. (f.wrap (read_quoted_string f)) buf buffer)
  126. | eol | any ->
  127. Buffer.add_string buf (Lexbuf.content buffer);
  128. (f.wrap (read_quoted_string f)) buf buffer
  129. | _ -> raise Not_found
  130. let rec skip_comment buffer =
  131. (* Simplified way to skip the content of a string until the end marker.
  132. (expect the string to be well formed) *)
  133. let rec parse_until_end f =
  134. let _ = wait_balance f buffer in
  135. match Lexbuf.state buffer with
  136. | Some Lexbuf.Token ->
  137. Lexbuf.leave_state buffer;
  138. parse_until_end f
  139. | Some (Lexbuf.EndString _) -> ()
  140. | _ -> parse_until_end f
  141. in
  142. let lexbuf = Lexbuf.buffer buffer in
  143. match%sedlex lexbuf with
  144. | '{' ->
  145. parse_until_end (read_long_string 0);
  146. let _ = Lex_state.readLongStringWraper.end_string buffer in
  147. skip_comment buffer
  148. | '\'' ->
  149. parse_until_end
  150. (Lex_state.quotedStringWraper.wrap
  151. (read_quoted_string Lex_state.quotedStringWraper));
  152. let _ = Lex_state.quotedStringWraper.end_string buffer in
  153. skip_comment buffer
  154. | '"' ->
  155. parse_until_end
  156. (Lex_state.dQuotedStringWraper.wrap
  157. (read_quoted_string Lex_state.dQuotedStringWraper));
  158. let _ = Lex_state.dQuotedStringWraper.end_string buffer in
  159. skip_comment buffer
  160. | eol ->
  161. (* Ugly hack used in order to put the eol in the front of the next
  162. parsing.
  163. This is required because the eol is also a part of the syntax, and do
  164. cannot be discard with the end of the comment. *)
  165. Lexbuf.rollback buffer;
  166. COMMENT
  167. | any -> skip_comment buffer
  168. | _ -> raise Not_found
  169. (** Main lexer *)
  170. let rec parse_token : Lexbuf.t -> token =
  171. fun buffer ->
  172. let lexbuf = Lexbuf.buffer buffer in
  173. match%sedlex lexbuf with
  174. | 0Xfeff ->
  175. (* Ignore the BOM *)
  176. parse_token buffer
  177. | '#', Star space, location ->
  178. (* Extract the location name *)
  179. let ident = Lexbuf.content buffer in
  180. let ident_name =
  181. match Str.string_match location_name ident 0 with
  182. | false -> ident
  183. | true -> Str.matched_group 1 ident
  184. in
  185. (* Restart the line number (new location here) *)
  186. Lexbuf.start buffer;
  187. LOCATION_START
  188. (fun () ->
  189. Sedlexing.set_filename lexbuf ident_name;
  190. (* Restart the line number (new location here) *)
  191. ident_name)
  192. | '_', Star space, eol, Star space ->
  193. (* The _ character can be used to break lines *)
  194. parse_token buffer
  195. | '-', Plus '-', Star (Sub (any, ('\r' | '\n'))) ->
  196. leave_expression buffer;
  197. LOCATION_END
  198. | Plus digit -> INTEGER (Lexbuf.content buffer)
  199. | '+' -> PLUS
  200. | '-' -> MINUS
  201. | "+=" -> INCR
  202. | "-=" -> DECR
  203. | "*=" -> MULT_EQUAL
  204. | "/=" -> DIV_EQUAL
  205. | '/' -> DIV
  206. | '*' -> STAR
  207. | ':' ->
  208. (* We are leaving the block, the comment will be handled again *)
  209. Lexbuf.leave_state buffer;
  210. COLUMN
  211. | '[' -> L_BRACKET
  212. | ']' -> R_BRACKET
  213. | '(' ->
  214. Lexbuf.enter_state buffer Lexbuf.Expression;
  215. L_PAREN
  216. | ')' ->
  217. Lexbuf.leave_state buffer;
  218. R_PAREN
  219. | ">>" ->
  220. (* Leave the expression if we have any*)
  221. leave_expression buffer;
  222. (* Now leave the token mode and return to the string *)
  223. Lexbuf.leave_state buffer;
  224. LEAVE_EMBED
  225. | '<' -> LT
  226. | '>' -> GT
  227. | coma -> COMA
  228. | '=' ->
  229. Lexbuf.enter_state buffer Lexbuf.Expression;
  230. EQUAL
  231. | ident -> build_ident buffer
  232. | eol ->
  233. leave_expression buffer;
  234. EOL
  235. | '&' ->
  236. leave_expression buffer;
  237. AMPERSAND
  238. | '!' -> (
  239. match Lexbuf.state buffer with
  240. | Some Lexbuf.Expression -> EXCLAMATION
  241. | _ -> skip_comment buffer)
  242. | '}' -> TEXT_MARKER
  243. | eof -> raise EOF
  244. | _ ->
  245. let tok = Lexbuf.content buffer in
  246. let msg = Format.asprintf "Unexpected character %S" tok in
  247. raise @@ LexError msg
  248. let main buffer =
  249. match Lexbuf.state buffer with
  250. | Some (Lexbuf.String w) ->
  251. wait_balance (w.wrap @@ read_quoted_string w) buffer
  252. | Some (Lexbuf.MString level) -> wait_balance (read_long_string level) buffer
  253. | Some (Lexbuf.EndString w) -> w.end_string buffer
  254. | _ ->
  255. let parser =
  256. parse_token |> Lex_state.defaultWraper.start_string
  257. |> Lexbuf.overlay buffer
  258. in
  259. parser buffer
  260. let rec discard buffer =
  261. let () = Lexbuf.start_recovery buffer in
  262. let lexbuf = Lexbuf.buffer buffer in
  263. match%sedlex lexbuf with
  264. | '-', Plus '-', Star (Sub (any, ('\r' | '\n'))) ->
  265. (* If something looks like the end of a location, get out the discard
  266. mode.
  267. We can’t really be sure if it is effectively the end of a
  268. location (because we can be in a text), but trying to figure if it is
  269. or not just don’t make sense.
  270. We are here because an error was raised, so can have any situation
  271. (for example a missing quote). *)
  272. leave_expression buffer;
  273. ()
  274. | any -> discard buffer
  275. | _ -> raise EOF