summaryrefslogtreecommitdiff
path: root/parsing/lexer.ml4
diff options
context:
space:
mode:
Diffstat (limited to 'parsing/lexer.ml4')
-rw-r--r--parsing/lexer.ml4249
1 files changed, 138 insertions, 111 deletions
diff --git a/parsing/lexer.ml4 b/parsing/lexer.ml4
index 82ae2dc8..8e839296 100644
--- a/parsing/lexer.ml4
+++ b/parsing/lexer.ml4
@@ -1,6 +1,6 @@
(************************************************************************)
(* v * The Coq Proof Assistant / The Coq Development Team *)
-(* <O___,, * INRIA - CNRS - LIX - LRI - PPS - Copyright 1999-2014 *)
+(* <O___,, * INRIA - CNRS - LIX - LRI - PPS - Copyright 1999-2015 *)
(* \VV/ **************************************************************)
(* // * This file is distributed under the terms of the *)
(* * GNU Lesser General Public License Version 2.1 *)
@@ -14,7 +14,8 @@ open Tok
(* Dictionaries: trees annotated with string options, each node being a map
from chars to dictionaries (the subtrees). A trie, in other words. *)
-module CharMap = Map.Make (struct type t = char let compare = compare end)
+module CharOrd = struct type t = char let compare : char -> char -> int = compare end
+module CharMap = Map.Make (CharOrd)
type ttree = {
node : string option;
@@ -86,27 +87,28 @@ module Error = struct
let to_string x =
"Syntax Error: Lexer: " ^
(match x with
- | Illegal_character -> "Illegal character"
- | Unterminated_comment -> "Unterminated comment"
- | Unterminated_string -> "Unterminated string"
- | Undefined_token -> "Undefined token"
- | Bad_token tok -> Format.sprintf "Bad token %S" tok
- | UnsupportedUnicode x ->
- Printf.sprintf "Unsupported Unicode character (0x%x)" x)
+ | Illegal_character -> "Illegal character"
+ | Unterminated_comment -> "Unterminated comment"
+ | Unterminated_string -> "Unterminated string"
+ | Undefined_token -> "Undefined token"
+ | Bad_token tok -> Format.sprintf "Bad token %S" tok
+ | UnsupportedUnicode x ->
+ Printf.sprintf "Unsupported Unicode character (0x%x)" x)
- let print ppf x = Format.fprintf ppf "%s@." (to_string x)
+ (* Require to fix the Camlp4 signature *)
+ let print ppf x = Pp.pp_with ppf (Pp.str (to_string x))
end
open Error
-let err loc str = Loc.raise (make_loc loc) (Error.E str)
+let err loc str = Loc.raise (Loc.make_loc loc) (Error.E str)
let bad_token str = raise (Error.E (Bad_token str))
(* Lexer conventions on tokens *)
type token_kind =
- | Utf8Token of (utf8_status * int)
+ | Utf8Token of (Unicode.status * int)
| AsciiChar
| EmptyStream
@@ -130,38 +132,38 @@ let utf8_char_size cs = function
let njunk n = Util.repeat n Stream.junk
let check_utf8_trailing_byte cs c =
- if Char.code c land 0xC0 <> 0x80 then error_utf8 cs
+ if not (Int.equal (Char.code c land 0xC0) 0x80) then error_utf8 cs
(* Recognize utf8 blocks (of length less than 4 bytes) *)
(* but don't certify full utf8 compliance (e.g. no emptyness check) *)
let lookup_utf8_tail c cs =
let c1 = Char.code c in
- if c1 land 0x40 = 0 or c1 land 0x38 = 0x38 then error_utf8 cs
+ if Int.equal (c1 land 0x40) 0 || Int.equal (c1 land 0x38) 0x38 then error_utf8 cs
else
let n, unicode =
- if c1 land 0x20 = 0 then
+ if Int.equal (c1 land 0x20) 0 then
match Stream.npeek 2 cs with
| [_;c2] ->
- check_utf8_trailing_byte cs c2;
- 2, (c1 land 0x1F) lsl 6 + (Char.code c2 land 0x3F)
+ check_utf8_trailing_byte cs c2;
+ 2, (c1 land 0x1F) lsl 6 + (Char.code c2 land 0x3F)
| _ -> error_utf8 cs
- else if c1 land 0x10 = 0 then
+ else if Int.equal (c1 land 0x10) 0 then
match Stream.npeek 3 cs with
| [_;c2;c3] ->
- check_utf8_trailing_byte cs c2; check_utf8_trailing_byte cs c3;
- 3, (c1 land 0x0F) lsl 12 + (Char.code c2 land 0x3F) lsl 6 +
- (Char.code c3 land 0x3F)
+ check_utf8_trailing_byte cs c2; check_utf8_trailing_byte cs c3;
+ 3, (c1 land 0x0F) lsl 12 + (Char.code c2 land 0x3F) lsl 6 +
+ (Char.code c3 land 0x3F)
| _ -> error_utf8 cs
else match Stream.npeek 4 cs with
| [_;c2;c3;c4] ->
- check_utf8_trailing_byte cs c2; check_utf8_trailing_byte cs c3;
- check_utf8_trailing_byte cs c4;
- 4, (c1 land 0x07) lsl 18 + (Char.code c2 land 0x3F) lsl 12 +
- (Char.code c3 land 0x3F) lsl 6 + (Char.code c4 land 0x3F)
+ check_utf8_trailing_byte cs c2; check_utf8_trailing_byte cs c3;
+ check_utf8_trailing_byte cs c4;
+ 4, (c1 land 0x07) lsl 18 + (Char.code c2 land 0x3F) lsl 12 +
+ (Char.code c3 land 0x3F) lsl 6 + (Char.code c4 land 0x3F)
| _ -> error_utf8 cs
in
- try classify_unicode unicode, n
- with UnsupportedUtf8 ->
+ try Unicode.classify unicode, n
+ with Unicode.Unsupported ->
njunk n cs; error_unsupported_unicode_character n unicode cs
let lookup_utf8 cs =
@@ -170,17 +172,18 @@ let lookup_utf8 cs =
| Some ('\x80'..'\xFF' as c) -> Utf8Token (lookup_utf8_tail c cs)
| None -> EmptyStream
-let unlocated f x =
- try f x with Loc.Exc_located (_,exc) -> raise exc
+let unlocated f x = f x
+ (** FIXME: should we still unloc the exception? *)
+(* try f x with Loc.Exc_located (_, exc) -> raise exc *)
let check_keyword str =
let rec loop_symb = parser
| [< ' (' ' | '\n' | '\r' | '\t' | '"') >] -> bad_token str
| [< s >] ->
- match unlocated lookup_utf8 s with
- | Utf8Token (_,n) -> njunk n s; loop_symb s
- | AsciiChar -> Stream.junk s; loop_symb s
- | EmptyStream -> ()
+ match unlocated lookup_utf8 s with
+ | Utf8Token (_,n) -> njunk n s; loop_symb s
+ | AsciiChar -> Stream.junk s; loop_symb s
+ | EmptyStream -> ()
in
loop_symb (Stream.of_string str)
@@ -188,7 +191,8 @@ let check_keyword_to_add s =
try check_keyword s
with Error.E (UnsupportedUnicode unicode) ->
Flags.if_verbose msg_warning
- (strbrk (Printf.sprintf "Token '%s' contains unicode character 0x%x which will not be parsable." s unicode))
+ (strbrk (Printf.sprintf "Token '%s' contains unicode character 0x%x \
+ which will not be parsable." s unicode))
let check_ident str =
let rec loop_id intail = parser
@@ -197,11 +201,13 @@ let check_ident str =
| [< ' ('0'..'9' | ''') when intail; s >] ->
loop_id true s
| [< s >] ->
- match unlocated lookup_utf8 s with
- | Utf8Token (UnicodeLetter, n) -> njunk n s; loop_id true s
- | Utf8Token (UnicodeIdentPart, n) when intail -> njunk n s; loop_id true s
- | EmptyStream -> ()
- | Utf8Token _ | AsciiChar -> bad_token str
+ match unlocated lookup_utf8 s with
+ | Utf8Token (Unicode.Letter, n) -> njunk n s; loop_id true s
+ | Utf8Token (Unicode.IdentPart, n) when intail ->
+ njunk n s;
+ loop_id true s
+ | EmptyStream -> ()
+ | Utf8Token _ | AsciiChar -> bad_token str
in
loop_id false (Stream.of_string str)
@@ -229,14 +235,7 @@ let remove_keyword str =
type frozen_t = ttree
let freeze () = !token_tree
-
-let unfreeze tt =
- token_tree := tt
-
-let init () =
- unfreeze empty_ttree
-
-let _ = init()
+let unfreeze tt = (token_tree := tt)
(* The string buffering machinery *)
@@ -260,8 +259,8 @@ let rec ident_tail len = parser
ident_tail (store len c) s
| [< s >] ->
match lookup_utf8 s with
- | Utf8Token ((UnicodeIdentPart | UnicodeLetter), n) ->
- ident_tail (nstore n len s) s
+ | Utf8Token ((Unicode.IdentPart | Unicode.Letter), n) ->
+ ident_tail (nstore n len s) s
| _ -> len
let rec number len = parser
@@ -274,28 +273,36 @@ let rec string in_comments bp len = parser
| [< ''('; s >] ->
(parser
| [< ''*'; s >] ->
- string (Option.map succ in_comments) bp (store (store len '(') '*') s
+ string
+ (Option.map succ in_comments)
+ bp (store (store len '(') '*')
+ s
| [< >] ->
- string in_comments bp (store len '(') s) s
+ string in_comments bp (store len '(') s) s
| [< ''*'; s >] ->
(parser
| [< '')'; s >] ->
- if in_comments = Some 0 then
- msg_warning (str "Not interpreting \"*)\" as the end of current non-terminated comment because it occurs in a non-terminated string of the comment.");
+ let () = match in_comments with
+ | Some 0 ->
+ msg_warning
+ (strbrk
+ "Not interpreting \"*)\" as the end of current \
+ non-terminated comment because it occurs in a \
+ non-terminated string of the comment.")
+ | _ -> ()
+ in
let in_comments = Option.map pred in_comments in
- string in_comments bp (store (store len '*') ')') s
+ string in_comments bp (store (store len '*') ')') s
| [< >] ->
- string in_comments bp (store len '*') s) s
+ string in_comments bp (store len '*') s) s
| [< 'c; s >] -> string in_comments bp (store len c) s
| [< _ = Stream.empty >] ep -> err (bp, ep) Unterminated_string
-(* Hook for exporting comment into xml theory files *)
-let xml_output_comment = ref (fun _ -> ())
-let set_xml_output_comment f = xml_output_comment := f
-
(* Utilities for comments in beautify *)
let comment_begin = ref None
-let comm_loc bp = if !comment_begin=None then comment_begin := Some bp
+let comm_loc bp = match !comment_begin with
+| None -> comment_begin := Some bp
+| _ -> ()
let current = Buffer.create 8192
let between_com = ref true
@@ -318,9 +325,9 @@ let push_char c =
if
!between_com || List.mem c ['\n';'\r'] ||
(List.mem c [' ';'\t']&&
- (Buffer.length current = 0 ||
+ (Int.equal (Buffer.length current) 0 ||
not (let s = Buffer.contents current in
- List.mem s.[String.length s - 1] [' ';'\t';'\n';'\r'])))
+ List.mem s.[String.length s - 1] [' ';'\t';'\n';'\r'])))
then
real_push_char c
@@ -333,15 +340,14 @@ let null_comment s =
let comment_stop ep =
let current_s = Buffer.contents current in
- if !Flags.xml_export && Buffer.length current > 0 &&
- (!between_com || not(null_comment current_s)) then
- !xml_output_comment current_s;
(if Flags.do_beautify() && Buffer.length current > 0 &&
(!between_com || not(null_comment current_s)) then
let bp = match !comment_begin with
Some bp -> bp
| None ->
- msgerrnl(str"No begin location for comment '"++str current_s ++str"' ending at "++int ep);
+ msgerrnl(str "No begin location for comment '"
+ ++ str current_s ++str"' ending at "
+ ++ int ep);
ep-1 in
Pp.comments := ((bp,ep),current_s) :: !Pp.comments);
Buffer.clear current;
@@ -353,8 +359,11 @@ let rec comm_string bp = parser
| [< ''"' >] -> push_string "\""
| [< ''\\'; _ =
(parser [< ' ('"' | '\\' as c) >] ->
- if c='"' then real_push_char c;
- real_push_char c
+ let () = match c with
+ | '"' -> real_push_char c
+ | _ -> ()
+ in
+ real_push_char c
| [< >] -> real_push_char '\\'); s >]
-> comm_string bp s
| [< _ = Stream.empty >] ep -> err (bp, ep) Unterminated_string
@@ -388,26 +397,26 @@ let rec progress_further last nj tt cs =
and update_longest_valid_token last nj tt cs =
match tt.node with
| Some _ as last' ->
- for i=1 to nj do Stream.junk cs done;
- progress_further last' 0 tt cs
+ stream_njunk nj cs;
+ progress_further last' 0 tt cs
| None ->
- progress_further last nj tt cs
+ progress_further last nj tt cs
(* nj is the number of char peeked since last valid token *)
(* n the number of char in utf8 block *)
and progress_utf8 last nj n c tt cs =
try
let tt = CharMap.find c tt.branch in
- if n=1 then
+ if Int.equal n 1 then
update_longest_valid_token last (nj+n) tt cs
else
- match Util.list_skipn (nj+1) (Stream.npeek (nj+n) cs) with
- | l when List.length l = n-1 ->
- List.iter (check_utf8_trailing_byte cs) l;
- let tt = List.fold_left (fun tt c -> CharMap.find c tt.branch) tt l in
- update_longest_valid_token last (nj+n) tt cs
+ match Util.List.skipn (nj+1) (Stream.npeek (nj+n) cs) with
+ | l when Int.equal (List.length l) (n - 1) ->
+ List.iter (check_utf8_trailing_byte cs) l;
+ let tt = List.fold_left (fun tt c -> CharMap.find c tt.branch) tt l in
+ update_longest_valid_token last (nj+n) tt cs
| _ ->
- error_utf8 cs
+ error_utf8 cs
with Not_found ->
last
@@ -420,6 +429,14 @@ let find_keyword id s =
| None -> raise Not_found
| Some c -> KEYWORD c
+let process_sequence bp c cs =
+ let rec aux n cs =
+ match Stream.peek cs with
+ | Some c' when c == c' -> Stream.junk cs; aux (n+1) cs
+ | _ -> BULLET (String.make n c), (bp, Stream.count cs)
+ in
+ aux 1 cs
+
(* Must be a special token *)
let process_chars bp c cs =
let t = progress_from_byte None (-1) !token_tree cs c in
@@ -427,9 +444,9 @@ let process_chars bp c cs =
match t with
| Some t -> (KEYWORD t, (bp, ep))
| None ->
- let ep' = bp + utf8_char_size cs c in
- njunk (ep' - ep) cs;
- err (bp, ep') Undefined_token
+ let ep' = bp + utf8_char_size cs c in
+ njunk (ep' - ep) cs;
+ err (bp, ep') Undefined_token
let token_of_special c s = match c with
| '$' -> METAIDENT s
@@ -444,8 +461,8 @@ let parse_after_special c bp =
token_of_special c (get_buff len)
| [< s >] ->
match lookup_utf8 s with
- | Utf8Token (UnicodeLetter, n) ->
- token_of_special c (get_buff (ident_tail (nstore n 0 s) s))
+ | Utf8Token (Unicode.Letter, n) ->
+ token_of_special c (get_buff (ident_tail (nstore n 0 s) s))
| AsciiChar | Utf8Token _ | EmptyStream -> fst (process_chars bp c s)
(* Parse what follows a question mark *)
@@ -455,9 +472,10 @@ let parse_after_qmark bp s =
| Some ('a'..'z' | 'A'..'Z' | '_') -> LEFTQMARK
| None -> KEYWORD "?"
| _ ->
- match lookup_utf8 s with
- | Utf8Token (UnicodeLetter, _) -> LEFTQMARK
- | AsciiChar | Utf8Token _ | EmptyStream -> fst (process_chars bp '?' s)
+ match lookup_utf8 s with
+ | Utf8Token (Unicode.Letter, _) -> LEFTQMARK
+ | AsciiChar | Utf8Token _ | EmptyStream ->
+ fst (process_chars bp '?' s)
let blank_or_eof cs =
match Stream.peek cs with
@@ -476,11 +494,19 @@ let rec next_token = parser bp
comment_stop bp;
(* We enforce that "." should either be part of a larger keyword,
for instance ".(", or followed by a blank or eof. *)
- if t = KEYWORD "." then begin
- if not (blank_or_eof s) then err (bp,ep+1) Undefined_token;
- if Flags.do_beautify() then between_com := true;
- end;
+ let () = match t with
+ | KEYWORD ("." | "...") ->
+ if not (blank_or_eof s) then err (bp,ep+1) Undefined_token;
+ between_com := true;
+ | _ -> ()
+ in
(t, (bp,ep))
+ | [< ' ('-'|'+'|'*' as c); s >] ->
+ let t,new_between_com =
+ if !between_com then process_sequence bp c s,true
+ else process_chars bp c s,false
+ in
+ comment_stop bp; between_com := new_between_com; t
| [< ''?'; s >] ep ->
let t = parse_after_qmark bp s in comment_stop bp; (t, (ep, bp))
| [< ' ('a'..'z' | 'A'..'Z' | '_' as c);
@@ -499,23 +525,25 @@ let rec next_token = parser bp
| [< ''*'; s >] ->
comm_loc bp;
push_string "(*";
- comment bp s;
- next_token s
+ comment bp s;
+ next_token s
| [< t = process_chars bp c >] -> comment_stop bp; t >] ->
t
| [< s >] ->
match lookup_utf8 s with
- | Utf8Token (UnicodeLetter, n) ->
- let len = ident_tail (nstore n 0 s) s in
- let id = get_buff len in
- let ep = Stream.count s in
- comment_stop bp;
- (try find_keyword id s with Not_found -> IDENT id), (bp, ep)
- | AsciiChar | Utf8Token ((UnicodeSymbol | UnicodeIdentPart), _) ->
- let t = process_chars bp (Stream.next s) s in
- comment_stop bp; t
- | EmptyStream ->
- comment_stop bp; (EOI, (bp, bp + 1))
+ | Utf8Token (Unicode.Letter, n) ->
+ let len = ident_tail (nstore n 0 s) s in
+ let id = get_buff len in
+ let ep = Stream.count s in
+ comment_stop bp;
+ (try find_keyword id s with Not_found -> IDENT id), (bp, ep)
+ | AsciiChar | Utf8Token ((Unicode.Symbol | Unicode.IdentPart), _) ->
+ let t = process_chars bp (Stream.next s) s in
+ let new_between_com = match t with
+ (KEYWORD ("{"|"}"),_) -> !between_com | _ -> false in
+ comment_stop bp; between_com := new_between_com; t
+ | EmptyStream ->
+ comment_stop bp; (EOI, (bp, bp + 1))
(* (* Debug: uncomment this for tracing tokens seen by coq...*)
let next_token s =
@@ -537,10 +565,9 @@ let loct_add loct i loc = Hashtbl.add loct i loc
let current_location_table = ref (loct_create ())
-type location_table = (int, loc) Hashtbl.t
+type location_table = (int, CompatLoc.t) Hashtbl.t
let location_table () = !current_location_table
let restore_location_table t = current_location_table := t
-let location_function n = loct_func !current_location_table n
(** {6 The lexer of Coq} *)
@@ -575,7 +602,7 @@ let func cs =
Stream.from
(fun i ->
let (tok, loc) = next_token cs in
- loct_add loct i (make_loc loc); Some tok)
+ loct_add loct i (make_loc loc); Some tok)
in
current_location_table := loct;
(ts, loct_func loct)
@@ -595,10 +622,10 @@ ELSE (* official camlp4 for ocaml >= 3.10 *)
module M_ = Camlp4.ErrorHandler.Register (Error)
-module Loc = Loc
+module Loc = CompatLoc
module Token = struct
include Tok (* Cf. tok.ml *)
- module Loc = Loc
+ module Loc = CompatLoc
module Error = Camlp4.Struct.EmptyError
module Filter = struct
type token_filter = (Tok.t * Loc.t) Stream.t -> (Tok.t * Loc.t) Stream.t
@@ -631,14 +658,14 @@ let is_ident_not_keyword s =
let is_number s =
let rec aux i =
- String.length s = i or
+ Int.equal (String.length s) i ||
match s.[i] with '0'..'9' -> aux (i+1) | _ -> false
in aux 0
let strip s =
let len =
let rec loop i len =
- if i = String.length s then len
+ if Int.equal i (String.length s) then len
else if s.[i] == ' ' then loop (i + 1) len
else loop (i + 1) (len + 1)
in
@@ -656,7 +683,7 @@ let strip s =
let terminal s =
let s = strip s in
- if s = "" then Util.error "empty token.";
+ let () = match s with "" -> Errors.error "empty token." | _ -> () in
if is_ident_not_keyword s then IDENT s
else if is_number s then INT s
else KEYWORD s