Diffstat (limited to 'parsing/lexer.ml4')
-rw-r--r-- | parsing/lexer.ml4 | 322
1 file changed, 183 insertions, 139 deletions
diff --git a/parsing/lexer.ml4 b/parsing/lexer.ml4
index 50349e22..e351061d 100644
--- a/parsing/lexer.ml4
+++ b/parsing/lexer.ml4
@@ -1,20 +1,15 @@
 (************************************************************************)
 (*  v      *   The Coq Proof Assistant  /  The Coq Development Team     *)
-(* <O___,, *   INRIA - CNRS - LIX - LRI - PPS - Copyright 1999-2011     *)
+(* <O___,, *   INRIA - CNRS - LIX - LRI - PPS - Copyright 1999-2010     *)
 (*   \VV/  **************************************************************)
 (*    //   *      This file is distributed under the terms of the       *)
 (*         *       GNU Lesser General Public License Version 2.1        *)
 (************************************************************************)
 
-(*i camlp4use: "pr_o.cmo pa_macro.cmo" i*)
-(* Add pr_o.cmo to circumvent a useless-warning bug when preprocessed with
- * ast-based camlp4 *)
-
-(*i $Id: lexer.ml4 14641 2011-11-06 11:59:10Z herbelin $ i*)
-
 open Pp
 open Util
-open Token
+open Compat
+open Tok
 
 (* Dictionaries: trees annotated with string options, each node being a
    map from chars to dictionaries (the subtrees). A trie, in other words. *)
@@ -76,18 +71,37 @@ let ttree_remove ttree str =
 
 (* Errors occuring while lexing (explained as "Lexer error: ...") *)
 
-type error =
-  | Illegal_character
-  | Unterminated_comment
-  | Unterminated_string
-  | Undefined_token
-  | Bad_token of string
+module Error = struct
+
+  type t =
+    | Illegal_character
+    | Unterminated_comment
+    | Unterminated_string
+    | Undefined_token
+    | Bad_token of string
+    | UnsupportedUnicode of int
+
+  exception E of t
+
+  let to_string x =
+    "Syntax Error: Lexer: " ^
+    (match x with
+      | Illegal_character -> "Illegal character"
+      | Unterminated_comment -> "Unterminated comment"
+      | Unterminated_string -> "Unterminated string"
+      | Undefined_token -> "Undefined token"
+      | Bad_token tok -> Format.sprintf "Bad token %S" tok
+      | UnsupportedUnicode x ->
+          Printf.sprintf "Unsupported Unicode character (0x%x)" x)
 
-exception Error of error
+  let print ppf x = Format.fprintf ppf "%s@." (to_string x)
 
-let err loc str = Stdpp.raise_with_loc (Util.make_loc loc) (Error str)
+end
+open Error
 
-let bad_token str = raise (Error (Bad_token str))
+let err loc str = Loc.raise (make_loc loc) (Error.E str)
+
+let bad_token str = raise (Error.E (Bad_token str))
 
 (* Lexer conventions on tokens *)
 
@@ -96,9 +110,9 @@ type token_kind =
   | AsciiChar
   | EmptyStream
 
-let error_unsupported_unicode_character n cs =
+let error_unsupported_unicode_character n unicode cs =
   let bp = Stream.count cs in
-  err (bp,bp+n) (Bad_token "Unsupported Unicode character")
+  err (bp,bp+n) (UnsupportedUnicode unicode)
 
 let error_utf8 cs =
   let bp = Stream.count cs in
@@ -147,7 +161,8 @@ let lookup_utf8_tail c cs =
       | _ -> error_utf8 cs
   in
   try classify_unicode unicode, n
-  with UnsupportedUtf8 -> njunk n cs; error_unsupported_unicode_character n cs
+  with UnsupportedUtf8 ->
+    njunk n cs; error_unsupported_unicode_character n unicode cs
 
 let lookup_utf8 cs =
   match Stream.peek cs with
@@ -155,14 +170,26 @@ let lookup_utf8 cs =
   | Some ('\x80'..'\xFF' as c) -> Utf8Token (lookup_utf8_tail c cs)
   | None -> EmptyStream
 
-let check_special_token str =
+let unlocated f x =
+  try f x with Loc.Exc_located (_,exc) -> raise exc
+
+let check_keyword str =
   let rec loop_symb = parser
     | [< ' (' ' | '\n' | '\r' | '\t' | '"') >] -> bad_token str
-    | [< _ = Stream.empty >] -> ()
-    | [< '_ ; s >] -> loop_symb s
+    | [< s >] ->
+        match unlocated lookup_utf8 s with
+        | Utf8Token (_,n) -> njunk n s; loop_symb s
+        | AsciiChar -> Stream.junk s; loop_symb s
+        | EmptyStream -> ()
   in
   loop_symb (Stream.of_string str)
 
+let check_keyword_to_add s =
+  try check_keyword s
+  with Error.E (UnsupportedUnicode unicode) ->
+    Flags.if_verbose msg_warning
+      (strbrk (Printf.sprintf "Token '%s' contains unicode character 0x%x which will not be parsable." s unicode))
+
 let check_ident str =
   let rec loop_id intail = parser
     | [< ' ('a'..'z' | 'A'..'Z' | '_'); s >] ->
@@ -170,7 +197,7 @@ let check_ident str =
     | [< ' ('0'..'9' | ''') when intail; s >] ->
        loop_id true s
     | [< s >] ->
-       match lookup_utf8 s with
+       match unlocated lookup_utf8 s with
        | Utf8Token (UnicodeLetter, n) -> njunk n s; loop_id true s
        | Utf8Token (UnicodeIdentPart, n) when intail -> njunk n s; loop_id true s
        | EmptyStream -> ()
@@ -178,9 +205,8 @@ let check_ident str =
   in
   loop_id false (Stream.of_string str)
 
-let check_keyword str =
-  try check_special_token str
-  with Error _ -> check_ident str
+let is_ident str =
+  try let _ = check_ident str in true with Error.E _ -> false
 
 (* Keyword and symbol dictionary *)
 let token_tree = ref empty_ttree
@@ -190,22 +216,15 @@ let is_keyword s =
   with Not_found -> false
 
 let add_keyword str =
-  check_keyword str;
-  token_tree := ttree_add !token_tree str
+  if not (is_keyword str) then
+    begin
+      check_keyword_to_add str;
+      token_tree := ttree_add !token_tree str
+    end
 
 let remove_keyword str =
   token_tree := ttree_remove !token_tree str
 
-(* Adding a new token (keyword or special token). *)
-let add_token (con, str) = match con with
-  | "" -> add_keyword str
-  | "METAIDENT" | "LEFTQMARK" | "IDENT" | "FIELD" | "INT" | "STRING" | "EOI"
-      -> ()
-  | _ ->
-      raise (Token.Error ("\
-the constructor \"" ^ con ^ "\" is not recognized by Lexer"))
-
-
 (* Freeze and unfreeze the state of the lexer *)
 
 type frozen_t = ttree
@@ -249,17 +268,22 @@ let rec number len = parser
   | [< ' ('0'..'9' as c); s >] -> number (store len c) s
   | [< >] -> len
 
-let escape len c = store len c
-
 let rec string in_comments bp len = parser
   | [< ''"'; esc=(parser [<''"' >] -> true | [< >] -> false); s >] ->
       if esc then string in_comments bp (store len '"') s else len
+  | [< ''('; s >] ->
+      (parser
+        | [< ''*'; s >] ->
+            string (Option.map succ in_comments) bp (store (store len '(') '*') s
+        | [< >] ->
+            string in_comments bp (store len '(') s) s
   | [< ''*'; s >] ->
      (parser
        | [< '')'; s >] ->
-            if in_comments then
+            if in_comments = Some 0 then
             msg_warning
               (str "Not interpreting \"*)\" as the end of current non-terminated comment because it occurs in a non-terminated string of the comment.");
-            string in_comments bp (store (store len '*') ')') s
+            let in_comments = Option.map pred in_comments in
+            string in_comments bp (store (store len '*') ')') s
        | [< >] -> string in_comments bp (store len '*') s) s
   | [< 'c; s >] -> string in_comments bp (store len c) s
@@ -348,7 +372,7 @@ let rec comment bp = parser bp2
         | [< s >] -> real_push_char '*'; comment bp s >] -> ()
   | [< ''"'; s >] ->
      if Flags.do_beautify() then (push_string"\"";comm_string bp2 s)
-      else ignore (string true bp2 0 s);
+      else ignore (string (Some 0) bp2 0 s);
      comment bp s
   | [< _ = Stream.empty >] ep -> err (bp, ep) Unterminated_comment
   | [< 'z; s >] -> real_push_char z; comment bp s
@@ -394,61 +418,68 @@ let find_keyword id s =
   let tt = ttree_find !token_tree id in
   match progress_further tt.node 0 tt s with
   | None -> raise Not_found
-  | Some c -> c
+  | Some c -> KEYWORD c
 
 (* Must be a special token *)
 let process_chars bp c cs =
   let t = progress_from_byte None (-1) !token_tree cs c in
   let ep = Stream.count cs in
   match t with
-    | Some t -> (("", t), (bp, ep))
+    | Some t -> (KEYWORD t, (bp, ep))
    | None ->
        let ep' = bp + utf8_char_size cs c in
        njunk (ep' - ep) cs;
        err (bp, ep') Undefined_token
 
-let parse_after_dollar bp =
-  parser
-  | [< ' ('a'..'z' | 'A'..'Z' | '_' as c); len = ident_tail (store 0 c) >] ->
-      ("METAIDENT", get_buff len)
-  | [< s >] ->
-      match lookup_utf8 s with
-      | Utf8Token (UnicodeLetter, n) ->
-          ("METAIDENT", get_buff (ident_tail (nstore n 0 s) s))
-      | AsciiChar | Utf8Token _ | EmptyStream -> fst (process_chars bp '$' s)
+let token_of_special c s = match c with
+  | '$' -> METAIDENT s
+  | '.' -> FIELD s
+  | _ -> assert false
 
-(* Parse what follows a dot *)
-let parse_after_dot bp c =
+(* Parse what follows a dot / a dollar *)
+
+let parse_after_special c bp =
   parser
-  | [< ' ('a'..'z' | 'A'..'Z' | '_' as c); len = ident_tail (store 0 c) >] ->
-      ("FIELD", get_buff len)
+  | [< ' ('a'..'z' | 'A'..'Z' | '_' as d); len = ident_tail (store 0 d) >] ->
+      token_of_special c (get_buff len)
   | [< s >] ->
      match lookup_utf8 s with
      | Utf8Token (UnicodeLetter, n) ->
-          ("FIELD", get_buff (ident_tail (nstore n 0 s) s))
-      | AsciiChar | Utf8Token _ | EmptyStream ->
-          fst (process_chars bp c s)
+          token_of_special c (get_buff (ident_tail (nstore n 0 s) s))
+      | AsciiChar | Utf8Token _ | EmptyStream -> fst (process_chars bp c s)
 
 (* Parse what follows a question mark *)
+
 let parse_after_qmark bp s =
   match Stream.peek s with
-    |Some ('a'..'z' | 'A'..'Z' | '_') -> ("LEFTQMARK", "")
-    |None -> ("","?")
+    | Some ('a'..'z' | 'A'..'Z' | '_') -> LEFTQMARK
+    | None -> KEYWORD "?"
    | _ ->
        match lookup_utf8 s with
-          | Utf8Token (UnicodeLetter, _) -> ("LEFTQMARK", "")
+          | Utf8Token (UnicodeLetter, _) -> LEFTQMARK
          | AsciiChar | Utf8Token _ | EmptyStream -> fst (process_chars bp '?' s)
 
+let blank_or_eof cs =
+  match Stream.peek cs with
+    | None -> true
+    | Some (' ' | '\t' | '\n' |'\r') -> true
+    | _ -> false
 
 (* Parse a token in a char stream *)
+
 let rec next_token = parser bp
   | [< '' ' | '\t' | '\n' |'\r' as c; s >] ->
      comm_loc bp; push_char c; next_token s
-  | [< ''$'; t = parse_after_dollar bp >] ep ->
+  | [< ''$' as c; t = parse_after_special c bp >] ep ->
      comment_stop bp; (t, (ep, bp))
-  | [< ''.' as c; t = parse_after_dot bp c >] ep ->
+  | [< ''.' as c; t = parse_after_special c bp; s >] ep ->
     comment_stop bp;
-      if Flags.do_beautify() & t=("",".") then between_com := true;
+      (* We enforce that "." should either be part of a larger keyword,
+         for instance ".(", or followed by a blank or eof. *)
+      if t = KEYWORD "." then begin
+        if not (blank_or_eof s) then err (bp,ep+1) Undefined_token;
+        if Flags.do_beautify() then between_com := true;
+      end;
     (t, (bp,ep))
   | [< ''?'; s >] ep ->
     let t = parse_after_qmark bp s in
    comment_stop bp; (t, (ep, bp))
@@ -456,13 +487,13 @@ let rec next_token = parser bp
      len = ident_tail (store 0 c); s >] ep ->
      let id = get_buff len in
      comment_stop bp;
-      (try ("", find_keyword id s) with Not_found -> ("IDENT", id)), (bp, ep)
+      (try find_keyword id s with Not_found -> IDENT id), (bp, ep)
   | [< ' ('0'..'9' as c); len = number (store 0 c) >] ep ->
      comment_stop bp;
-      (("INT", get_buff len), (bp, ep))
-  | [< ''\"'; len = string false bp 0 >] ep ->
+      (INT (get_buff len), (bp, ep))
+  | [< ''\"'; len = string None bp 0 >] ep ->
      comment_stop bp;
-      (("STRING", get_buff len), (bp, ep))
+      (STRING (get_buff len), (bp, ep))
   | [< ' ('(' as c);
      t = parser
        | [< ''*'; s >] ->
@@ -479,62 +510,53 @@ let rec next_token = parser bp
        let id = get_buff len in
        let ep = Stream.count s in
        comment_stop bp;
-        (try ("",find_keyword id s) with Not_found -> ("IDENT",id)), (bp, ep)
+        (try find_keyword id s with Not_found -> IDENT id), (bp, ep)
    | AsciiChar | Utf8Token ((UnicodeSymbol | UnicodeIdentPart), _) ->
        let t = process_chars bp (Stream.next s) s in
        comment_stop bp; t
    | EmptyStream ->
-        comment_stop bp; (("EOI", ""), (bp, bp + 1))
+        comment_stop bp; (EOI, (bp, bp + 1))
+
+(* (* Debug: uncomment this for tracing tokens seen by coq...*)
+let next_token s =
+  let (t,(bp,ep)) = next_token s in Printf.eprintf "[%s]\n%!" (Tok.to_string t);
+  (t,(bp,ep))
+*)
 
 (* Location table system for creating tables associating a token count
    to its location in a char stream (the source) *)
 
 let locerr () = invalid_arg "Lexer: location function"
 
-let tsz = 256 (* up to 2^29 entries on a 32-bit machine, 2^61 on 64-bit *)
-
-let loct_create () = ref [| [| |] |]
+let loct_create () = Hashtbl.create 207
 
 let loct_func loct i =
-  match
-    if i < 0 || i/tsz >= Array.length !loct then None
-    else if !loct.(i/tsz) = [| |] then None
-    else !loct.(i/tsz).(i mod tsz)
-  with
-    | Some loc -> Util.make_loc loc
-    | _ -> locerr ()
-
-let loct_add loct i loc =
-  while i/tsz >= Array.length !loct do
-    let new_tmax = Array.length !loct * 2 in
-    let new_loct = Array.make new_tmax [| |] in
-    Array.blit !loct 0 new_loct 0 (Array.length !loct);
-    loct := new_loct;
-  done;
-  if !loct.(i/tsz) = [| |] then !loct.(i/tsz) <- Array.make tsz None;
-  !loct.(i/tsz).(i mod tsz) <- Some loc
-
-let current_location_table = ref (ref [| [| |] |])
-
-let location_function n =
-  loct_func !current_location_table n
+  try Hashtbl.find loct i with Not_found -> locerr ()
 
-let func cs =
-  let loct = loct_create () in
-  let ts =
-    Stream.from
-      (fun i ->
-         let (tok, loc) = next_token cs in
-         loct_add loct i loc; Some tok)
-  in
-  current_location_table := loct;
-  (ts, loct_func loct)
+let loct_add loct i loc = Hashtbl.add loct i loc
+
+let current_location_table = ref (loct_create ())
 
-type location_table = (int * int) option array array ref
+type location_table = (int, loc) Hashtbl.t
 let location_table () = !current_location_table
 let restore_location_table t = current_location_table := t
+let location_function n = loct_func !current_location_table n
 
-(* Names of tokens, for this lexer, used in Grammar error messages *)
+(** {6 The lexer of Coq} *)
+
+(** Note: removing a token.
+   We do nothing because [remove_token] is called only when removing a grammar
+   rule with [Grammar.delete_rule]. The latter command is called only when
+   unfreezing the state of the grammar entries (see GRAMMAR summary, file
+   env/metasyntax.ml). Therefore, instead of removing tokens one by one,
+   we unfreeze the state of the lexer. This restores the behaviour of the
+   lexer. B.B. *)
+
+IFDEF CAMLP5 THEN
+
+type te = Tok.t
+
+(** Names of tokens, for this lexer, used in Grammar error messages *)
 
 let token_text = function
   | ("", t) -> "'" ^ t ^ "'"
@@ -547,43 +569,65 @@ let token_text = function
   | (con, "") -> con
   | (con, prm) -> con ^ " \"" ^ prm ^ "\""
 
-(* The lexer of Coq *)
-
-(* Note: removing a token.
-   We do nothing because [remove_token] is called only when removing a grammar
-   rule with [Grammar.delete_rule]. The latter command is called only when
-   unfreezing the state of the grammar entries (see GRAMMAR summary, file
-   env/metasyntax.ml). Therefore, instead of removing tokens one by one,
-   we unfreeze the state of the lexer. This restores the behaviour of the
-   lexer. B.B. *)
-
-IFDEF CAMLP5 THEN
+let func cs =
+  let loct = loct_create () in
+  let ts =
+    Stream.from
+      (fun i ->
+         let (tok, loc) = next_token cs in
+         loct_add loct i (make_loc loc); Some tok)
+  in
+  current_location_table := loct;
+  (ts, loct_func loct)
 
 let lexer = {
   Token.tok_func = func;
-  Token.tok_using = add_token;
+  Token.tok_using =
+    (fun pat -> match Tok.of_pattern pat with
+       | KEYWORD s -> add_keyword s
+       | _ -> ());
   Token.tok_removing = (fun _ -> ());
-  Token.tok_match = default_match;
+  Token.tok_match = Tok.match_pattern;
   Token.tok_comm = None;
   Token.tok_text = token_text }
 
-ELSE
-
-let lexer = {
-  Token.func = func;
-  Token.using = add_token;
-  Token.removing = (fun _ -> ());
-  Token.tparse = (fun _ -> None);
-  Token.text = token_text }
+ELSE (* official camlp4 for ocaml >= 3.10 *)
+
+module M_ = Camlp4.ErrorHandler.Register (Error)
+
+module Loc = Loc
+module Token = struct
+  include Tok (* Cf. tok.ml *)
+  module Loc = Loc
+  module Error = Camlp4.Struct.EmptyError
+  module Filter = struct
+    type token_filter = (Tok.t * Loc.t) Stream.t -> (Tok.t * Loc.t) Stream.t
+    type t = unit
+    let mk _is_kwd = ()
+    let keyword_added () kwd _ = add_keyword kwd
+    let keyword_removed () _ = ()
+    let filter () x = x
+    let define_filter () _ = ()
+  end
+end
+
+let mk () _init_loc(*FIXME*) cs =
+  let loct = loct_create () in
+  let rec self =
+    parser i
+      [< (tok, loc) = next_token; s >] ->
+        let loc = make_loc loc in
+        loct_add loct i loc;
+        [< '(tok, loc); self s >]
+    | [< >] -> [< >]
+  in current_location_table := loct; self cs
 
 END
 
-(* Terminal symbols interpretation *)
+(** Terminal symbols interpretation *)
 
 let is_ident_not_keyword s =
-  match s.[0] with
-    | 'a'..'z' | 'A'..'Z' | '_' -> not (is_keyword s)
-    | _ -> false
+  is_ident s && not (is_keyword s)
 
 let is_number s =
   let rec aux i =
@@ -613,6 +657,6 @@ let strip s =
 let terminal s =
   let s = strip s in
   if s = "" then failwith "empty token";
-  if is_ident_not_keyword s then ("IDENT", s)
-  else if is_number s then ("INT", s)
-  else ("", s)
+  if is_ident_not_keyword s then IDENT s
+  else if is_number s then INT s
+  else KEYWORD s
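The keyword dictionary described in the file's opening comment ("trees annotated with string options, each node being a map from chars to dictionaries") is easy to picture in isolation. The OCaml sketch below is illustrative only: it borrows the node/branch vocabulary but is not the ttree code touched by this patch.

(* Sketch only: a trie whose nodes carry an optional recognized string plus a
   map from chars to subtrees, in the spirit of the ttree comment above. *)
module CharMap = Map.Make (Char)

type ttree = { node : string option; branch : ttree CharMap.t }

let empty = { node = None; branch = CharMap.empty }

let add tt str =
  let rec ins tt i =
    if i = String.length str then { tt with node = Some str }
    else
      let c = str.[i] in
      let sub = try CharMap.find c tt.branch with Not_found -> empty in
      { tt with branch = CharMap.add c (ins sub (i + 1)) tt.branch }
  in
  ins tt 0

let mem tt str =
  let rec look tt i =
    if i = String.length str then tt.node <> None
    else
      match CharMap.find_opt str.[i] tt.branch with
      | None -> false
      | Some sub -> look sub (i + 1)
  in
  look tt 0

let () =
  let tt = List.fold_left add empty ["fun"; "forall"; "->"] in
  assert (mem tt "forall");
  (* "for" is only a prefix, so it is not a recognized keyword *)
  assert (not (mem tt "for"))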
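The least visible change in the hunk around loct_create is the data-structure swap: the chunked, growable array indexed by i/tsz is replaced by a plain Hashtbl keyed by token number. A minimal standalone sketch of that shape follows; it reuses the loct_* names for readability, and its loc type is just an integer pair rather than Coq's location type.

(* Sketch only: a token-index -> location table backed by Hashtbl, mirroring
   the create/add/lookup trio used in the patch. *)
type loc = int * int

let loct_create () : (int, loc) Hashtbl.t = Hashtbl.create 207

let loct_add loct i loc = Hashtbl.add loct i loc

let loct_func loct i =
  try Hashtbl.find loct i
  with Not_found -> invalid_arg "location function"

let () =
  let loct = loct_create () in
  loct_add loct 0 (0, 3);
  loct_add loct 1 (4, 9);
  let (bp, ep) = loct_func loct 1 in
  Printf.printf "token 1 spans [%d,%d)\n" bp ep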
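The overall direction of the patch, replacing string pairs such as ("IDENT", s) with real constructors of Tok.t (KEYWORD, IDENT, INT, ...), shows up in miniature in the final terminal hunk. The toy version below makes the same classification decision under assumed simplifications: a stand-in tok variant instead of Coq's Tok module, and a keyword list instead of the ttree.

(* Sketch only: classify a terminal string into a variant token, in the
   style of the new [terminal], with simplified helpers. *)
type tok =
  | KEYWORD of string
  | IDENT of string
  | INT of string

let is_number s =
  let rec aux i =
    i = String.length s ||
    (match s.[i] with '0'..'9' -> aux (i + 1) | _ -> false)
  in
  s <> "" && aux 0

let is_ident s =
  s <> "" &&
  (match s.[0] with 'a'..'z' | 'A'..'Z' | '_' -> true | _ -> false)

let terminal keywords s =
  if is_ident s && not (List.mem s keywords) then IDENT s
  else if is_number s then INT s
  else KEYWORD s

let () =
  match terminal ["forall"] "x0" with
  | IDENT id -> print_endline ("ident: " ^ id)
  | _ -> print_endline "not an ident"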