1 files changed, 59 insertions, 62 deletions
diff --git a/parsing/cLexer.ml4 b/parsing/cLexer.ml4
index 181c4b7fd..79771f3f6 100644
--- a/parsing/cLexer.ml4
+++ b/parsing/cLexer.ml4
@@ -69,6 +69,15 @@ let ttree_remove ttree str =
   in
   remove ttree 0
 
+let ttree_elements ttree =
+  let rec elts tt accu =
+    let accu = match tt.node with
+    | None -> accu
+    | Some s -> CString.Set.add s accu
+    in
+    CharMap.fold (fun _ tt accu -> elts tt accu) tt.branch accu
+  in
+  elts ttree CString.Set.empty
 
 (* Errors occurring while lexing (explained as "Lexer error: ...") *)
 
@@ -101,14 +110,6 @@ module Error = struct
 end
 open Error
 
-let current_file = ref ""
-
-let get_current_file () =
-  !current_file
-
-let set_current_file ~fname =
-  current_file := fname
-
 let err loc str = Loc.raise (Compat.to_coqloc loc) (Error.E str)
 
 let bad_token str = raise (Error.E (Bad_token str))
@@ -120,11 +121,6 @@ type token_kind =
   | AsciiChar
   | EmptyStream
 
-let error_unsupported_unicode_character loc n unicode cs =
-  let bp = Stream.count cs in
-  let loc = set_loc_pos loc bp (bp+n) in
-  err loc (UnsupportedUnicode unicode)
-
 let error_utf8 loc cs =
   let bp = Stream.count cs in
   Stream.junk cs; (* consume the char to avoid read it and fail again *)
@@ -174,14 +170,12 @@ let lookup_utf8_tail loc c cs =
           (Char.code c3 land 0x3F) lsl 6 + (Char.code c4 land 0x3F)
       | _ -> error_utf8 loc cs
     in
-    try Unicode.classify unicode, n
-    with Unicode.Unsupported ->
-      njunk n cs; error_unsupported_unicode_character loc n unicode cs
+    Utf8Token (Unicode.classify unicode, n)
 
 let lookup_utf8 loc cs =
   match Stream.peek cs with
     | Some ('\x00'..'\x7F') -> AsciiChar
-    | Some ('\x80'..'\xFF' as c) -> Utf8Token (lookup_utf8_tail loc c cs)
+    | Some ('\x80'..'\xFF' as c) -> lookup_utf8_tail loc c cs
     | None -> EmptyStream
 
 let unlocated f x = f x
@@ -199,17 +193,6 @@ let check_keyword str =
   in
   loop_symb (Stream.of_string str)
 
-let warn_unparsable_keyword =
-  CWarnings.create ~name:"unparsable-keyword" ~category:"parsing"
-         (fun (s,unicode) ->
-          strbrk (Printf.sprintf "Token '%s' contains unicode character 0x%x \
-                                  which will not be parsable." s unicode))
-            
-let check_keyword_to_add s =
-  try check_keyword s
-  with Error.E (UnsupportedUnicode unicode) ->
-    warn_unparsable_keyword (s,unicode)
-
 let check_ident str =
   let rec loop_id intail = parser
     | [< ' ('a'..'z' | 'A'..'Z' | '_'); s >] ->
@@ -240,13 +223,15 @@ let is_keyword s =
 let add_keyword str =
   if not (is_keyword str) then
     begin
-      check_keyword_to_add str;
+      check_keyword str;
       token_tree := ttree_add !token_tree str
     end
 
 let remove_keyword str =
   token_tree := ttree_remove !token_tree str
 
+let keywords () = ttree_elements !token_tree
+
 (* Freeze and unfreeze the state of the lexer *)
 type frozen_t = ttree
 
@@ -270,6 +255,12 @@ let get_buff len = String.sub !buff 0 len
 
 (* The classical lexer: idents, numbers, quoted strings, comments *)
 
+let warn_unrecognized_unicode =
+  CWarnings.create ~name:"unrecognized-unicode" ~category:"parsing"
+         (fun (u,id) ->
+          strbrk (Printf.sprintf "Not considering unicode character \"%s\" of unknown \
+                                  lexical status as part of identifier \"%s\"." u id))
+
 let rec ident_tail loc len = parser
   | [< ' ('a'..'z' | 'A'..'Z' | '0'..'9' | ''' | '_' as c); s >] ->
       ident_tail loc (store len c) s
@@ -277,6 +268,10 @@ let rec ident_tail loc len = parser
       match lookup_utf8 loc s with
       | Utf8Token ((Unicode.IdentPart | Unicode.Letter), n) ->
           ident_tail loc (nstore n len s) s
+      | Utf8Token (Unicode.Unknown, n) ->
+          let id = get_buff len in
+          let u = String.concat "" (List.map (String.make 1) (Stream.npeek n s)) in
+          warn_unrecognized_unicode ~loc:!@loc (u,id); len
       | _ -> len
 
 let rec number len = parser
@@ -334,6 +329,9 @@ let rec string loc ~comm_level bp len = parser
 (* Hook for exporting comment into xml theory files *)
 let (f_xml_output_comment, xml_output_comment) = Hook.make ~default:ignore ()
 
+(* To associate locations to a file name *)
+let current_file = ref None
+
 (* Utilities for comments in beautify *)
 let comment_begin = ref None
 let comm_loc bp = match !comment_begin with
@@ -354,16 +352,20 @@ let rec split_comments comacc acc pos = function
 
 let extract_comments pos = split_comments [] [] pos !comments
 
-type comments_state = int option * string * bool * ((int * int) * string) list
-let restore_comments_state (o,s,b,c) =
+(* The state of the lexer visible from outside *)
+type lexer_state = int option * string * bool * ((int * int) * string) list * string option
+
+let init_lexer_state f = (None,"",true,[],f)
+let set_lexer_state (o,s,b,c,f) =
   comment_begin := o;
   Buffer.clear current_comment; Buffer.add_string current_comment s;
   between_commands := b;
-  comments := c
-let default_comments_state = (None,"",true,[])
-let comments_state () =
-  let s = (!comment_begin, Buffer.contents current_comment, !between_commands, !comments) in
-  restore_comments_state default_comments_state; s
+  comments := c;
+  current_file := f
+let release_lexer_state () =
+  (!comment_begin, Buffer.contents current_comment, !between_commands, !comments, !current_file)
+let drop_lexer_state () =
+    set_lexer_state (init_lexer_state None)
 
 let real_push_char c = Buffer.add_char current_comment c
 
@@ -391,7 +393,7 @@ let comment_stop ep =
   if !Flags.xml_export && Buffer.length current_comment > 0 &&
     (!between_commands || not(null_comment current_s)) then
       Hook.get f_xml_output_comment current_s;
-  (if Flags.do_beautify() && Buffer.length current_comment > 0 &&
+  (if !Flags.beautify && Buffer.length current_comment > 0 &&
     (!between_commands || not(null_comment current_s)) then
     let bp = match !comment_begin with
         Some bp -> bp
@@ -408,7 +410,7 @@ let comment_stop ep =
 
 (* Does not unescape!!! *)
 let rec comm_string loc bp = parser
-  | [< ''"' >] ep -> push_string "\""; loc
+  | [< ''"' >] -> push_string "\""; loc
   | [< ''\\'; loc =
            (parser [< ' ('"' | '\\' as c) >] ->
               let () = match c with
@@ -438,7 +440,7 @@ let rec comment loc bp = parser bp2
       let loc =
 	(* In beautify mode, the lexing differs between strings in comments and
 	   regular strings (e.g. escaping). It seems wrong. *)
-	if Flags.do_beautify() then (push_string"\""; comm_string loc bp2 s)
+	if !Flags.beautify then (push_string"\""; comm_string loc bp2 s)
 	else fst (string loc ~comm_level:(Some 0) bp2 0 s)
       in
       comment loc bp s
@@ -511,20 +513,19 @@ let process_chars loc bp c cs =
 	let loc = set_loc_pos loc bp ep' in
         err loc Undefined_token
 
-let token_of_special c s = match c with
-  | '.' -> FIELD s
-  | _ -> assert false
+(* Parse what follows a dot *)
 
-(* Parse what follows a dot / a dollar *)
-
-let parse_after_special loc c bp =
+let parse_after_dot loc c bp =
   parser
-  | [< ' ('a'..'z' | 'A'..'Z' | '_' as d); len = ident_tail loc (store 0 d) >] ->
-      token_of_special c (get_buff len)
+  | [< ' ('a'..'z' | 'A'..'Z' | '_' as d); len = ident_tail loc (store 0 d); s >] ->
+      let field = get_buff len in
+      (try find_keyword loc ("."^field) s with Not_found -> FIELD field)
   | [< s >] ->
       match lookup_utf8 loc s with
       | Utf8Token (Unicode.Letter, n) ->
-          token_of_special c (get_buff (ident_tail loc (nstore n 0 s) s))
+          let len = ident_tail loc (nstore n 0 s) s in
+          let field = get_buff len in
+          (try find_keyword loc ("."^field) s with Not_found -> FIELD field)
       | AsciiChar | Utf8Token _ | EmptyStream -> fst (process_chars loc bp c s)
 
 (* Parse what follows a question mark *)
@@ -552,7 +553,7 @@ let rec next_token loc = parser bp
       comm_loc bp; push_char c; next_token (bump_loc_line loc ep) s
   | [< '' ' | '\t' | '\r' as c; s >] ->
       comm_loc bp; push_char c; next_token loc s
-  | [< ''.' as c; t = parse_after_special loc c bp; s >] ep ->
+  | [< ''.' as c; t = parse_after_dot loc c bp; s >] ep ->
       comment_stop bp;
       (* We enforce that "." should either be part of a larger keyword,
          for instance ".(", or followed by a blank or eof. *)
@@ -572,7 +573,7 @@ let rec next_token loc = parser bp
       comment_stop bp; between_commands := new_between_commands; t
   | [< ''?'; s >] ep ->
       let t = parse_after_qmark loc bp s in
-      comment_stop bp; (t, set_loc_pos loc ep bp)
+      comment_stop bp; (t, set_loc_pos loc bp ep)
   | [< ' ('a'..'z' | 'A'..'Z' | '_' as c);
        len = ident_tail loc (store 0 c); s >] ep ->
       let id = get_buff len in
@@ -592,6 +593,12 @@ let rec next_token loc = parser bp
             let loc = comment loc bp s in next_token loc s
         | [< t = process_chars loc bp c >] -> comment_stop bp; t >] ->
       t
+  | [< ' ('{' | '}' as c); s >] ep ->
+      let t,new_between_commands =
+        if !between_commands then (KEYWORD (String.make 1 c), set_loc_pos loc bp ep), true
+        else process_chars loc bp c s, false
+      in
+      comment_stop bp; between_commands := new_between_commands; t
   | [< s >] ->
       match lookup_utf8 loc s with
         | Utf8Token (Unicode.Letter, n) ->
@@ -600,11 +607,9 @@ let rec next_token loc = parser bp
             let ep = Stream.count s in
             comment_stop bp;
             (try find_keyword loc id s with Not_found -> IDENT id), set_loc_pos loc bp ep
-        | AsciiChar | Utf8Token ((Unicode.Symbol | Unicode.IdentPart), _) ->
+        | AsciiChar | Utf8Token ((Unicode.Symbol | Unicode.IdentPart | Unicode.Unknown), _) ->
             let t = process_chars loc bp (Stream.next s) s in
-            let new_between_commands = match t with
-              (KEYWORD ("{"|"}"),_) -> !between_commands | _ -> false in
-            comment_stop bp; between_commands := new_between_commands; t
+            comment_stop bp; t
         | EmptyStream ->
             comment_stop bp; (EOI, set_loc_pos loc bp (bp+1))
 
@@ -626,12 +631,6 @@ let loct_func loct i =
 
 let loct_add loct i loc = Hashtbl.add loct i loc
 
-let current_location_table = ref (loct_create ())
-
-type location_table = (int, Compat.CompatLoc.t) Hashtbl.t
-let location_table () = !current_location_table
-let restore_location_table t = current_location_table := t
-
 (** {6 The lexer of Coq} *)
 
 (** Note: removing a token.
@@ -669,7 +668,6 @@ let func cs =
 	 cur_loc := Compat.after loc;
          loct_add loct i loc; Some tok)
   in
-  current_location_table := loct;
   (ts, loct_func loct)
 
 let lexer = {
@@ -706,7 +704,6 @@ end
 let mk () =
   let loct = loct_create () in
   let cur_loc = ref (Compat.make_loc !current_file 1 0 0 0) in
-  current_location_table := loct; 
   let rec self init_loc (* FIXME *) =
     parser i
        [< (tok, loc) = next_token !cur_loc; s >] ->