About "unsupported" unicode characters in notations.

- 8.2 (bug-fix): reverted check for unicode early at notation definition time (an unsupported "cadratin" space, 0x2003, was used in CoRN!) [by the way, what to do with unicode spacing characters in general?] - trunk: improved error message, removed redundant code git-svn-id: svn+ssh://scm.gforge.inria.fr/svn/coq/trunk@13561 85f007b7-540e-0410-9357-904b9bb8a0f7
author: herbelin <herbelin@85f007b7-540e-0410-9357-904b9bb8a0f7> 2010-10-17 19:59:39 +0000
committer: herbelin <herbelin@85f007b7-540e-0410-9357-904b9bb8a0f7> 2010-10-17 19:59:39 +0000
commit: 2521bbc7e9805fd57d2852c1e9631250def11d57 (patch)
tree: 6cdc089a8a07a033bc5089edb5bbaa3adb193412 /parsing
parent: d25dfc1a30ab227178f8cd4304f28a35f4d7e19d (diff)
1 files changed, 25 insertions, 13 deletions
diff --git a/parsing/lexer.ml4 b/parsing/lexer.ml4
index 3c392acb6..5e2b10a7a 100644
--- a/parsing/lexer.ml4
+++ b/parsing/lexer.ml4
@@ -79,6 +79,7 @@ module Error = struct
     | Unterminated_string
     | Undefined_token
     | Bad_token of string
+    | UnsupportedUnicode of int
 
   exception E of t
 
@@ -89,7 +90,9 @@ module Error = struct
 	 | Unterminated_comment -> "Unterminated comment"
 	 | Unterminated_string -> "Unterminated string"
 	 | Undefined_token -> "Undefined token"
-	 | Bad_token tok -> Format.sprintf "Bad token %S" tok)
+	 | Bad_token tok -> Format.sprintf "Bad token %S" tok
+	 | UnsupportedUnicode x ->
+	     Printf.sprintf "Unsupported Unicode character (0x%x)" x)
 
   let print ppf x = Format.fprintf ppf "%s@." (to_string x)
 
@@ -107,9 +110,9 @@ type token_kind =
   | AsciiChar
   | EmptyStream
 
-let error_unsupported_unicode_character n cs =
+let error_unsupported_unicode_character n unicode cs =
   let bp = Stream.count cs in
-  err (bp,bp+n) (Bad_token "Unsupported Unicode character")
+  err (bp,bp+n) (UnsupportedUnicode unicode)
 
 let error_utf8 cs =
   let bp = Stream.count cs in
@@ -158,7 +161,8 @@ let lookup_utf8_tail c cs =
       | _ -> error_utf8 cs
     in
     try classify_unicode unicode, n
-    with UnsupportedUtf8 -> njunk n cs; error_unsupported_unicode_character n cs
+    with UnsupportedUtf8 ->
+      njunk n cs; error_unsupported_unicode_character n unicode cs
 
 let lookup_utf8 cs =
   match Stream.peek cs with
@@ -166,17 +170,26 @@ let lookup_utf8 cs =
     | Some ('\x80'..'\xFF' as c) -> Utf8Token (lookup_utf8_tail c cs)
     | None -> EmptyStream
 
-let check_special_token str =
+let unlocated f x =
+  try f x with Loc.Exc_located (_,exc) -> raise exc
+
+let check_keyword str =
   let rec loop_symb = parser
     | [< ' (' ' | '\n' | '\r' | '\t' | '"') >] -> bad_token str
     | [< s >] ->
-	match lookup_utf8 s with
+	match unlocated lookup_utf8 s with
 	| Utf8Token (_,n) -> njunk n s; loop_symb s
 	| AsciiChar -> Stream.junk s; loop_symb s
 	| EmptyStream -> ()
   in
   loop_symb (Stream.of_string str)
 
+let check_keyword_to_add s =
+  try check_keyword s
+  with Error.E (UnsupportedUnicode unicode) ->
+    Flags.if_verbose msg_warning
+      (strbrk (Printf.sprintf "Token '%s' contains unicode character 0x%x which will not be parsable." s unicode))
+
 let check_ident str =
   let rec loop_id intail = parser
     | [< ' ('a'..'z' | 'A'..'Z' | '_'); s >] ->
@@ -184,7 +197,7 @@ let check_ident str =
     | [< ' ('0'..'9' | ''') when intail; s >] ->
         loop_id true s
     | [< s >] ->
-	match lookup_utf8 s with
+	match unlocated lookup_utf8 s with
 	| Utf8Token (UnicodeLetter, n) -> njunk n s; loop_id true s
 	| Utf8Token (UnicodeIdentPart, n) when intail -> njunk n s; loop_id true s
 	| EmptyStream -> ()
@@ -192,10 +205,6 @@ let check_ident str =
   in
   loop_id false (Stream.of_string str)
 
-let check_keyword str =
-  try check_special_token str
-  with Error.E _ -> check_ident str
-
 (* Keyword and symbol dictionary *)
 let token_tree = ref empty_ttree
 
@@ -204,8 +213,11 @@ let is_keyword s =
   with Not_found -> false
 
 let add_keyword str =
-  check_keyword str;
-  token_tree := ttree_add !token_tree str
+  if not (is_keyword str) then
+    begin
+      check_keyword_to_add str;
+      token_tree := ttree_add !token_tree str
+    end
 
 let remove_keyword str =
   token_tree := ttree_remove !token_tree str
author	herbelin <herbelin@85f007b7-540e-0410-9357-904b9bb8a0f7>	2010-10-17 19:59:39 +0000
committer	herbelin <herbelin@85f007b7-540e-0410-9357-904b9bb8a0f7>	2010-10-17 19:59:39 +0000
commit	2521bbc7e9805fd57d2852c1e9631250def11d57 (patch)
tree	6cdc089a8a07a033bc5089edb5bbaa3adb193412 /parsing
parent	d25dfc1a30ab227178f8cd4304f28a35f4d7e19d (diff)