Ajout de nouvelles plages de symboles unicode; prise en compte des indices unicode et letter-like dans les identificateurs

git-svn-id: svn+ssh://scm.gforge.inria.fr/svn/coq/trunk@7870 85f007b7-540e-0410-9357-904b9bb8a0f7
author: herbelin <herbelin@85f007b7-540e-0410-9357-904b9bb8a0f7> 2006-01-15 20:29:09 +0000
committer: herbelin <herbelin@85f007b7-540e-0410-9357-904b9bb8a0f7> 2006-01-15 20:29:09 +0000
commit: 66b97c0035e0a5f2609cf3743cceeadf26600039 (patch)
tree: 69e5572ade1ec93fcef6ed92d09bd2f8026b88fa /parsing
parent: 80c3103a85deb61ed37963bb114dad5901ca2c39 (diff)
1 files changed, 57 insertions, 35 deletions
diff --git a/parsing/lexer.ml4 b/parsing/lexer.ml4
index be52fa5f1..2c17590a3 100644
--- a/parsing/lexer.ml4
+++ b/parsing/lexer.ml4
@@ -82,16 +82,19 @@ let check_ident str =
   let rec loop_id = parser
     | [< ' ('$' | 'a'..'z' | 'A'..'Z' | '0'..'9' | ''' | '_'); s >] ->
         loop_id s
-    (* Greek utf-8 letters [CE80-CEBF and CF80-CFBF] (CE=206; BF=191) *)
-    | [< ' ('\206' | '\207'); ' ('\128'..'\191'); s >] -> loop_id s
-    | [< ''\226'; 'c2; 'c3; s >] ->
+    (* utf-8 Greek letters U0380-03FF *)
+    | [< ' ('\xCE' | '\xCF'); ' ('\x80'..'\xBF'); s >] -> loop_id s
+    | [< ''\xE2'; 'c2; 'c3; s >] ->
 	  (match c2, c3 with
-	      (* utf8 letter-like unicode 2100-214F *) 
-	    | (('\132', '\128'..'\191') | ('\133', '\128'..'\143')) ->
+	      (* utf-8 letter-like U2100-214F *)
+	    | ( ('\x84', '\x80'..'\xBF')
+	      | ('\x85', '\x80'..'\x8F')
+	      (* utf-8 subscript U2080-2089 *) 
+	      | ('\x82', '\x80'..'\x89')) ->
                 loop_id s 
-		(* utf8 symbols (see [parse_226_tail]) *)
-	    | (('\134'..'\143' | '\152'..'\155'
-               | '\164'..'\165' | '\168'..'\171'),_) ->
+	      (* utf-8 symbols (see [parse_226_tail]) *)
+	    | (('\x86'..'\x8F' | '\x94'..'\x9B'
+               | '\xA4'..'\xA5' | '\xA8'..'\xAB'),_) ->
 		bad_token str
 	    | _ -> 
 		bad_token str)
@@ -169,11 +172,23 @@ let get_buff len = String.sub !buff 0 len
 let rec ident_tail len = parser
   | [< ' ('a'..'z' | 'A'..'Z' | '0'..'9' | ''' | '_' as c); s >] ->
       ident_tail (store len c) s
-  (* Greek utf-8 letters [CE80-CEBF and CF80-CFBF] (CE=206; BF=191) *)
-  | [< ' ('\206' | '\207' as c1); ' ('\128'..'\191' as c2) ; s >] ->
+  (* utf-8 Greek letters U0380-03FF *)
+  | [< ' ('\xCE' | '\xCF' as c1); ' ('\x80'..'\xBF' as c2) ; s >] ->
       ident_tail (store (store len c1) c2) s
-  | [< >] -> len
-
+  | [< s >] ->
+      match Stream.peek s with
+      | Some '\xE2' -> 
+	  (match List.tl (Stream.npeek 3 s) with
+	    (* utf-8 subscript U2080-2089 *) 
+	  | ['\x82' as c2; ('\x80'..'\x89' as c3)]
+	    (* utf-8 letter-like U2100-214F part 1 *)
+	  | ['\x84' as c2; ('\x80'..'\xBF' as c3)]
+	    (* utf-8 letter-like U2100-214F part 2 *)
+	  | ['\x85' as c2; ('\x80'..'\x8F' as c3)] ->
+	    Stream.junk s; Stream.junk s; Stream.junk s;
+	    ident_tail (store (store (store len '\xE2') c2) c3) s
+	  | _ -> len)
+      | _ -> len
 
 let rec number len = parser
   | [< ' ('0'..'9' as c); s >] -> number (store len c) s
@@ -273,7 +288,7 @@ let rec comment bp = parser bp2
       else ignore (string bp2 0 s);
       comment bp s
   | [< _ = Stream.empty >] ep -> err (bp, ep) Unterminated_comment
-  | [< '_ as z; s >] -> real_push_char z; comment bp s
+  | [< 'z; s >] -> real_push_char z; comment bp s
 
 (* Parse a special token, using the [token_tree] *)
 
@@ -307,42 +322,49 @@ type token_226_tail =
   | TokSymbol of string option
   | TokIdent of string
 
+(* 1110xxxx 10yyyyzz 10zztttt utf-8 codes for xxxx=0010 *)
 let parse_226_tail tk = parser
-  | [< ''\132' as c2; ' ('\128'..'\191' as c3);
-      (* utf8 letter-like unicode 2100-214F *)
-      len = ident_tail (store (store (store 0 '\226') c2) c3) >] ->
+  | [< ''\x82' as c2; ' ('\x80'..'\x89' as c3);
+      (* utf-8 subscript U2080-2089 *) 
+      len = ident_tail (store (store (store 0 '\xE2') c2) c3) >] ->
       TokIdent (get_buff len) 
-  | [< ''\133' as c2; ' ('\128'..'\143' as c3);
-      (* utf8 letter-like unicode 2100-214F *)
-      len = ident_tail (store (store (store 0 '\226') c2) c3) >] ->
+  | [< ''\x84' as c2; ' ('\x80'..'\xBF' as c3);
+      (* utf-8 letter-like U2100-214F part 1 *)
+      len = ident_tail (store (store (store 0 '\xE2') c2) c3) >] ->
       TokIdent (get_buff len) 
-  | [< ' ('\134'..'\143' | '\152'..'\155' 
-         | '\164'..'\165' | '\168'..'\171' as c2); 'c3;
-      (* utf8 arrows A unicode 2190-21FF *)
-      (* utf8 mathematical operators unicode 2200-22FF *)
-      (* utf8 miscellaneous technical unicode 2300-23FF *)
-      (* utf8 miscellaneous symbols unicode 2600-26FF *)
-      (* utf8 arrows B unicode 2900-297F *)
-      (* utf8 mathematical operators unicode 2A00-2AFF *)
+  | [< ''\x85' as c2; ' ('\x80'..'\x8F' as c3);
+      (* utf-8 letter-like U2100-214F part 2 *)
+      len = ident_tail (store (store (store 0 '\xE2') c2) c3) >] ->
+      TokIdent (get_buff len) 
+  | [< ' ('\x86'..'\x8F' | '\x94'..'\x9B' | '\xA4'..'\xA5' 
+         | '\xA8'..'\xAB' as c2); 'c3;
+      (* utf-8 arrows A U2190-21FF *)
+      (* utf-8 mathematical operators U2200-22FF *)
+      (* utf-8 miscellaneous technical U2300-23FF *)
+      (* utf-8 box drawing U2500-257F has ceiling, etc. *)
+      (* utf-8 block elements U2580-259F *)
+      (* utf-8 geom. shapes U25A0-25FF (has triangles, losange, etc) *)
+      (* utf-8 miscellaneous symbols U2600-26FF *)
+      (* utf-8 arrows B U2900-297F *)
+      (* utf-8 mathematical operators U2A00-2AFF *)
       t = special (progress_special c3 (progress_special c2 
-	(progress_special '\226' tk))) >] ->
+	(progress_special '\xE2' tk))) >] ->
       TokSymbol t
   | [< '_; '_ >] ->
       (* Unsupported utf-8 code *)
       TokSymbol None
 
-
 (* Parse what follows a dot *)
 let parse_after_dot bp c = parser
   | [< ' ('a'..'z' | 'A'..'Z' | '_' as c);
     len = ident_tail (store 0 c) >] ->
       ("FIELD", get_buff len)
-  (* Greek utf-8 letters [CE80-CEBF and CF80-CFBF] (CE=206; BF=191) *)
-  | [< ' ('\206' | '\207' as c1); ' ('\128'..'\191' as c2); 
+  (* utf-8 Greek letters U0380-03FF *)
+  | [< ' ('\xCE' | '\xCF' as c1); ' ('\x80'..'\xBF' as c2); 
     len = ident_tail (store (store 0 c1) c2) >] ->
       ("FIELD", get_buff len)
   (* utf-8 mathematical symbols have format E2 xx xx [E2=226] *)
-  | [< ''\226'; t = parse_226_tail 
+  | [< ''\xE2'; t = parse_226_tail 
       (progress_special '.' (Some !token_tree)) >] ep ->
       (match t with
 	| TokSymbol (Some t) -> ("", t)
@@ -367,14 +389,14 @@ let rec next_token = parser bp
       let id = get_buff len in 
       comment_stop bp;
       (try ("", find_keyword id) with Not_found -> ("IDENT", id)), (bp, ep)
-  (* Greek utf-8 letters [CE80-CEBF and CF80-CFBF] (CE=206; BF=191) *)
-  | [< ' ('\206' | '\207' as c1); ' ('\128'..'\191' as c2); 
+  (* utf-8 Greek letters U0380-03FF [CE80-CEBF and CF80-CFBF] *)
+  | [< ' ('\xCE' | '\xCF' as c1); ' ('\x80'..'\xBF' as c2); 
 	   len = ident_tail (store (store 0 c1) c2) >] ep ->
       let id = get_buff len in
       comment_stop bp;
       (try ("", find_keyword id) with Not_found -> ("IDENT", id)), (bp, ep)
   (* utf-8 mathematical symbols have format E2 xx xx [E2=226] *)
-  | [< ''\226'; t = parse_226_tail (Some !token_tree) >] ep ->
+  | [< ''\xE2'; t = parse_226_tail (Some !token_tree) >] ep ->
       comment_stop bp;
       (match t with
 	| TokSymbol (Some t) -> ("", t), (bp, ep)
author	herbelin <herbelin@85f007b7-540e-0410-9357-904b9bb8a0f7>	2006-01-15 20:29:09 +0000
committer	herbelin <herbelin@85f007b7-540e-0410-9357-904b9bb8a0f7>	2006-01-15 20:29:09 +0000
commit	66b97c0035e0a5f2609cf3743cceeadf26600039 (patch)
tree	69e5572ade1ec93fcef6ed92d09bd2f8026b88fa /parsing
parent	80c3103a85deb61ed37963bb114dad5901ca2c39 (diff)