aboutsummaryrefslogtreecommitdiffhomepage
path: root/parsing
diff options
context:
space:
mode:
authorGravatar herbelin <herbelin@85f007b7-540e-0410-9357-904b9bb8a0f7>2006-01-15 20:29:09 +0000
committerGravatar herbelin <herbelin@85f007b7-540e-0410-9357-904b9bb8a0f7>2006-01-15 20:29:09 +0000
commit66b97c0035e0a5f2609cf3743cceeadf26600039 (patch)
tree69e5572ade1ec93fcef6ed92d09bd2f8026b88fa /parsing
parent80c3103a85deb61ed37963bb114dad5901ca2c39 (diff)
Ajout de nouvelles plages de symboles unicode; prise en compte des indices unicode et letter-like dans les identificateurs
git-svn-id: svn+ssh://scm.gforge.inria.fr/svn/coq/trunk@7870 85f007b7-540e-0410-9357-904b9bb8a0f7
Diffstat (limited to 'parsing')
-rw-r--r--parsing/lexer.ml492
1 files changed, 57 insertions, 35 deletions
diff --git a/parsing/lexer.ml4 b/parsing/lexer.ml4
index be52fa5f1..2c17590a3 100644
--- a/parsing/lexer.ml4
+++ b/parsing/lexer.ml4
@@ -82,16 +82,19 @@ let check_ident str =
let rec loop_id = parser
| [< ' ('$' | 'a'..'z' | 'A'..'Z' | '0'..'9' | ''' | '_'); s >] ->
loop_id s
- (* Greek utf-8 letters [CE80-CEBF and CF80-CFBF] (CE=206; BF=191) *)
- | [< ' ('\206' | '\207'); ' ('\128'..'\191'); s >] -> loop_id s
- | [< ''\226'; 'c2; 'c3; s >] ->
+ (* utf-8 Greek letters U0380-03FF *)
+ | [< ' ('\xCE' | '\xCF'); ' ('\x80'..'\xBF'); s >] -> loop_id s
+ | [< ''\xE2'; 'c2; 'c3; s >] ->
(match c2, c3 with
- (* utf8 letter-like unicode 2100-214F *)
- | (('\132', '\128'..'\191') | ('\133', '\128'..'\143')) ->
+ (* utf-8 letter-like U2100-214F *)
+ | ( ('\x84', '\x80'..'\xBF')
+ | ('\x85', '\x80'..'\x8F')
+ (* utf-8 subscript U2080-2089 *)
+ | ('\x82', '\x80'..'\x89')) ->
loop_id s
- (* utf8 symbols (see [parse_226_tail]) *)
- | (('\134'..'\143' | '\152'..'\155'
- | '\164'..'\165' | '\168'..'\171'),_) ->
+ (* utf-8 symbols (see [parse_226_tail]) *)
+ | (('\x86'..'\x8F' | '\x94'..'\x9B'
+ | '\xA4'..'\xA5' | '\xA8'..'\xAB'),_) ->
bad_token str
| _ ->
bad_token str)
@@ -169,11 +172,23 @@ let get_buff len = String.sub !buff 0 len
let rec ident_tail len = parser
| [< ' ('a'..'z' | 'A'..'Z' | '0'..'9' | ''' | '_' as c); s >] ->
ident_tail (store len c) s
- (* Greek utf-8 letters [CE80-CEBF and CF80-CFBF] (CE=206; BF=191) *)
- | [< ' ('\206' | '\207' as c1); ' ('\128'..'\191' as c2) ; s >] ->
+ (* utf-8 Greek letters U0380-03FF *)
+ | [< ' ('\xCE' | '\xCF' as c1); ' ('\x80'..'\xBF' as c2) ; s >] ->
ident_tail (store (store len c1) c2) s
- | [< >] -> len
-
+ | [< s >] ->
+ match Stream.peek s with
+ | Some '\xE2' ->
+ (match List.tl (Stream.npeek 3 s) with
+ (* utf-8 subscript U2080-2089 *)
+ | ['\x82' as c2; ('\x80'..'\x89' as c3)]
+ (* utf-8 letter-like U2100-214F part 1 *)
+ | ['\x84' as c2; ('\x80'..'\xBF' as c3)]
+ (* utf-8 letter-like U2100-214F part 2 *)
+ | ['\x85' as c2; ('\x80'..'\x8F' as c3)] ->
+ Stream.junk s; Stream.junk s; Stream.junk s;
+ ident_tail (store (store (store len '\xE2') c2) c3) s
+ | _ -> len)
+ | _ -> len
let rec number len = parser
| [< ' ('0'..'9' as c); s >] -> number (store len c) s
@@ -273,7 +288,7 @@ let rec comment bp = parser bp2
else ignore (string bp2 0 s);
comment bp s
| [< _ = Stream.empty >] ep -> err (bp, ep) Unterminated_comment
- | [< '_ as z; s >] -> real_push_char z; comment bp s
+ | [< 'z; s >] -> real_push_char z; comment bp s
(* Parse a special token, using the [token_tree] *)
@@ -307,42 +322,49 @@ type token_226_tail =
| TokSymbol of string option
| TokIdent of string
+(* 1110xxxx 10yyyyzz 10zztttt utf-8 codes for xxxx=0010 *)
let parse_226_tail tk = parser
- | [< ''\132' as c2; ' ('\128'..'\191' as c3);
- (* utf8 letter-like unicode 2100-214F *)
- len = ident_tail (store (store (store 0 '\226') c2) c3) >] ->
+ | [< ''\x82' as c2; ' ('\x80'..'\x89' as c3);
+ (* utf-8 subscript U2080-2089 *)
+ len = ident_tail (store (store (store 0 '\xE2') c2) c3) >] ->
TokIdent (get_buff len)
- | [< ''\133' as c2; ' ('\128'..'\143' as c3);
- (* utf8 letter-like unicode 2100-214F *)
- len = ident_tail (store (store (store 0 '\226') c2) c3) >] ->
+ | [< ''\x84' as c2; ' ('\x80'..'\xBF' as c3);
+ (* utf-8 letter-like U2100-214F part 1 *)
+ len = ident_tail (store (store (store 0 '\xE2') c2) c3) >] ->
TokIdent (get_buff len)
- | [< ' ('\134'..'\143' | '\152'..'\155'
- | '\164'..'\165' | '\168'..'\171' as c2); 'c3;
- (* utf8 arrows A unicode 2190-21FF *)
- (* utf8 mathematical operators unicode 2200-22FF *)
- (* utf8 miscellaneous technical unicode 2300-23FF *)
- (* utf8 miscellaneous symbols unicode 2600-26FF *)
- (* utf8 arrows B unicode 2900-297F *)
- (* utf8 mathematical operators unicode 2A00-2AFF *)
+ | [< ''\x85' as c2; ' ('\x80'..'\x8F' as c3);
+ (* utf-8 letter-like U2100-214F part 2 *)
+ len = ident_tail (store (store (store 0 '\xE2') c2) c3) >] ->
+ TokIdent (get_buff len)
+ | [< ' ('\x86'..'\x8F' | '\x94'..'\x9B' | '\xA4'..'\xA5'
+ | '\xA8'..'\xAB' as c2); 'c3;
+ (* utf-8 arrows A U2190-21FF *)
+ (* utf-8 mathematical operators U2200-22FF *)
+ (* utf-8 miscellaneous technical U2300-23FF *)
+ (* utf-8 box drawing U2500-257F has ceiling, etc. *)
+ (* utf-8 block elements U2580-259F *)
+ (* utf-8 geom. shapes U25A0-25FF (has triangles, losange, etc) *)
+ (* utf-8 miscellaneous symbols U2600-26FF *)
+ (* utf-8 arrows B U2900-297F *)
+ (* utf-8 mathematical operators U2A00-2AFF *)
t = special (progress_special c3 (progress_special c2
- (progress_special '\226' tk))) >] ->
+ (progress_special '\xE2' tk))) >] ->
TokSymbol t
| [< '_; '_ >] ->
(* Unsupported utf-8 code *)
TokSymbol None
-
(* Parse what follows a dot *)
let parse_after_dot bp c = parser
| [< ' ('a'..'z' | 'A'..'Z' | '_' as c);
len = ident_tail (store 0 c) >] ->
("FIELD", get_buff len)
- (* Greek utf-8 letters [CE80-CEBF and CF80-CFBF] (CE=206; BF=191) *)
- | [< ' ('\206' | '\207' as c1); ' ('\128'..'\191' as c2);
+ (* utf-8 Greek letters U0380-03FF *)
+ | [< ' ('\xCE' | '\xCF' as c1); ' ('\x80'..'\xBF' as c2);
len = ident_tail (store (store 0 c1) c2) >] ->
("FIELD", get_buff len)
(* utf-8 mathematical symbols have format E2 xx xx [E2=226] *)
- | [< ''\226'; t = parse_226_tail
+ | [< ''\xE2'; t = parse_226_tail
(progress_special '.' (Some !token_tree)) >] ep ->
(match t with
| TokSymbol (Some t) -> ("", t)
@@ -367,14 +389,14 @@ let rec next_token = parser bp
let id = get_buff len in
comment_stop bp;
(try ("", find_keyword id) with Not_found -> ("IDENT", id)), (bp, ep)
- (* Greek utf-8 letters [CE80-CEBF and CF80-CFBF] (CE=206; BF=191) *)
- | [< ' ('\206' | '\207' as c1); ' ('\128'..'\191' as c2);
+ (* utf-8 Greek letters U0380-03FF [CE80-CEBF and CF80-CFBF] *)
+ | [< ' ('\xCE' | '\xCF' as c1); ' ('\x80'..'\xBF' as c2);
len = ident_tail (store (store 0 c1) c2) >] ep ->
let id = get_buff len in
comment_stop bp;
(try ("", find_keyword id) with Not_found -> ("IDENT", id)), (bp, ep)
(* utf-8 mathematical symbols have format E2 xx xx [E2=226] *)
- | [< ''\226'; t = parse_226_tail (Some !token_tree) >] ep ->
+ | [< ''\xE2'; t = parse_226_tail (Some !token_tree) >] ep ->
comment_stop bp;
(match t with
| TokSymbol (Some t) -> ("", t), (bp, ep)