1 files changed, 235 insertions, 146 deletions
diff --git a/parsing/lexer.ml4 b/parsing/lexer.ml4
index 6119b86e..c02dc59b 100644
--- a/parsing/lexer.ml4
+++ b/parsing/lexer.ml4
@@ -6,7 +6,7 @@
 (*         *       GNU Lesser General Public License Version 2.1        *)
 (************************************************************************)
 
-(*i $Id: lexer.ml4 7870 2006-01-15 20:29:09Z herbelin $ i*)
+(*i $Id: lexer.ml4 8924 2006-06-08 17:49:01Z notin $ i*)
 
 open Pp
 open Token
@@ -54,7 +54,7 @@ let ttree_find ttree str =
   in 
   proc_rec ttree 0
 
-(* Lexer conventions on tokens *)
+(* Errors occuring while lexing (explained as "Lexer error: ...") *)
 
 type error =
   | Illegal_character
@@ -65,8 +65,163 @@ type error =
 
 exception Error of error
 
+let err loc str = Stdpp.raise_with_loc (Util.make_loc loc) (Error str)
+
 let bad_token str = raise (Error (Bad_token str))
 
+(* Lexer conventions on tokens *)
+
+type utf8_token =
+    Utf8Letter of int | Utf8IdentPart of int | Utf8Symbol | AsciiChar
+
+let error_unsupported_unicode_character n cs =
+  let bp = Stream.count cs in
+  err (bp,bp+n) (Bad_token "Unsupported Unicode character")
+
+let error_utf8 cs =
+  let bp = Stream.count cs in
+  err (bp, bp+1) Illegal_character
+
+let njunk n = Util.repeat n Stream.junk
+
+let check_utf8_trailing_byte cs c =
+  if Char.code c land 0xC0 <> 0x80 then error_utf8 cs
+
+(* Recognize utf8 blocks (of length less than 4 bytes) *)
+(* but don't certify full utf8 compliance (e.g. no emptyness check) *)
+let lookup_utf8_tail c cs =
+  let c1 = Char.code c in
+  if c1 land 0x40 = 0 or c1 land 0x38 = 0x38 then error_utf8 cs 
+  else
+    let n, unicode =
+      if c1 land 0x20 = 0 then
+      match Stream.npeek 2 cs with
+      | [_;c2] ->
+	  check_utf8_trailing_byte cs c2;
+	  2, (c1 land 0x1F) lsl 6 + (Char.code c2 land 0x3F)
+      | _ -> error_utf8 cs
+      else if c1 land 0x10 = 0 then
+      match Stream.npeek 3 cs with
+      | [_;c2;c3] ->
+	  check_utf8_trailing_byte cs c2; check_utf8_trailing_byte cs c3;
+	  3, (c1 land 0x0F) lsl 12 + (Char.code c2 land 0x3F) lsl 6 + 
+	  (Char.code c3 land 0x3F)
+      | _ -> error_utf8 cs
+      else match Stream.npeek 4 cs with
+      | [_;c2;c3;c4] ->
+	  check_utf8_trailing_byte cs c2; check_utf8_trailing_byte cs c3;
+	  check_utf8_trailing_byte cs c4; 
+	  4, (c1 land 0x07) lsl 18 + (Char.code c2 land 0x3F) lsl 12 +
+	  (Char.code c3 land 0x3F) lsl 6 + (Char.code c4 land 0x3F)
+      | _ -> error_utf8 cs
+    in
+    match unicode land 0x1F000 with
+    | 0x0 ->
+    begin match unicode with
+    (* utf-8 Latin-1 non breaking space U00A0 *)
+    | 0x00A0 -> Utf8Letter n
+    (* utf-8 Latin-1 symbols U00A1-00BF *)
+    | x when 0x00A0 <= x & x <= 0x00BF -> Utf8Symbol
+    (* utf-8 Latin-1 letters U00C0-00D6 *)
+    | x when 0x00C0 <= x & x <= 0x00D6 -> Utf8Letter n
+    (* utf-8 Latin-1 symbol U00D7 *)
+    | 0x00D7 -> Utf8Symbol
+    (* utf-8 Latin-1 letters U00D8-00F6 *)
+    | x when 0x00D8 <= x & x <= 0x00F6 -> Utf8Letter n
+    (* utf-8 Latin-1 symbol U00F7 *)
+    | 0x00F7 -> Utf8Symbol
+    (* utf-8 Latin-1 letters U00F8-00FF *)
+    | x when 0x00F8 <= x & x <= 0x00FF -> Utf8Letter n
+    (* utf-8 Latin Extended A U0100-017F and Latin Extended B U0180-U0241 *)
+    | x when 0x0100 <= x & x <= 0x0241 -> Utf8Letter n
+    (* utf-8 Phonetic letters U0250-02AF *)
+    | x when 0x0250 <= x & x <= 0x02AF -> Utf8Letter n
+    (* utf-8 what do to with diacritics U0300-U036F ? *)
+    (* utf-8 Greek letters U0380-03FF *)
+    | x when 0x0380 <= x & x <= 0x03FF -> Utf8Letter n
+    (* utf-8 Cyrillic letters U0400-0481 *)
+    | x when 0x0400 <= x & x <= 0x0481 -> Utf8Letter n
+    (* utf-8 Cyrillic symbol U0482 *)
+    | 0x0482 -> Utf8Symbol
+    (* utf-8 what do to with diacritics U0483-U0489 \ U0487 ? *)
+    (* utf-8 Cyrillic letters U048A-U4F9 (Warning: 04CF) *)
+    | x when 0x048A <= x & x <= 0x04F9 -> Utf8Letter n
+    (* utf-8 Cyrillic supplements letters U0500-U050F *)
+    | x when 0x0500 <= x & x <= 0x050F -> Utf8Letter n
+    (* utf-8 Hebrew letters U05D0-05EA *)
+    | x when 0x05D0 <= x & x <= 0x05EA -> Utf8Letter n
+    (* utf-8 Hebrew letters U0621-064A *)
+    | x when 0x0621 <= x & x <= 0x064A -> Utf8Letter n
+    | _ -> error_unsupported_unicode_character n cs
+    end
+    | 0x1000 ->
+    begin match unicode with
+    (* utf-8 Georgian U10A0-10FF (has holes) *)
+    | x when 0x10A0 <= x & x <= 0x10FF -> Utf8Letter n
+    (* utf-8 Hangul Jamo U1100-11FF (has holes) *)
+    | x when 0x1100 <= x & x <= 0x11FF -> Utf8Letter n
+    (* utf-8 Latin additional letters U1E00-1E9B and U1EA0-1EF9 *)
+    | x when 0x1E00 <= x & x <= 0x1E9B -> Utf8Letter n
+    | x when 0x1EA0 <= x & x <= 0x1EF9 -> Utf8Letter n
+    | _ -> error_unsupported_unicode_character n cs
+    end
+    | 0x2000 ->
+    begin match unicode with
+    (* utf-8 general punctuation U2080-2089 *)
+    (* Hyphens *)
+    | x when 0x2010 <= x & x <= 0x2011 -> Utf8Letter n 
+    (* Dashes and other symbols *)
+    | x when 0x2012 <= x & x <= 0x2027 -> Utf8Symbol
+    (* Per mille and per ten thousand signs *)
+    | x when 0x2030 <= x & x <= 0x2031 -> Utf8Symbol
+    (* Prime letters *)
+    | x when 0x2032 <= x & x <= 0x2034 or x = 0x2057 -> Utf8IdentPart n
+    (* Miscellaneous punctuation *)
+    | x when 0x2039 <= x & x <= 0x2056 -> Utf8Symbol
+    | x when 0x2058 <= x & x <= 0x205E -> Utf8Symbol
+    (* Invisible mathematical operators *)
+    | x when 0x2061 <= x & x <= 0x2063 -> Utf8Symbol
+
+    (* utf-8 subscript U2080-2089 *) 
+    | x when 0x2080 <= x & x <= 0x2089 -> Utf8IdentPart n
+    (* utf-8 letter-like U2100-214F *)
+    | x when 0x2100 <= x & x <= 0x214F -> Utf8Letter n
+    (* utf-8 number-forms U2153-2183 *)
+    | x when 0x2153 <= x & x <= 0x2183 -> Utf8Symbol
+    (* utf-8 arrows A U2190-21FF *)
+    (* utf-8 mathematical operators U2200-22FF *)
+    (* utf-8 miscellaneous technical U2300-23FF *)
+    | x when 0x2190 <= x & x <= 0x23FF -> Utf8Symbol
+    (* utf-8 box drawing U2500-257F has ceiling, etc. *)
+    (* utf-8 block elements U2580-259F *)
+    (* utf-8 geom. shapes U25A0-25FF (has triangles, losange, etc) *)
+    (* utf-8 miscellaneous symbols U2600-26FF *)
+    | x when 0x2500 <= x & x <= 0x26FF -> Utf8Symbol
+    (* utf-8 arrows B U2900-297F *)
+    | x when 0x2900 <= x & x <= 0x297F -> Utf8Symbol
+    (* utf-8 mathematical operators U2A00-2AFF *)
+    | x when 0x2A00 <= x & x <= 0x2AFF -> Utf8Symbol
+    | _ -> error_unsupported_unicode_character n cs
+    end
+    | _ ->
+    begin match unicode with
+    (* utf-8 Hiragana U3040-309F and Katakana U30A0-30FF *)
+    | x when 0x3040 <= x & x <= 0x30FF -> Utf8Letter n
+    (* utf-8 Unified CJK Ideographs U4E00-9FA5 *)
+    | x when 0x4E00 <= x & x <= 0x9FA5 -> Utf8Letter n
+    (* utf-8 Hangul syllables UAC00-D7AF *)
+    | x when 0xAC00 <= x & x <= 0xD7AF -> Utf8Letter n
+    (* utf-8 Gothic U10330-1034A *)
+    | x when 0x10330 <= x & x <= 0x1034A -> Utf8Letter n
+    | _ -> error_unsupported_unicode_character n cs
+    end
+	
+let lookup_utf8 cs =
+  match Stream.peek cs with
+    | Some ('\x00'..'\x7F') -> Some AsciiChar
+    | Some ('\x80'..'\xFF' as c) -> Some (lookup_utf8_tail c cs)
+    | None -> None
+
 let check_special_token str =
   let rec loop_symb = parser
     | [< ' (' ' | '\n' | '\r' | '\t' | '"') >] -> bad_token str
@@ -76,35 +231,19 @@ let check_special_token str =
   loop_symb (Stream.of_string str)
 
 let check_ident str =
-  let first_letter = function
-      (''' | '0'..'9') -> false
-    | _ -> true in
-  let rec loop_id = parser
-    | [< ' ('$' | 'a'..'z' | 'A'..'Z' | '0'..'9' | ''' | '_'); s >] ->
-        loop_id s
-    (* utf-8 Greek letters U0380-03FF *)
-    | [< ' ('\xCE' | '\xCF'); ' ('\x80'..'\xBF'); s >] -> loop_id s
-    | [< ''\xE2'; 'c2; 'c3; s >] ->
-	  (match c2, c3 with
-	      (* utf-8 letter-like U2100-214F *)
-	    | ( ('\x84', '\x80'..'\xBF')
-	      | ('\x85', '\x80'..'\x8F')
-	      (* utf-8 subscript U2080-2089 *) 
-	      | ('\x82', '\x80'..'\x89')) ->
-                loop_id s 
-	      (* utf-8 symbols (see [parse_226_tail]) *)
-	    | (('\x86'..'\x8F' | '\x94'..'\x9B'
-               | '\xA4'..'\xA5' | '\xA8'..'\xAB'),_) ->
-		bad_token str
-	    | _ -> 
-		bad_token str)
-    | [< _ = Stream.empty >] -> ()
-    | [< >] -> bad_token str
+  let rec loop_id intail = parser
+    | [< ' ('$' | 'a'..'z' | 'A'..'Z' | '_'); s >] ->
+        loop_id true s
+    | [< ' ('0'..'9' | ''') when intail; s >] ->
+        loop_id true s
+    | [< s >] ->
+	match lookup_utf8 s with
+	| Some (Utf8Letter n) -> njunk n s; loop_id true s
+	| Some (Utf8IdentPart n) when intail -> njunk n s; loop_id true s
+	| Some _ -> bad_token str
+	| None -> ()
   in
-  if String.length str > 0 && first_letter str.[0] then
-    loop_id (Stream.of_string str)
-  else
-    bad_token str
+  loop_id false (Stream.of_string str)
 
 let check_keyword str =
   try check_special_token str
@@ -145,9 +284,6 @@ let init () =
 
 let _ = init()
 
-(* Errors occuring while lexing (explained as "Lexer error: ...") *)
-let err loc str = Stdpp.raise_with_loc (Util.make_loc loc) (Error str)
-
 (* The string buffering machinery *)
 
 let buff = ref (String.create 80)
@@ -158,36 +294,20 @@ let store len x =
   !buff.[len] <- x;
   succ len
 
-let mstore len s =
-  let rec add_rec len i =
-    if i == String.length s then len else add_rec (store len s.[i]) (succ i)
-  in
-  add_rec len 0
+let rec nstore n len cs =
+  if n>0 then nstore (n-1) (store len (Stream.next cs)) cs else len
 
 let get_buff len = String.sub !buff 0 len
 
-
 (* The classical lexer: idents, numbers, quoted strings, comments *)
 
 let rec ident_tail len = parser
   | [< ' ('a'..'z' | 'A'..'Z' | '0'..'9' | ''' | '_' as c); s >] ->
       ident_tail (store len c) s
-  (* utf-8 Greek letters U0380-03FF *)
-  | [< ' ('\xCE' | '\xCF' as c1); ' ('\x80'..'\xBF' as c2) ; s >] ->
-      ident_tail (store (store len c1) c2) s
   | [< s >] ->
-      match Stream.peek s with
-      | Some '\xE2' -> 
-	  (match List.tl (Stream.npeek 3 s) with
-	    (* utf-8 subscript U2080-2089 *) 
-	  | ['\x82' as c2; ('\x80'..'\x89' as c3)]
-	    (* utf-8 letter-like U2100-214F part 1 *)
-	  | ['\x84' as c2; ('\x80'..'\xBF' as c3)]
-	    (* utf-8 letter-like U2100-214F part 2 *)
-	  | ['\x85' as c2; ('\x80'..'\x8F' as c3)] ->
-	    Stream.junk s; Stream.junk s; Stream.junk s;
-	    ident_tail (store (store (store len '\xE2') c2) c3) s
-	  | _ -> len)
+      match lookup_utf8 s with
+      | Some (Utf8IdentPart n | Utf8Letter n) ->
+	  ident_tail (nstore n len s) s
       | _ -> len
 
 let rec number len = parser
@@ -292,89 +412,61 @@ let rec comment bp = parser bp2
 
 (* Parse a special token, using the [token_tree] *)
 
-let progress_special c = function
-  | None -> None
-  | Some tt -> try Some (CharMap.find c tt.branch) with Not_found -> None
-
-let rec special tt cs = match tt with
-  | None -> None
-  | Some tt ->
-      match
-	match Stream.peek cs with
-	  | Some c -> 
-	      (try Some (CharMap.find c tt.branch) with Not_found -> None)
-	  | None -> None
-      with
-	| Some _ as tt' -> Stream.junk cs; special tt' cs
-	| None -> tt.node
-
+(* Peek as much utf-8 lexemes as possible *)
+(* then look if a special token is obtained *)
+let rec special tt cs =
+  match Stream.peek cs with
+    | Some c -> progress_from_byte 0 tt cs c
+    | None -> tt.node
+
+  (* nr is the number of char peeked; n the number of char in utf8 block *)
+and progress_utf8 nr n c tt cs =
+  try
+    let tt = CharMap.find c tt.branch in
+    let tt =
+      if n=1 then tt else
+	match Stream.npeek (n-nr) cs with
+	| l when List.length l = n-nr ->
+	    let l = Util.list_skipn (1-nr) l in
+	    List.iter (check_utf8_trailing_byte cs) l;
+	    List.fold_left (fun tt c -> CharMap.find c tt.branch) tt l
+	| _ -> 
+	    error_utf8 cs
+    in
+    for i=1 to n-nr do Stream.junk cs done;
+    special tt cs
+  with Not_found ->
+    tt.node
+
+and progress_from_byte nr tt cs = function
+  (* Utf8 leading byte *)
+  | '\x00'..'\x7F' as c -> progress_utf8 nr 1 c tt cs
+  | '\xC0'..'\xDF' as c -> progress_utf8 nr 2 c tt cs
+  | '\xE0'..'\xEF' as c -> progress_utf8 nr 3 c tt cs
+  | '\xF0'..'\xF7' as c -> progress_utf8 nr 4 c tt cs
+  | _ (* '\x80'..\xBF'|'\xF8'..'\xFF' *) ->
+      error_utf8 cs
+
+(* Must be a special token *)
 let process_chars bp c cs =
-  let t =
-    try special (Some (CharMap.find c !token_tree.branch)) cs
-    with Not_found -> !token_tree.node
-  in
+  let t = progress_from_byte 1 !token_tree cs c in
   let ep = Stream.count cs in
   match t with
     | Some t -> (("", t), (bp, ep))
     | None -> err (bp, ep) Undefined_token
 
-type token_226_tail =
-  | TokSymbol of string option
-  | TokIdent of string
-
-(* 1110xxxx 10yyyyzz 10zztttt utf-8 codes for xxxx=0010 *)
-let parse_226_tail tk = parser
-  | [< ''\x82' as c2; ' ('\x80'..'\x89' as c3);
-      (* utf-8 subscript U2080-2089 *) 
-      len = ident_tail (store (store (store 0 '\xE2') c2) c3) >] ->
-      TokIdent (get_buff len) 
-  | [< ''\x84' as c2; ' ('\x80'..'\xBF' as c3);
-      (* utf-8 letter-like U2100-214F part 1 *)
-      len = ident_tail (store (store (store 0 '\xE2') c2) c3) >] ->
-      TokIdent (get_buff len) 
-  | [< ''\x85' as c2; ' ('\x80'..'\x8F' as c3);
-      (* utf-8 letter-like U2100-214F part 2 *)
-      len = ident_tail (store (store (store 0 '\xE2') c2) c3) >] ->
-      TokIdent (get_buff len) 
-  | [< ' ('\x86'..'\x8F' | '\x94'..'\x9B' | '\xA4'..'\xA5' 
-         | '\xA8'..'\xAB' as c2); 'c3;
-      (* utf-8 arrows A U2190-21FF *)
-      (* utf-8 mathematical operators U2200-22FF *)
-      (* utf-8 miscellaneous technical U2300-23FF *)
-      (* utf-8 box drawing U2500-257F has ceiling, etc. *)
-      (* utf-8 block elements U2580-259F *)
-      (* utf-8 geom. shapes U25A0-25FF (has triangles, losange, etc) *)
-      (* utf-8 miscellaneous symbols U2600-26FF *)
-      (* utf-8 arrows B U2900-297F *)
-      (* utf-8 mathematical operators U2A00-2AFF *)
-      t = special (progress_special c3 (progress_special c2 
-	(progress_special '\xE2' tk))) >] ->
-      TokSymbol t
-  | [< '_; '_ >] ->
-      (* Unsupported utf-8 code *)
-      TokSymbol None
-
 (* Parse what follows a dot *)
 let parse_after_dot bp c = parser
-  | [< ' ('a'..'z' | 'A'..'Z' | '_' as c);
-    len = ident_tail (store 0 c) >] ->
-      ("FIELD", get_buff len)
-  (* utf-8 Greek letters U0380-03FF *)
-  | [< ' ('\xCE' | '\xCF' as c1); ' ('\x80'..'\xBF' as c2); 
-    len = ident_tail (store (store 0 c1) c2) >] ->
+  | [< ' ('a'..'z' | 'A'..'Z' | '_' as c); len = ident_tail (store 0 c) >] ->
       ("FIELD", get_buff len)
-  (* utf-8 mathematical symbols have format E2 xx xx [E2=226] *)
-  | [< ''\xE2'; t = parse_226_tail 
-      (progress_special '.' (Some !token_tree)) >] ep ->
-      (match t with
-	| TokSymbol (Some t) -> ("", t)
-	| TokSymbol None -> err (bp, ep) Undefined_token
-	| TokIdent t -> ("FIELD", t))
-  | [< (t,_) = process_chars bp c >] -> t
-
+  | [< s >] ->
+      match lookup_utf8 s with
+      | Some (Utf8Letter n) -> 
+	  ("FIELD", get_buff (ident_tail (nstore n 0 s) s))
+      | Some (Utf8IdentPart _ | AsciiChar | Utf8Symbol) | None -> 
+	  fst (process_chars bp c s)
 
 (* Parse a token in a char stream *)
-
 let rec next_token = parser bp
   | [< '' ' | '\t' | '\n' |'\r' as c; s >] ->
       comm_loc bp; push_char c; next_token s
@@ -383,27 +475,13 @@ let rec next_token = parser bp
       (("METAIDENT", get_buff len), (bp,ep))
   | [< ''.' as c; t = parse_after_dot bp c >] ep ->
       comment_stop bp;
+      if Options.do_translate() & t=("",".") then between_com := true;
       (t, (bp,ep))
   | [< ' ('a'..'z' | 'A'..'Z' | '_' as c);
        len = ident_tail (store 0 c) >] ep ->
       let id = get_buff len in 
       comment_stop bp;
       (try ("", find_keyword id) with Not_found -> ("IDENT", id)), (bp, ep)
-  (* utf-8 Greek letters U0380-03FF [CE80-CEBF and CF80-CFBF] *)
-  | [< ' ('\xCE' | '\xCF' as c1); ' ('\x80'..'\xBF' as c2); 
-	   len = ident_tail (store (store 0 c1) c2) >] ep ->
-      let id = get_buff len in
-      comment_stop bp;
-      (try ("", find_keyword id) with Not_found -> ("IDENT", id)), (bp, ep)
-  (* utf-8 mathematical symbols have format E2 xx xx [E2=226] *)
-  | [< ''\xE2'; t = parse_226_tail (Some !token_tree) >] ep ->
-      comment_stop bp;
-      (match t with
-	| TokSymbol (Some t) -> ("", t), (bp, ep)
-	| TokSymbol None -> err (bp, ep) Undefined_token
-	| TokIdent id ->
-	    (try ("", find_keyword id) with Not_found -> ("IDENT", id)),
-	    (bp, ep))
   | [< ' ('0'..'9' as c); len = number (store 0 c) >] ep ->
       comment_stop bp;
       (("INT", get_buff len), (bp, ep))
@@ -419,8 +497,19 @@ let rec next_token = parser bp
 	    next_token s
         | [< t = process_chars bp c >] -> comment_stop bp; t >] ->
       t
-  | [< 'c; t = process_chars bp c >] -> comment_stop bp; t
-  | [< _ = Stream.empty >] -> comment_stop bp; (("EOI", ""), (bp, bp + 1))
+  | [< s >] ->
+      match lookup_utf8 s with
+	| Some (Utf8Letter n) ->
+	    let len = ident_tail (nstore n 0 s) s in
+	    let id = get_buff len in
+	    let ep = Stream.count s in
+	    comment_stop bp;
+	    (try ("",find_keyword id) with Not_found -> ("IDENT",id)), (bp, ep)
+	| Some (Utf8Symbol | AsciiChar | Utf8IdentPart _) -> 
+	    let t = process_chars bp (Stream.next s) s in
+	    comment_stop bp; t
+	| None ->
+	    comment_stop bp; (("EOI", ""), (bp, bp + 1))
 
 (* Location table system for creating tables associating a token count
    to its location in a char stream (the source) *)
@@ -461,10 +550,10 @@ let func cs =
     Stream.from
       (fun i ->
          let (tok, loc) = next_token cs in
-	 loct_add loct i loc; Some tok)
+	   loct_add loct i loc; Some tok)
   in
-  current_location_table := loct;
-  (ts, loct_func loct)
+    current_location_table := loct;
+    (ts, loct_func loct)
 
 type location_table = (int * int) option array array ref
 let location_table () = !current_location_table