1 files changed, 42 insertions, 130 deletions
diff --git a/parsing/lexer.ml4 b/parsing/lexer.ml4
index 80eaf7f0..c1e4cfc6 100644
--- a/parsing/lexer.ml4
+++ b/parsing/lexer.ml4
@@ -6,9 +6,15 @@
 (*         *       GNU Lesser General Public License Version 2.1        *)
 (************************************************************************)
 
-(*i $Id: lexer.ml4 9015 2006-07-05 17:19:22Z herbelin $ i*)
+(*i $Id: lexer.ml4 11059 2008-06-06 09:29:20Z herbelin $ i*)
+
+
+(*i camlp4use: "pr_o.cmo" i*) 
+(* Add pr_o.cmo to circumvent a useless-warning bug when preprocessed with
+ * ast-based camlp4 *)
 
 open Pp
+open Util
 open Token
 
 (* Dictionaries: trees annotated with string options, each node being a map
@@ -71,8 +77,10 @@ let bad_token str = raise (Error (Bad_token str))
 
 (* Lexer conventions on tokens *)
 
-type utf8_token =
-    Utf8Letter of int | Utf8IdentPart of int | Utf8Symbol | AsciiChar
+type token_kind =
+  | Utf8Token of (utf8_status * int)
+  | AsciiChar
+  | EmptyStream
 
 let error_unsupported_unicode_character n cs =
   let bp = Stream.count cs in
@@ -80,6 +88,7 @@ let error_unsupported_unicode_character n cs =
 
 let error_utf8 cs =
   let bp = Stream.count cs in
+  Stream.junk cs; (* consume the char to avoid read it and fail again *)
   err (bp, bp+1) Illegal_character
 
 let njunk n = Util.repeat n Stream.junk
@@ -115,114 +124,14 @@ let lookup_utf8_tail c cs =
 	  (Char.code c3 land 0x3F) lsl 6 + (Char.code c4 land 0x3F)
       | _ -> error_utf8 cs
     in
-    match unicode land 0x1F000 with
-    | 0x0 ->
-    begin match unicode with
-    (* utf-8 Latin-1 non breaking space U00A0 *)
-    | 0x00A0 -> Utf8Letter n
-    (* utf-8 Latin-1 symbols U00A1-00BF *)
-    | x when 0x00A0 <= x & x <= 0x00BF -> Utf8Symbol
-    (* utf-8 Latin-1 letters U00C0-00D6 *)
-    | x when 0x00C0 <= x & x <= 0x00D6 -> Utf8Letter n
-    (* utf-8 Latin-1 symbol U00D7 *)
-    | 0x00D7 -> Utf8Symbol
-    (* utf-8 Latin-1 letters U00D8-00F6 *)
-    | x when 0x00D8 <= x & x <= 0x00F6 -> Utf8Letter n
-    (* utf-8 Latin-1 symbol U00F7 *)
-    | 0x00F7 -> Utf8Symbol
-    (* utf-8 Latin-1 letters U00F8-00FF *)
-    | x when 0x00F8 <= x & x <= 0x00FF -> Utf8Letter n
-    (* utf-8 Latin Extended A U0100-017F and Latin Extended B U0180-U0241 *)
-    | x when 0x0100 <= x & x <= 0x0241 -> Utf8Letter n
-    (* utf-8 Phonetic letters U0250-02AF *)
-    | x when 0x0250 <= x & x <= 0x02AF -> Utf8Letter n
-    (* utf-8 what do to with diacritics U0300-U036F ? *)
-    (* utf-8 Greek letters U0380-03FF *)
-    | x when 0x0380 <= x & x <= 0x03FF -> Utf8Letter n
-    (* utf-8 Cyrillic letters U0400-0481 *)
-    | x when 0x0400 <= x & x <= 0x0481 -> Utf8Letter n
-    (* utf-8 Cyrillic symbol U0482 *)
-    | 0x0482 -> Utf8Symbol
-    (* utf-8 what do to with diacritics U0483-U0489 \ U0487 ? *)
-    (* utf-8 Cyrillic letters U048A-U4F9 (Warning: 04CF) *)
-    | x when 0x048A <= x & x <= 0x04F9 -> Utf8Letter n
-    (* utf-8 Cyrillic supplement letters U0500-U050F *)
-    | x when 0x0500 <= x & x <= 0x050F -> Utf8Letter n
-    (* utf-8 Hebrew letters U05D0-05EA *)
-    | x when 0x05D0 <= x & x <= 0x05EA -> Utf8Letter n
-    (* utf-8 Arabic letters U0621-064A *)
-    | x when 0x0621 <= x & x <= 0x064A -> Utf8Letter n
-    (* utf-8 Arabic supplement letters U0750-076D *)
-    | x when 0x0750 <= x & x <= 0x076D -> Utf8Letter n
-    | _ -> error_unsupported_unicode_character n cs
-    end
-    | 0x1000 ->
-    begin match unicode with
-    (* utf-8 Georgian U10A0-10FF (has holes) *)
-    | x when 0x10A0 <= x & x <= 0x10FF -> Utf8Letter n
-    (* utf-8 Hangul Jamo U1100-11FF (has holes) *)
-    | x when 0x1100 <= x & x <= 0x11FF -> Utf8Letter n
-    (* utf-8 Latin additional letters U1E00-1E9B and U1EA0-1EF9 *)
-    | x when 0x1E00 <= x & x <= 0x1E9B -> Utf8Letter n
-    | x when 0x1EA0 <= x & x <= 0x1EF9 -> Utf8Letter n
-    | _ -> error_unsupported_unicode_character n cs
-    end
-    | 0x2000 ->
-    begin match unicode with
-    (* utf-8 general punctuation U2080-2089 *)
-    (* Hyphens *)
-    | x when 0x2010 <= x & x <= 0x2011 -> Utf8Letter n 
-    (* Dashes and other symbols *)
-    | x when 0x2012 <= x & x <= 0x2027 -> Utf8Symbol
-    (* Per mille and per ten thousand signs *)
-    | x when 0x2030 <= x & x <= 0x2031 -> Utf8Symbol
-    (* Prime letters *)
-    | x when 0x2032 <= x & x <= 0x2034 or x = 0x2057 -> Utf8IdentPart n
-    (* Miscellaneous punctuation *)
-    | x when 0x2039 <= x & x <= 0x2056 -> Utf8Symbol
-    | x when 0x2058 <= x & x <= 0x205E -> Utf8Symbol
-    (* Invisible mathematical operators *)
-    | x when 0x2061 <= x & x <= 0x2063 -> Utf8Symbol
-
-    (* utf-8 subscript U2080-2089 *) 
-    | x when 0x2080 <= x & x <= 0x2089 -> Utf8IdentPart n
-    (* utf-8 letter-like U2100-214F *)
-    | x when 0x2100 <= x & x <= 0x214F -> Utf8Letter n
-    (* utf-8 number-forms U2153-2183 *)
-    | x when 0x2153 <= x & x <= 0x2183 -> Utf8Symbol
-    (* utf-8 arrows A U2190-21FF *)
-    (* utf-8 mathematical operators U2200-22FF *)
-    (* utf-8 miscellaneous technical U2300-23FF *)
-    | x when 0x2190 <= x & x <= 0x23FF -> Utf8Symbol
-    (* utf-8 box drawing U2500-257F has ceiling, etc. *)
-    (* utf-8 block elements U2580-259F *)
-    (* utf-8 geom. shapes U25A0-25FF (has triangles, losange, etc) *)
-    (* utf-8 miscellaneous symbols U2600-26FF *)
-    | x when 0x2500 <= x & x <= 0x26FF -> Utf8Symbol
-    (* utf-8 arrows B U2900-297F *)
-    | x when 0x2900 <= x & x <= 0x297F -> Utf8Symbol
-    (* utf-8 mathematical operators U2A00-2AFF *)
-    | x when 0x2A00 <= x & x <= 0x2AFF -> Utf8Symbol
-    | _ -> error_unsupported_unicode_character n cs
-    end
-    | _ ->
-    begin match unicode with
-    (* utf-8 Hiragana U3040-309F and Katakana U30A0-30FF *)
-    | x when 0x3040 <= x & x <= 0x30FF -> Utf8Letter n
-    (* utf-8 Unified CJK Ideographs U4E00-9FA5 *)
-    | x when 0x4E00 <= x & x <= 0x9FA5 -> Utf8Letter n
-    (* utf-8 Hangul syllables UAC00-D7AF *)
-    | x when 0xAC00 <= x & x <= 0xD7AF -> Utf8Letter n
-    (* utf-8 Gothic U10330-1034A *)
-    | x when 0x10330 <= x & x <= 0x1034A -> Utf8Letter n
-    | _ -> error_unsupported_unicode_character n cs
-    end
+    try classify_unicode unicode, n
+    with UnsupportedUtf8 -> error_unsupported_unicode_character n cs
 	
 let lookup_utf8 cs =
   match Stream.peek cs with
-    | Some ('\x00'..'\x7F') -> Some AsciiChar
-    | Some ('\x80'..'\xFF' as c) -> Some (lookup_utf8_tail c cs)
-    | None -> None
+    | Some ('\x00'..'\x7F') -> AsciiChar
+    | Some ('\x80'..'\xFF' as c) -> Utf8Token (lookup_utf8_tail c cs)
+    | None -> EmptyStream
 
 let check_special_token str =
   let rec loop_symb = parser
@@ -234,16 +143,16 @@ let check_special_token str =
 
 let check_ident str =
   let rec loop_id intail = parser
-    | [< ' ('$' | 'a'..'z' | 'A'..'Z' | '_'); s >] ->
+    | [< ' ('a'..'z' | 'A'..'Z' | '_'); s >] ->
         loop_id true s
     | [< ' ('0'..'9' | ''') when intail; s >] ->
         loop_id true s
     | [< s >] ->
 	match lookup_utf8 s with
-	| Some (Utf8Letter n) -> njunk n s; loop_id true s
-	| Some (Utf8IdentPart n) when intail -> njunk n s; loop_id true s
-	| Some _ -> bad_token str
-	| None -> ()
+	| Utf8Token (UnicodeLetter, n) -> njunk n s; loop_id true s
+	| Utf8Token (UnicodeIdentPart, n) when intail -> njunk n s; loop_id true s
+	| EmptyStream -> ()
+	| Utf8Token _ | AsciiChar -> bad_token str
   in
   loop_id false (Stream.of_string str)
 
@@ -266,7 +175,7 @@ let add_keyword str =
 (* Adding a new token (keyword or special token). *)
 let add_token (con, str) = match con with
   | "" -> add_keyword str
-  | "METAIDENT" | "IDENT" | "FIELD" | "INT" | "STRING" | "EOI"
+  | "METAIDENT" | "PATTERNIDENT" | "IDENT" | "FIELD" | "INT" | "STRING" | "EOI"
       -> ()
   | _ ->
       raise (Token.Error ("\
@@ -308,7 +217,7 @@ let rec ident_tail len = parser
       ident_tail (store len c) s
   | [< s >] ->
       match lookup_utf8 s with
-      | Some (Utf8IdentPart n | Utf8Letter n) ->
+      | Utf8Token ((UnicodeIdentPart | UnicodeLetter), n) ->
 	  ident_tail (nstore n len s) s
       | _ -> len
 
@@ -368,10 +277,10 @@ let null_comment s =
 
 let comment_stop ep =
   let current_s = Buffer.contents current in
-  if !Options.xml_export && Buffer.length current > 0 &&
+  if !Flags.xml_export && Buffer.length current > 0 &&
     (!between_com || not(null_comment current_s)) then
       !xml_output_comment current_s;
-  (if Options.do_translate() && Buffer.length current > 0 &&
+  (if Flags.do_translate() && Buffer.length current > 0 &&
     (!between_com || not(null_comment current_s)) then
     let bp = match !comment_begin with
         Some bp -> bp
@@ -406,7 +315,7 @@ let rec comment bp = parser bp2
          | [< '')' >] -> push_string "*)";
          | [< s >] -> real_push_char '*'; comment bp s >] -> ()
   | [< ''"'; s >] ->
-      if Options.do_translate() then (push_string"\"";comm_string bp2 s)
+      if Flags.do_translate() then (push_string"\"";comm_string bp2 s)
       else ignore (string bp2 0 s);
       comment bp s
   | [< _ = Stream.empty >] ep -> err (bp, ep) Unterminated_comment
@@ -457,27 +366,30 @@ let process_chars bp c cs =
     | Some t -> (("", t), (bp, ep))
     | None -> err (bp, ep) Undefined_token
 
-(* Parse what follows a dot *)
-let parse_after_dot bp c = parser
+(* Parse what follows a dot/question mark *)
+let parse_after_dot bp c =
+  let constructor = if c = '?' then "PATTERNIDENT" else "FIELD" in
+  parser
   | [< ' ('a'..'z' | 'A'..'Z' | '_' as c); len = ident_tail (store 0 c) >] ->
-      ("FIELD", get_buff len)
+      (constructor, get_buff len)
   | [< s >] ->
       match lookup_utf8 s with
-      | Some (Utf8Letter n) -> 
-	  ("FIELD", get_buff (ident_tail (nstore n 0 s) s))
-      | Some (Utf8IdentPart _ | AsciiChar | Utf8Symbol) | None -> 
+      | Utf8Token (UnicodeLetter, n) -> 
+	  (constructor, get_buff (ident_tail (nstore n 0 s) s))
+      | AsciiChar | Utf8Token _ | EmptyStream -> 
 	  fst (process_chars bp c s)
 
 (* Parse a token in a char stream *)
 let rec next_token = parser bp
   | [< '' ' | '\t' | '\n' |'\r' as c; s >] ->
       comm_loc bp; push_char c; next_token s
-  | [< ''$'; len = ident_tail (store 0 '$') >] ep -> 
+  | [< ''$'; ' ('a'..'z' | 'A'..'Z' | '_' as c); 
+      len = ident_tail (store 0 c) >] ep -> 
       comment_stop bp;
       (("METAIDENT", get_buff len), (bp,ep))
-  | [< ''.' as c; t = parse_after_dot bp c >] ep ->
+  | [< ' ('.' | '?') as c; t = parse_after_dot bp c >] ep ->
       comment_stop bp;
-      if Options.do_translate() & t=("",".") then between_com := true;
+      if Flags.do_translate() & t=("",".") then between_com := true;
       (t, (bp,ep))
   | [< ' ('a'..'z' | 'A'..'Z' | '_' as c);
        len = ident_tail (store 0 c) >] ep ->
@@ -501,16 +413,16 @@ let rec next_token = parser bp
       t
   | [< s >] ->
       match lookup_utf8 s with
-	| Some (Utf8Letter n) ->
+	| Utf8Token (UnicodeLetter, n) ->
 	    let len = ident_tail (nstore n 0 s) s in
 	    let id = get_buff len in
 	    let ep = Stream.count s in
 	    comment_stop bp;
 	    (try ("",find_keyword id) with Not_found -> ("IDENT",id)), (bp, ep)
-	| Some (Utf8Symbol | AsciiChar | Utf8IdentPart _) -> 
+	| AsciiChar | Utf8Token ((UnicodeSymbol | UnicodeIdentPart), _) -> 
 	    let t = process_chars bp (Stream.next s) s in
 	    comment_stop bp; t
-	| None ->
+	| EmptyStream ->
 	    comment_stop bp; (("EOI", ""), (bp, bp + 1))
 
 (* Location table system for creating tables associating a token count