From a0cfa4f118023d35b767a999d5a2ac4b082857b4 Mon Sep 17 00:00:00 2001 From: Samuel Mimram Date: Fri, 25 Jul 2008 15:12:53 +0200 Subject: Imported Upstream version 8.2~beta3+dfsg --- parsing/lexer.ml4 | 172 +++++++++++++----------------------------------------- 1 file changed, 42 insertions(+), 130 deletions(-) (limited to 'parsing/lexer.ml4') diff --git a/parsing/lexer.ml4 b/parsing/lexer.ml4 index 80eaf7f0..c1e4cfc6 100644 --- a/parsing/lexer.ml4 +++ b/parsing/lexer.ml4 @@ -6,9 +6,15 @@ (* * GNU Lesser General Public License Version 2.1 *) (************************************************************************) -(*i $Id: lexer.ml4 9015 2006-07-05 17:19:22Z herbelin $ i*) +(*i $Id: lexer.ml4 11059 2008-06-06 09:29:20Z herbelin $ i*) + + +(*i camlp4use: "pr_o.cmo" i*) +(* Add pr_o.cmo to circumvent a useless-warning bug when preprocessed with + * ast-based camlp4 *) open Pp +open Util open Token (* Dictionaries: trees annotated with string options, each node being a map @@ -71,8 +77,10 @@ let bad_token str = raise (Error (Bad_token str)) (* Lexer conventions on tokens *) -type utf8_token = - Utf8Letter of int | Utf8IdentPart of int | Utf8Symbol | AsciiChar +type token_kind = + | Utf8Token of (utf8_status * int) + | AsciiChar + | EmptyStream let error_unsupported_unicode_character n cs = let bp = Stream.count cs in @@ -80,6 +88,7 @@ let error_unsupported_unicode_character n cs = let error_utf8 cs = let bp = Stream.count cs in + Stream.junk cs; (* consume the char to avoid read it and fail again *) err (bp, bp+1) Illegal_character let njunk n = Util.repeat n Stream.junk @@ -115,114 +124,14 @@ let lookup_utf8_tail c cs = (Char.code c3 land 0x3F) lsl 6 + (Char.code c4 land 0x3F) | _ -> error_utf8 cs in - match unicode land 0x1F000 with - | 0x0 -> - begin match unicode with - (* utf-8 Latin-1 non breaking space U00A0 *) - | 0x00A0 -> Utf8Letter n - (* utf-8 Latin-1 symbols U00A1-00BF *) - | x when 0x00A0 <= x & x <= 0x00BF -> Utf8Symbol - (* utf-8 Latin-1 letters U00C0-00D6 *) - | x when 0x00C0 <= x & x <= 0x00D6 -> Utf8Letter n - (* utf-8 Latin-1 symbol U00D7 *) - | 0x00D7 -> Utf8Symbol - (* utf-8 Latin-1 letters U00D8-00F6 *) - | x when 0x00D8 <= x & x <= 0x00F6 -> Utf8Letter n - (* utf-8 Latin-1 symbol U00F7 *) - | 0x00F7 -> Utf8Symbol - (* utf-8 Latin-1 letters U00F8-00FF *) - | x when 0x00F8 <= x & x <= 0x00FF -> Utf8Letter n - (* utf-8 Latin Extended A U0100-017F and Latin Extended B U0180-U0241 *) - | x when 0x0100 <= x & x <= 0x0241 -> Utf8Letter n - (* utf-8 Phonetic letters U0250-02AF *) - | x when 0x0250 <= x & x <= 0x02AF -> Utf8Letter n - (* utf-8 what do to with diacritics U0300-U036F ? *) - (* utf-8 Greek letters U0380-03FF *) - | x when 0x0380 <= x & x <= 0x03FF -> Utf8Letter n - (* utf-8 Cyrillic letters U0400-0481 *) - | x when 0x0400 <= x & x <= 0x0481 -> Utf8Letter n - (* utf-8 Cyrillic symbol U0482 *) - | 0x0482 -> Utf8Symbol - (* utf-8 what do to with diacritics U0483-U0489 \ U0487 ? *) - (* utf-8 Cyrillic letters U048A-U4F9 (Warning: 04CF) *) - | x when 0x048A <= x & x <= 0x04F9 -> Utf8Letter n - (* utf-8 Cyrillic supplement letters U0500-U050F *) - | x when 0x0500 <= x & x <= 0x050F -> Utf8Letter n - (* utf-8 Hebrew letters U05D0-05EA *) - | x when 0x05D0 <= x & x <= 0x05EA -> Utf8Letter n - (* utf-8 Arabic letters U0621-064A *) - | x when 0x0621 <= x & x <= 0x064A -> Utf8Letter n - (* utf-8 Arabic supplement letters U0750-076D *) - | x when 0x0750 <= x & x <= 0x076D -> Utf8Letter n - | _ -> error_unsupported_unicode_character n cs - end - | 0x1000 -> - begin match unicode with - (* utf-8 Georgian U10A0-10FF (has holes) *) - | x when 0x10A0 <= x & x <= 0x10FF -> Utf8Letter n - (* utf-8 Hangul Jamo U1100-11FF (has holes) *) - | x when 0x1100 <= x & x <= 0x11FF -> Utf8Letter n - (* utf-8 Latin additional letters U1E00-1E9B and U1EA0-1EF9 *) - | x when 0x1E00 <= x & x <= 0x1E9B -> Utf8Letter n - | x when 0x1EA0 <= x & x <= 0x1EF9 -> Utf8Letter n - | _ -> error_unsupported_unicode_character n cs - end - | 0x2000 -> - begin match unicode with - (* utf-8 general punctuation U2080-2089 *) - (* Hyphens *) - | x when 0x2010 <= x & x <= 0x2011 -> Utf8Letter n - (* Dashes and other symbols *) - | x when 0x2012 <= x & x <= 0x2027 -> Utf8Symbol - (* Per mille and per ten thousand signs *) - | x when 0x2030 <= x & x <= 0x2031 -> Utf8Symbol - (* Prime letters *) - | x when 0x2032 <= x & x <= 0x2034 or x = 0x2057 -> Utf8IdentPart n - (* Miscellaneous punctuation *) - | x when 0x2039 <= x & x <= 0x2056 -> Utf8Symbol - | x when 0x2058 <= x & x <= 0x205E -> Utf8Symbol - (* Invisible mathematical operators *) - | x when 0x2061 <= x & x <= 0x2063 -> Utf8Symbol - - (* utf-8 subscript U2080-2089 *) - | x when 0x2080 <= x & x <= 0x2089 -> Utf8IdentPart n - (* utf-8 letter-like U2100-214F *) - | x when 0x2100 <= x & x <= 0x214F -> Utf8Letter n - (* utf-8 number-forms U2153-2183 *) - | x when 0x2153 <= x & x <= 0x2183 -> Utf8Symbol - (* utf-8 arrows A U2190-21FF *) - (* utf-8 mathematical operators U2200-22FF *) - (* utf-8 miscellaneous technical U2300-23FF *) - | x when 0x2190 <= x & x <= 0x23FF -> Utf8Symbol - (* utf-8 box drawing U2500-257F has ceiling, etc. *) - (* utf-8 block elements U2580-259F *) - (* utf-8 geom. shapes U25A0-25FF (has triangles, losange, etc) *) - (* utf-8 miscellaneous symbols U2600-26FF *) - | x when 0x2500 <= x & x <= 0x26FF -> Utf8Symbol - (* utf-8 arrows B U2900-297F *) - | x when 0x2900 <= x & x <= 0x297F -> Utf8Symbol - (* utf-8 mathematical operators U2A00-2AFF *) - | x when 0x2A00 <= x & x <= 0x2AFF -> Utf8Symbol - | _ -> error_unsupported_unicode_character n cs - end - | _ -> - begin match unicode with - (* utf-8 Hiragana U3040-309F and Katakana U30A0-30FF *) - | x when 0x3040 <= x & x <= 0x30FF -> Utf8Letter n - (* utf-8 Unified CJK Ideographs U4E00-9FA5 *) - | x when 0x4E00 <= x & x <= 0x9FA5 -> Utf8Letter n - (* utf-8 Hangul syllables UAC00-D7AF *) - | x when 0xAC00 <= x & x <= 0xD7AF -> Utf8Letter n - (* utf-8 Gothic U10330-1034A *) - | x when 0x10330 <= x & x <= 0x1034A -> Utf8Letter n - | _ -> error_unsupported_unicode_character n cs - end + try classify_unicode unicode, n + with UnsupportedUtf8 -> error_unsupported_unicode_character n cs let lookup_utf8 cs = match Stream.peek cs with - | Some ('\x00'..'\x7F') -> Some AsciiChar - | Some ('\x80'..'\xFF' as c) -> Some (lookup_utf8_tail c cs) - | None -> None + | Some ('\x00'..'\x7F') -> AsciiChar + | Some ('\x80'..'\xFF' as c) -> Utf8Token (lookup_utf8_tail c cs) + | None -> EmptyStream let check_special_token str = let rec loop_symb = parser @@ -234,16 +143,16 @@ let check_special_token str = let check_ident str = let rec loop_id intail = parser - | [< ' ('$' | 'a'..'z' | 'A'..'Z' | '_'); s >] -> + | [< ' ('a'..'z' | 'A'..'Z' | '_'); s >] -> loop_id true s | [< ' ('0'..'9' | ''') when intail; s >] -> loop_id true s | [< s >] -> match lookup_utf8 s with - | Some (Utf8Letter n) -> njunk n s; loop_id true s - | Some (Utf8IdentPart n) when intail -> njunk n s; loop_id true s - | Some _ -> bad_token str - | None -> () + | Utf8Token (UnicodeLetter, n) -> njunk n s; loop_id true s + | Utf8Token (UnicodeIdentPart, n) when intail -> njunk n s; loop_id true s + | EmptyStream -> () + | Utf8Token _ | AsciiChar -> bad_token str in loop_id false (Stream.of_string str) @@ -266,7 +175,7 @@ let add_keyword str = (* Adding a new token (keyword or special token). *) let add_token (con, str) = match con with | "" -> add_keyword str - | "METAIDENT" | "IDENT" | "FIELD" | "INT" | "STRING" | "EOI" + | "METAIDENT" | "PATTERNIDENT" | "IDENT" | "FIELD" | "INT" | "STRING" | "EOI" -> () | _ -> raise (Token.Error ("\ @@ -308,7 +217,7 @@ let rec ident_tail len = parser ident_tail (store len c) s | [< s >] -> match lookup_utf8 s with - | Some (Utf8IdentPart n | Utf8Letter n) -> + | Utf8Token ((UnicodeIdentPart | UnicodeLetter), n) -> ident_tail (nstore n len s) s | _ -> len @@ -368,10 +277,10 @@ let null_comment s = let comment_stop ep = let current_s = Buffer.contents current in - if !Options.xml_export && Buffer.length current > 0 && + if !Flags.xml_export && Buffer.length current > 0 && (!between_com || not(null_comment current_s)) then !xml_output_comment current_s; - (if Options.do_translate() && Buffer.length current > 0 && + (if Flags.do_translate() && Buffer.length current > 0 && (!between_com || not(null_comment current_s)) then let bp = match !comment_begin with Some bp -> bp @@ -406,7 +315,7 @@ let rec comment bp = parser bp2 | [< '')' >] -> push_string "*)"; | [< s >] -> real_push_char '*'; comment bp s >] -> () | [< ''"'; s >] -> - if Options.do_translate() then (push_string"\"";comm_string bp2 s) + if Flags.do_translate() then (push_string"\"";comm_string bp2 s) else ignore (string bp2 0 s); comment bp s | [< _ = Stream.empty >] ep -> err (bp, ep) Unterminated_comment @@ -457,27 +366,30 @@ let process_chars bp c cs = | Some t -> (("", t), (bp, ep)) | None -> err (bp, ep) Undefined_token -(* Parse what follows a dot *) -let parse_after_dot bp c = parser +(* Parse what follows a dot/question mark *) +let parse_after_dot bp c = + let constructor = if c = '?' then "PATTERNIDENT" else "FIELD" in + parser | [< ' ('a'..'z' | 'A'..'Z' | '_' as c); len = ident_tail (store 0 c) >] -> - ("FIELD", get_buff len) + (constructor, get_buff len) | [< s >] -> match lookup_utf8 s with - | Some (Utf8Letter n) -> - ("FIELD", get_buff (ident_tail (nstore n 0 s) s)) - | Some (Utf8IdentPart _ | AsciiChar | Utf8Symbol) | None -> + | Utf8Token (UnicodeLetter, n) -> + (constructor, get_buff (ident_tail (nstore n 0 s) s)) + | AsciiChar | Utf8Token _ | EmptyStream -> fst (process_chars bp c s) (* Parse a token in a char stream *) let rec next_token = parser bp | [< '' ' | '\t' | '\n' |'\r' as c; s >] -> comm_loc bp; push_char c; next_token s - | [< ''$'; len = ident_tail (store 0 '$') >] ep -> + | [< ''$'; ' ('a'..'z' | 'A'..'Z' | '_' as c); + len = ident_tail (store 0 c) >] ep -> comment_stop bp; (("METAIDENT", get_buff len), (bp,ep)) - | [< ''.' as c; t = parse_after_dot bp c >] ep -> + | [< ' ('.' | '?') as c; t = parse_after_dot bp c >] ep -> comment_stop bp; - if Options.do_translate() & t=("",".") then between_com := true; + if Flags.do_translate() & t=("",".") then between_com := true; (t, (bp,ep)) | [< ' ('a'..'z' | 'A'..'Z' | '_' as c); len = ident_tail (store 0 c) >] ep -> @@ -501,16 +413,16 @@ let rec next_token = parser bp t | [< s >] -> match lookup_utf8 s with - | Some (Utf8Letter n) -> + | Utf8Token (UnicodeLetter, n) -> let len = ident_tail (nstore n 0 s) s in let id = get_buff len in let ep = Stream.count s in comment_stop bp; (try ("",find_keyword id) with Not_found -> ("IDENT",id)), (bp, ep) - | Some (Utf8Symbol | AsciiChar | Utf8IdentPart _) -> + | AsciiChar | Utf8Token ((UnicodeSymbol | UnicodeIdentPart), _) -> let t = process_chars bp (Stream.next s) s in comment_stop bp; t - | None -> + | EmptyStream -> comment_stop bp; (("EOI", ""), (bp, bp + 1)) (* Location table system for creating tables associating a token count -- cgit v1.2.3