summaryrefslogtreecommitdiff
path: root/parsing/lexer.ml4
diff options
context:
space:
mode:
Diffstat (limited to 'parsing/lexer.ml4')
-rw-r--r--parsing/lexer.ml4172
1 files changed, 42 insertions, 130 deletions
diff --git a/parsing/lexer.ml4 b/parsing/lexer.ml4
index 80eaf7f0..c1e4cfc6 100644
--- a/parsing/lexer.ml4
+++ b/parsing/lexer.ml4
@@ -6,9 +6,15 @@
(* * GNU Lesser General Public License Version 2.1 *)
(************************************************************************)
-(*i $Id: lexer.ml4 9015 2006-07-05 17:19:22Z herbelin $ i*)
+(*i $Id: lexer.ml4 11059 2008-06-06 09:29:20Z herbelin $ i*)
+
+
+(*i camlp4use: "pr_o.cmo" i*)
+(* Add pr_o.cmo to circumvent a useless-warning bug when preprocessed with
+ * ast-based camlp4 *)
open Pp
+open Util
open Token
(* Dictionaries: trees annotated with string options, each node being a map
@@ -71,8 +77,10 @@ let bad_token str = raise (Error (Bad_token str))
(* Lexer conventions on tokens *)
-type utf8_token =
- Utf8Letter of int | Utf8IdentPart of int | Utf8Symbol | AsciiChar
+type token_kind =
+ | Utf8Token of (utf8_status * int)
+ | AsciiChar
+ | EmptyStream
let error_unsupported_unicode_character n cs =
let bp = Stream.count cs in
@@ -80,6 +88,7 @@ let error_unsupported_unicode_character n cs =
let error_utf8 cs =
let bp = Stream.count cs in
+ Stream.junk cs; (* consume the char to avoid read it and fail again *)
err (bp, bp+1) Illegal_character
let njunk n = Util.repeat n Stream.junk
@@ -115,114 +124,14 @@ let lookup_utf8_tail c cs =
(Char.code c3 land 0x3F) lsl 6 + (Char.code c4 land 0x3F)
| _ -> error_utf8 cs
in
- match unicode land 0x1F000 with
- | 0x0 ->
- begin match unicode with
- (* utf-8 Latin-1 non breaking space U00A0 *)
- | 0x00A0 -> Utf8Letter n
- (* utf-8 Latin-1 symbols U00A1-00BF *)
- | x when 0x00A0 <= x & x <= 0x00BF -> Utf8Symbol
- (* utf-8 Latin-1 letters U00C0-00D6 *)
- | x when 0x00C0 <= x & x <= 0x00D6 -> Utf8Letter n
- (* utf-8 Latin-1 symbol U00D7 *)
- | 0x00D7 -> Utf8Symbol
- (* utf-8 Latin-1 letters U00D8-00F6 *)
- | x when 0x00D8 <= x & x <= 0x00F6 -> Utf8Letter n
- (* utf-8 Latin-1 symbol U00F7 *)
- | 0x00F7 -> Utf8Symbol
- (* utf-8 Latin-1 letters U00F8-00FF *)
- | x when 0x00F8 <= x & x <= 0x00FF -> Utf8Letter n
- (* utf-8 Latin Extended A U0100-017F and Latin Extended B U0180-U0241 *)
- | x when 0x0100 <= x & x <= 0x0241 -> Utf8Letter n
- (* utf-8 Phonetic letters U0250-02AF *)
- | x when 0x0250 <= x & x <= 0x02AF -> Utf8Letter n
- (* utf-8 what do to with diacritics U0300-U036F ? *)
- (* utf-8 Greek letters U0380-03FF *)
- | x when 0x0380 <= x & x <= 0x03FF -> Utf8Letter n
- (* utf-8 Cyrillic letters U0400-0481 *)
- | x when 0x0400 <= x & x <= 0x0481 -> Utf8Letter n
- (* utf-8 Cyrillic symbol U0482 *)
- | 0x0482 -> Utf8Symbol
- (* utf-8 what do to with diacritics U0483-U0489 \ U0487 ? *)
- (* utf-8 Cyrillic letters U048A-U4F9 (Warning: 04CF) *)
- | x when 0x048A <= x & x <= 0x04F9 -> Utf8Letter n
- (* utf-8 Cyrillic supplement letters U0500-U050F *)
- | x when 0x0500 <= x & x <= 0x050F -> Utf8Letter n
- (* utf-8 Hebrew letters U05D0-05EA *)
- | x when 0x05D0 <= x & x <= 0x05EA -> Utf8Letter n
- (* utf-8 Arabic letters U0621-064A *)
- | x when 0x0621 <= x & x <= 0x064A -> Utf8Letter n
- (* utf-8 Arabic supplement letters U0750-076D *)
- | x when 0x0750 <= x & x <= 0x076D -> Utf8Letter n
- | _ -> error_unsupported_unicode_character n cs
- end
- | 0x1000 ->
- begin match unicode with
- (* utf-8 Georgian U10A0-10FF (has holes) *)
- | x when 0x10A0 <= x & x <= 0x10FF -> Utf8Letter n
- (* utf-8 Hangul Jamo U1100-11FF (has holes) *)
- | x when 0x1100 <= x & x <= 0x11FF -> Utf8Letter n
- (* utf-8 Latin additional letters U1E00-1E9B and U1EA0-1EF9 *)
- | x when 0x1E00 <= x & x <= 0x1E9B -> Utf8Letter n
- | x when 0x1EA0 <= x & x <= 0x1EF9 -> Utf8Letter n
- | _ -> error_unsupported_unicode_character n cs
- end
- | 0x2000 ->
- begin match unicode with
- (* utf-8 general punctuation U2080-2089 *)
- (* Hyphens *)
- | x when 0x2010 <= x & x <= 0x2011 -> Utf8Letter n
- (* Dashes and other symbols *)
- | x when 0x2012 <= x & x <= 0x2027 -> Utf8Symbol
- (* Per mille and per ten thousand signs *)
- | x when 0x2030 <= x & x <= 0x2031 -> Utf8Symbol
- (* Prime letters *)
- | x when 0x2032 <= x & x <= 0x2034 or x = 0x2057 -> Utf8IdentPart n
- (* Miscellaneous punctuation *)
- | x when 0x2039 <= x & x <= 0x2056 -> Utf8Symbol
- | x when 0x2058 <= x & x <= 0x205E -> Utf8Symbol
- (* Invisible mathematical operators *)
- | x when 0x2061 <= x & x <= 0x2063 -> Utf8Symbol
-
- (* utf-8 subscript U2080-2089 *)
- | x when 0x2080 <= x & x <= 0x2089 -> Utf8IdentPart n
- (* utf-8 letter-like U2100-214F *)
- | x when 0x2100 <= x & x <= 0x214F -> Utf8Letter n
- (* utf-8 number-forms U2153-2183 *)
- | x when 0x2153 <= x & x <= 0x2183 -> Utf8Symbol
- (* utf-8 arrows A U2190-21FF *)
- (* utf-8 mathematical operators U2200-22FF *)
- (* utf-8 miscellaneous technical U2300-23FF *)
- | x when 0x2190 <= x & x <= 0x23FF -> Utf8Symbol
- (* utf-8 box drawing U2500-257F has ceiling, etc. *)
- (* utf-8 block elements U2580-259F *)
- (* utf-8 geom. shapes U25A0-25FF (has triangles, losange, etc) *)
- (* utf-8 miscellaneous symbols U2600-26FF *)
- | x when 0x2500 <= x & x <= 0x26FF -> Utf8Symbol
- (* utf-8 arrows B U2900-297F *)
- | x when 0x2900 <= x & x <= 0x297F -> Utf8Symbol
- (* utf-8 mathematical operators U2A00-2AFF *)
- | x when 0x2A00 <= x & x <= 0x2AFF -> Utf8Symbol
- | _ -> error_unsupported_unicode_character n cs
- end
- | _ ->
- begin match unicode with
- (* utf-8 Hiragana U3040-309F and Katakana U30A0-30FF *)
- | x when 0x3040 <= x & x <= 0x30FF -> Utf8Letter n
- (* utf-8 Unified CJK Ideographs U4E00-9FA5 *)
- | x when 0x4E00 <= x & x <= 0x9FA5 -> Utf8Letter n
- (* utf-8 Hangul syllables UAC00-D7AF *)
- | x when 0xAC00 <= x & x <= 0xD7AF -> Utf8Letter n
- (* utf-8 Gothic U10330-1034A *)
- | x when 0x10330 <= x & x <= 0x1034A -> Utf8Letter n
- | _ -> error_unsupported_unicode_character n cs
- end
+ try classify_unicode unicode, n
+ with UnsupportedUtf8 -> error_unsupported_unicode_character n cs
let lookup_utf8 cs =
match Stream.peek cs with
- | Some ('\x00'..'\x7F') -> Some AsciiChar
- | Some ('\x80'..'\xFF' as c) -> Some (lookup_utf8_tail c cs)
- | None -> None
+ | Some ('\x00'..'\x7F') -> AsciiChar
+ | Some ('\x80'..'\xFF' as c) -> Utf8Token (lookup_utf8_tail c cs)
+ | None -> EmptyStream
let check_special_token str =
let rec loop_symb = parser
@@ -234,16 +143,16 @@ let check_special_token str =
let check_ident str =
let rec loop_id intail = parser
- | [< ' ('$' | 'a'..'z' | 'A'..'Z' | '_'); s >] ->
+ | [< ' ('a'..'z' | 'A'..'Z' | '_'); s >] ->
loop_id true s
| [< ' ('0'..'9' | ''') when intail; s >] ->
loop_id true s
| [< s >] ->
match lookup_utf8 s with
- | Some (Utf8Letter n) -> njunk n s; loop_id true s
- | Some (Utf8IdentPart n) when intail -> njunk n s; loop_id true s
- | Some _ -> bad_token str
- | None -> ()
+ | Utf8Token (UnicodeLetter, n) -> njunk n s; loop_id true s
+ | Utf8Token (UnicodeIdentPart, n) when intail -> njunk n s; loop_id true s
+ | EmptyStream -> ()
+ | Utf8Token _ | AsciiChar -> bad_token str
in
loop_id false (Stream.of_string str)
@@ -266,7 +175,7 @@ let add_keyword str =
(* Adding a new token (keyword or special token). *)
let add_token (con, str) = match con with
| "" -> add_keyword str
- | "METAIDENT" | "IDENT" | "FIELD" | "INT" | "STRING" | "EOI"
+ | "METAIDENT" | "PATTERNIDENT" | "IDENT" | "FIELD" | "INT" | "STRING" | "EOI"
-> ()
| _ ->
raise (Token.Error ("\
@@ -308,7 +217,7 @@ let rec ident_tail len = parser
ident_tail (store len c) s
| [< s >] ->
match lookup_utf8 s with
- | Some (Utf8IdentPart n | Utf8Letter n) ->
+ | Utf8Token ((UnicodeIdentPart | UnicodeLetter), n) ->
ident_tail (nstore n len s) s
| _ -> len
@@ -368,10 +277,10 @@ let null_comment s =
let comment_stop ep =
let current_s = Buffer.contents current in
- if !Options.xml_export && Buffer.length current > 0 &&
+ if !Flags.xml_export && Buffer.length current > 0 &&
(!between_com || not(null_comment current_s)) then
!xml_output_comment current_s;
- (if Options.do_translate() && Buffer.length current > 0 &&
+ (if Flags.do_translate() && Buffer.length current > 0 &&
(!between_com || not(null_comment current_s)) then
let bp = match !comment_begin with
Some bp -> bp
@@ -406,7 +315,7 @@ let rec comment bp = parser bp2
| [< '')' >] -> push_string "*)";
| [< s >] -> real_push_char '*'; comment bp s >] -> ()
| [< ''"'; s >] ->
- if Options.do_translate() then (push_string"\"";comm_string bp2 s)
+ if Flags.do_translate() then (push_string"\"";comm_string bp2 s)
else ignore (string bp2 0 s);
comment bp s
| [< _ = Stream.empty >] ep -> err (bp, ep) Unterminated_comment
@@ -457,27 +366,30 @@ let process_chars bp c cs =
| Some t -> (("", t), (bp, ep))
| None -> err (bp, ep) Undefined_token
-(* Parse what follows a dot *)
-let parse_after_dot bp c = parser
+(* Parse what follows a dot/question mark *)
+let parse_after_dot bp c =
+ let constructor = if c = '?' then "PATTERNIDENT" else "FIELD" in
+ parser
| [< ' ('a'..'z' | 'A'..'Z' | '_' as c); len = ident_tail (store 0 c) >] ->
- ("FIELD", get_buff len)
+ (constructor, get_buff len)
| [< s >] ->
match lookup_utf8 s with
- | Some (Utf8Letter n) ->
- ("FIELD", get_buff (ident_tail (nstore n 0 s) s))
- | Some (Utf8IdentPart _ | AsciiChar | Utf8Symbol) | None ->
+ | Utf8Token (UnicodeLetter, n) ->
+ (constructor, get_buff (ident_tail (nstore n 0 s) s))
+ | AsciiChar | Utf8Token _ | EmptyStream ->
fst (process_chars bp c s)
(* Parse a token in a char stream *)
let rec next_token = parser bp
| [< '' ' | '\t' | '\n' |'\r' as c; s >] ->
comm_loc bp; push_char c; next_token s
- | [< ''$'; len = ident_tail (store 0 '$') >] ep ->
+ | [< ''$'; ' ('a'..'z' | 'A'..'Z' | '_' as c);
+ len = ident_tail (store 0 c) >] ep ->
comment_stop bp;
(("METAIDENT", get_buff len), (bp,ep))
- | [< ''.' as c; t = parse_after_dot bp c >] ep ->
+ | [< ' ('.' | '?') as c; t = parse_after_dot bp c >] ep ->
comment_stop bp;
- if Options.do_translate() & t=("",".") then between_com := true;
+ if Flags.do_translate() & t=("",".") then between_com := true;
(t, (bp,ep))
| [< ' ('a'..'z' | 'A'..'Z' | '_' as c);
len = ident_tail (store 0 c) >] ep ->
@@ -501,16 +413,16 @@ let rec next_token = parser bp
t
| [< s >] ->
match lookup_utf8 s with
- | Some (Utf8Letter n) ->
+ | Utf8Token (UnicodeLetter, n) ->
let len = ident_tail (nstore n 0 s) s in
let id = get_buff len in
let ep = Stream.count s in
comment_stop bp;
(try ("",find_keyword id) with Not_found -> ("IDENT",id)), (bp, ep)
- | Some (Utf8Symbol | AsciiChar | Utf8IdentPart _) ->
+ | AsciiChar | Utf8Token ((UnicodeSymbol | UnicodeIdentPart), _) ->
let t = process_chars bp (Stream.next s) s in
comment_stop bp; t
- | None ->
+ | EmptyStream ->
comment_stop bp; (("EOI", ""), (bp, bp + 1))
(* Location table system for creating tables associating a token count