diff options
Diffstat (limited to 'parsing/lexer.ml4')
-rw-r--r-- | parsing/lexer.ml4 | 254 |
1 files changed, 123 insertions, 131 deletions
diff --git a/parsing/lexer.ml4 b/parsing/lexer.ml4 index bf5f3bfe..6119b86e 100644 --- a/parsing/lexer.ml4 +++ b/parsing/lexer.ml4 @@ -6,7 +6,7 @@ (* * GNU Lesser General Public License Version 2.1 *) (************************************************************************) -(*i $Id: lexer.ml4,v 1.24.2.6 2004/07/16 20:51:12 herbelin Exp $ i*) +(*i $Id: lexer.ml4 7870 2006-01-15 20:29:09Z herbelin $ i*) open Pp open Token @@ -82,23 +82,22 @@ let check_ident str = let rec loop_id = parser | [< ' ('$' | 'a'..'z' | 'A'..'Z' | '0'..'9' | ''' | '_'); s >] -> loop_id s - (* Greek utf-8 letters [CE80-CEBF and CF80-CFBF] (CE=206; BF=191) *) - | [< ' ('\206' | '\207'); ' ('\128'..'\191'); s >] -> loop_id s - | [< ''\226'; 'c2; 'c3; s >] -> + (* utf-8 Greek letters U0380-03FF *) + | [< ' ('\xCE' | '\xCF'); ' ('\x80'..'\xBF'); s >] -> loop_id s + | [< ''\xE2'; 'c2; 'c3; s >] -> (match c2, c3 with - (* utf8 letter-like unicode 2100-214F *) - | (('\132', '\128'..'\191') | ('\133', '\128'..'\143')) -> + (* utf-8 letter-like U2100-214F *) + | ( ('\x84', '\x80'..'\xBF') + | ('\x85', '\x80'..'\x8F') + (* utf-8 subscript U2080-2089 *) + | ('\x82', '\x80'..'\x89')) -> loop_id s - (* utf8 symbols (see [parse_226_tail]) *) - | (('\134'..'\143' | '\152'..'\155' | '\159' - | '\164'..'\171'),_) -> + (* utf-8 symbols (see [parse_226_tail]) *) + | (('\x86'..'\x8F' | '\x94'..'\x9B' + | '\xA4'..'\xA5' | '\xA8'..'\xAB'),_) -> bad_token str - | _ -> (* default to iso 8859-1 "â" *) - if !Options.v7 then loop_id [< 'c2; 'c3; s >] - else bad_token str) - (* iso 8859-1 accentuated letters *) - | [< ' ('\192'..'\214' | '\216'..'\246' | '\248'..'\255'); s >] -> - if !Options.v7 then loop_id s else bad_token str + | _ -> + bad_token str) | [< _ = Stream.empty >] -> () | [< >] -> bad_token str in @@ -170,27 +169,26 @@ let get_buff len = String.sub !buff 0 len (* The classical lexer: idents, numbers, quoted strings, comments *) -let rec ident_tail len strm = - if !Options.v7 then - match strm with parser - | [< ' ('a'..'z' | 'A'..'Z' | '0'..'9' | ''' | '_' | '@' as c); s >] -> - ident_tail (store len c) s - (* Greek utf-8 letters [CE80-CEBF and CF80-CFBF] (CE=206; BF=191) *) - | [< ' ('\206' | '\207' as c1); ' ('\128'..'\191' as c2) ; s >] -> - ident_tail (store (store len c1) c2) s - (* iso 8859-1 accentuated letters *) - | [< ' ('\192'..'\214' | '\216'..'\246' | '\248'..'\255' as c); s >] -> - ident_tail (store len c) s - | [< >] -> len - else - match strm with parser - | [< ' ('a'..'z' | 'A'..'Z' | '0'..'9' | ''' | '_' as c); s >] -> - ident_tail (store len c) s - (* Greek utf-8 letters [CE80-CEBF and CF80-CFBF] (CE=206; BF=191) *) - | [< ' ('\206' | '\207' as c1); ' ('\128'..'\191' as c2) ; s >] -> - ident_tail (store (store len c1) c2) s - | [< >] -> len - +let rec ident_tail len = parser + | [< ' ('a'..'z' | 'A'..'Z' | '0'..'9' | ''' | '_' as c); s >] -> + ident_tail (store len c) s + (* utf-8 Greek letters U0380-03FF *) + | [< ' ('\xCE' | '\xCF' as c1); ' ('\x80'..'\xBF' as c2) ; s >] -> + ident_tail (store (store len c1) c2) s + | [< s >] -> + match Stream.peek s with + | Some '\xE2' -> + (match List.tl (Stream.npeek 3 s) with + (* utf-8 subscript U2080-2089 *) + | ['\x82' as c2; ('\x80'..'\x89' as c3)] + (* utf-8 letter-like U2100-214F part 1 *) + | ['\x84' as c2; ('\x80'..'\xBF' as c3)] + (* utf-8 letter-like U2100-214F part 2 *) + | ['\x85' as c2; ('\x80'..'\x8F' as c3)] -> + Stream.junk s; Stream.junk s; Stream.junk s; + ident_tail (store (store (store len '\xE2') c2) c3) s + | _ -> len) + | _ -> len let rec number len = parser | [< ' ('0'..'9' as c); s >] -> number (store len c) s @@ -198,21 +196,11 @@ let rec number len = parser let escape len c = store len c -let rec string_v8 bp len = parser +let rec string bp len = parser | [< ''"'; esc=(parser [<''"' >] -> true | [< >] -> false); s >] -> - if esc then string_v8 bp (store len '"') s else len - | [< 'c; s >] -> string_v8 bp (store len c) s - | [< _ = Stream.empty >] ep -> err (bp, ep) Unterminated_string - -let rec string_v7 bp len = parser - | [< ''"' >] -> len - | [< ''\\'; c = (parser [< ' ('"' | '\\' as c) >] -> c | [< >] -> '\\'); s >] - -> string_v7 bp (escape len c) s + if esc then string bp (store len '"') s else len + | [< 'c; s >] -> string bp (store len c) s | [< _ = Stream.empty >] ep -> err (bp, ep) Unterminated_string - | [< 'c; s >] -> string_v7 bp (store len c) s - -let string bp len s = - if !Options.v7 then string_v7 bp len s else string_v8 bp len s (* Hook for exporting comment into xml theory files *) let xml_output_comment = ref (fun _ -> ()) @@ -293,14 +281,14 @@ let rec comment bp = parser bp2 s >] -> comment bp s | [< ''*'; _ = parser - | [< '')' >] ep -> push_string "*)"; + | [< '')' >] -> push_string "*)"; | [< s >] -> real_push_char '*'; comment bp s >] -> () | [< ''"'; s >] -> if Options.do_translate() then (push_string"\"";comm_string bp2 s) else ignore (string bp2 0 s); comment bp s | [< _ = Stream.empty >] ep -> err (bp, ep) Unterminated_comment - | [< '_ as z; s >] ep -> real_push_char z; comment bp s + | [< 'z; s >] -> real_push_char z; comment bp s (* Parse a special token, using the [token_tree] *) @@ -334,100 +322,81 @@ type token_226_tail = | TokSymbol of string option | TokIdent of string +(* 1110xxxx 10yyyyzz 10zztttt utf-8 codes for xxxx=0010 *) let parse_226_tail tk = parser - | [< ''\132' as c2; ' ('\128'..'\191' as c3); - (* utf8 letter-like unicode 2100-214F *) - len = ident_tail (store (store (store 0 '\226') c2) c3) >] -> + | [< ''\x82' as c2; ' ('\x80'..'\x89' as c3); + (* utf-8 subscript U2080-2089 *) + len = ident_tail (store (store (store 0 '\xE2') c2) c3) >] -> + TokIdent (get_buff len) + | [< ''\x84' as c2; ' ('\x80'..'\xBF' as c3); + (* utf-8 letter-like U2100-214F part 1 *) + len = ident_tail (store (store (store 0 '\xE2') c2) c3) >] -> TokIdent (get_buff len) - | [< ''\133' as c2; ' ('\128'..'\143' as c3); - (* utf8 letter-like unicode 2100-214F *) - len = ident_tail (store (store (store 0 '\226') c2) c3) >] -> + | [< ''\x85' as c2; ' ('\x80'..'\x8F' as c3); + (* utf-8 letter-like U2100-214F part 2 *) + len = ident_tail (store (store (store 0 '\xE2') c2) c3) >] -> TokIdent (get_buff len) - | [< ' ('\134'..'\143' | '\152'..'\155' | '\159' - | '\164'..'\171' as c2); 'c3; - (* utf8 arrows A unicode 2190-21FF *) - (* utf8 mathematical operators unicode 2200-22FF *) - (* utf8 miscellaneous technical unicode 2300-23FF *) - (* utf8 miscellaneous symbols unicode 2600-26FF *) - (* utf8 Miscellaneous Mathematical Symbols-A unicode 27C0-27DF *) - (* utf8 Supplemental Arrows-A unicode 27E0-27FF *) - (* utf8 Supplemental Arrows-B unicode 2900-297F *) - (* utf8 Miscellaneous Mathematical Symbols-B unicode 2980-29FF *) - (* utf8 mathematical operators unicode 2A00-2AFF *) + | [< ' ('\x86'..'\x8F' | '\x94'..'\x9B' | '\xA4'..'\xA5' + | '\xA8'..'\xAB' as c2); 'c3; + (* utf-8 arrows A U2190-21FF *) + (* utf-8 mathematical operators U2200-22FF *) + (* utf-8 miscellaneous technical U2300-23FF *) + (* utf-8 box drawing U2500-257F has ceiling, etc. *) + (* utf-8 block elements U2580-259F *) + (* utf-8 geom. shapes U25A0-25FF (has triangles, losange, etc) *) + (* utf-8 miscellaneous symbols U2600-26FF *) + (* utf-8 arrows B U2900-297F *) + (* utf-8 mathematical operators U2A00-2AFF *) t = special (progress_special c3 (progress_special c2 - (progress_special '\226' tk))) >] -> + (progress_special '\xE2' tk))) >] -> TokSymbol t - | [< len = ident_tail (store 0 '\226') >] -> - TokIdent (get_buff len) - + | [< '_; '_ >] -> + (* Unsupported utf-8 code *) + TokSymbol None (* Parse what follows a dot *) -let parse_after_dot bp c strm = - if !Options.v7 then - match strm with parser - | [< ' ('_' | 'a'..'z' | 'A'..'Z' as c); - len = ident_tail (store 0 c) >] -> - ("FIELD", get_buff len) - (* Greek utf-8 letters [CE80-CEBF and CF80-CFBF] (CE=206; BF=191) *) - | [< ' ('\206' | '\207' as c1); ' ('\128'..'\191' as c2); - len = ident_tail (store (store 0 c1) c2) >] -> - ("FIELD", get_buff len) - (* utf-8 mathematical symbols have format E2 xx xx [E2=226] *) - | [< ''\226' as c1; t = parse_226_tail - (progress_special '.' (Some !token_tree)) >] ep -> - (match t with - | TokSymbol (Some t) -> ("", t) - | TokSymbol None -> err (bp, ep) Undefined_token - | TokIdent t -> ("FIELD", t)) - (* iso 8859-1 accentuated letters *) - | [< ' ('\192'..'\214' | '\216'..'\246' | '\248'..'\255' as c); - len = ident_tail (store 0 c) >] -> - ("FIELD", get_buff len) - | [< (t,_) = process_chars bp c >] -> t - else - match strm with parser - | [< ' ('a'..'z' | 'A'..'Z' | '_' as c); - len = ident_tail (store 0 c) >] -> - ("FIELD", get_buff len) - (* Greek utf-8 letters [CE80-CEBF and CF80-CFBF] (CE=206; BF=191) *) - | [< ' ('\206' | '\207' as c1); ' ('\128'..'\191' as c2); - len = ident_tail (store (store 0 c1) c2) >] -> - ("FIELD", get_buff len) - (* utf-8 mathematical symbols have format E2 xx xx [E2=226] *) - | [< ''\226' as c1; t = parse_226_tail - (progress_special '.' (Some !token_tree)) >] ep -> - (match t with - | TokSymbol (Some t) -> ("", t) - | TokSymbol None -> err (bp, ep) Undefined_token - | TokIdent t -> ("FIELD", t)) - | [< (t,_) = process_chars bp c >] -> t +let parse_after_dot bp c = parser + | [< ' ('a'..'z' | 'A'..'Z' | '_' as c); + len = ident_tail (store 0 c) >] -> + ("FIELD", get_buff len) + (* utf-8 Greek letters U0380-03FF *) + | [< ' ('\xCE' | '\xCF' as c1); ' ('\x80'..'\xBF' as c2); + len = ident_tail (store (store 0 c1) c2) >] -> + ("FIELD", get_buff len) + (* utf-8 mathematical symbols have format E2 xx xx [E2=226] *) + | [< ''\xE2'; t = parse_226_tail + (progress_special '.' (Some !token_tree)) >] ep -> + (match t with + | TokSymbol (Some t) -> ("", t) + | TokSymbol None -> err (bp, ep) Undefined_token + | TokIdent t -> ("FIELD", t)) + | [< (t,_) = process_chars bp c >] -> t (* Parse a token in a char stream *) let rec next_token = parser bp - | [< '' ' | '\t' | '\n' |'\r' as c; s >] ep -> + | [< '' ' | '\t' | '\n' |'\r' as c; s >] -> comm_loc bp; push_char c; next_token s | [< ''$'; len = ident_tail (store 0 '$') >] ep -> comment_stop bp; (("METAIDENT", get_buff len), (bp,ep)) | [< ''.' as c; t = parse_after_dot bp c >] ep -> comment_stop bp; - if !Options.v7 & t=("",".") then between_com := true; (t, (bp,ep)) | [< ' ('a'..'z' | 'A'..'Z' | '_' as c); len = ident_tail (store 0 c) >] ep -> let id = get_buff len in comment_stop bp; (try ("", find_keyword id) with Not_found -> ("IDENT", id)), (bp, ep) - (* Greek utf-8 letters [CE80-CEBF and CF80-CFBF] (CE=206; BF=191) *) - | [< ' ('\206' | '\207' as c1); ' ('\128'..'\191' as c2); + (* utf-8 Greek letters U0380-03FF [CE80-CEBF and CF80-CFBF] *) + | [< ' ('\xCE' | '\xCF' as c1); ' ('\x80'..'\xBF' as c2); len = ident_tail (store (store 0 c1) c2) >] ep -> let id = get_buff len in comment_stop bp; (try ("", find_keyword id) with Not_found -> ("IDENT", id)), (bp, ep) (* utf-8 mathematical symbols have format E2 xx xx [E2=226] *) - | [< ''\226' as c1; t = parse_226_tail (Some !token_tree) >] ep -> + | [< ''\xE2'; t = parse_226_tail (Some !token_tree) >] ep -> comment_stop bp; (match t with | TokSymbol (Some t) -> ("", t), (bp, ep) @@ -435,21 +404,6 @@ let rec next_token = parser bp | TokIdent id -> (try ("", find_keyword id) with Not_found -> ("IDENT", id)), (bp, ep)) - (* iso 8859-1 accentuated letters *) - | [< ' ('\192'..'\214' | '\216'..'\246' | '\248'..'\255' as c) ; s >] -> - if !Options.v7 then - begin - match s with parser - [< len = ident_tail (store 0 c) >] ep -> - let id = get_buff len in - comment_stop bp; - (try ("", find_keyword id) with Not_found -> ("IDENT", id)), (bp, ep) - end - else - begin - match s with parser - [< t = process_chars bp c >] -> comment_stop bp; t - end | [< ' ('0'..'9' as c); len = number (store 0 c) >] ep -> comment_stop bp; (("INT", get_buff len), (bp, ep)) @@ -537,3 +491,41 @@ let tparse (p_con, p_prm) = else (parser [< '(con, prm) when con = p_con && prm = p_prm >] -> prm) i*) + +(* Terminal symbols interpretation *) + +let is_ident_not_keyword s = + match s.[0] with + | 'a'..'z' | 'A'..'Z' | '_' -> not (is_keyword s) + | _ -> false + +let is_number s = + match s.[0] with + | '0'..'9' -> true + | _ -> false + +let strip s = + let len = + let rec loop i len = + if i = String.length s then len + else if s.[i] == ' ' then loop (i + 1) len + else loop (i + 1) (len + 1) + in + loop 0 0 + in + if len == String.length s then s + else + let s' = String.create len in + let rec loop i i' = + if i == String.length s then s' + else if s.[i] == ' ' then loop (i + 1) i' + else begin s'.[i'] <- s.[i]; loop (i + 1) (i' + 1) end + in + loop 0 0 + +let terminal s = + let s = strip s in + if s = "" then failwith "empty token"; + if is_ident_not_keyword s then ("IDENT", s) + else if is_number s then ("INT", s) + else ("", s) |