diff options
Diffstat (limited to 'parsing/lexer.ml4')
-rw-r--r-- | parsing/lexer.ml4 | 25 |
1 files changed, 15 insertions, 10 deletions
diff --git a/parsing/lexer.ml4 b/parsing/lexer.ml4 index 2633386f..52b5ede7 100644 --- a/parsing/lexer.ml4 +++ b/parsing/lexer.ml4 @@ -6,7 +6,7 @@ (* * GNU Lesser General Public License Version 2.1 *) (************************************************************************) -(*i $Id: lexer.ml4 11786 2009-01-14 13:07:34Z herbelin $ i*) +(*i $Id: lexer.ml4 12891 2010-03-30 11:40:02Z herbelin $ i*) (*i camlp4use: "pr_o.cmo" i*) @@ -91,6 +91,14 @@ let error_utf8 cs = Stream.junk cs; (* consume the char to avoid read it and fail again *) err (bp, bp+1) Illegal_character +let utf8_char_size cs = function + (* Utf8 leading byte *) + | '\x00'..'\x7F' -> 1 + | '\xC0'..'\xDF' -> 2 + | '\xE0'..'\xEF' -> 3 + | '\xF0'..'\xF7' -> 4 + | _ (* '\x80'..\xBF'|'\xF8'..'\xFF' *) -> error_utf8 cs + let njunk n = Util.repeat n Stream.junk let check_utf8_trailing_byte cs c = @@ -355,14 +363,8 @@ and progress_utf8 last nj n c tt cs = with Not_found -> last -and progress_from_byte last nj tt cs = function - (* Utf8 leading byte *) - | '\x00'..'\x7F' as c -> progress_utf8 last nj 1 c tt cs - | '\xC0'..'\xDF' as c -> progress_utf8 last nj 2 c tt cs - | '\xE0'..'\xEF' as c -> progress_utf8 last nj 3 c tt cs - | '\xF0'..'\xF7' as c -> progress_utf8 last nj 4 c tt cs - | _ (* '\x80'..\xBF'|'\xF8'..'\xFF' *) -> - error_utf8 cs +and progress_from_byte last nj tt cs c = + progress_utf8 last nj (utf8_char_size cs c) c tt cs (* Must be a special token *) let process_chars bp c cs = @@ -370,7 +372,10 @@ let process_chars bp c cs = let ep = Stream.count cs in match t with | Some t -> (("", t), (bp, ep)) - | None -> err (bp, ep) Undefined_token + | None -> + let ep' = bp + utf8_char_size cs c in + njunk (ep' - ep) cs; + err (bp, ep') Undefined_token let parse_after_dollar bp = parser |