From a4c7f8bd98be2a200489325ff7c5061cf80ab4f3 Mon Sep 17 00:00:00 2001 From: Enrico Tassi Date: Tue, 27 Dec 2016 16:53:30 +0100 Subject: Imported Upstream version 8.6 --- lib/unicode.ml | 122 +++++++++++++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 106 insertions(+), 16 deletions(-) (limited to 'lib/unicode.ml') diff --git a/lib/unicode.ml b/lib/unicode.ml index 1765e93d..ced5e258 100644 --- a/lib/unicode.ml +++ b/lib/unicode.ml @@ -8,9 +8,7 @@ (** Unicode utilities *) -type status = Letter | IdentPart | Symbol - -exception Unsupported +type status = Letter | IdentPart | Symbol | Unknown (* The following table stores classes of Unicode characters that are used by the lexer. There are 3 different classes so 2 bits are @@ -18,7 +16,7 @@ exception Unsupported to simplify the masking process. (This choice seems to be a good trade-off between speed and space after some benchmarks.) *) -(* A 256ko table, initially filled with zeros. *) +(* A 256 KiB table, initially filled with zeros. *) let table = Array.make (1 lsl 17) 0 (* Associate a 2-bit pattern to each status at position [i]. @@ -29,6 +27,7 @@ let mask i = function | Letter -> 1 lsl ((i land 7) lsl 1) (* 01 *) | IdentPart -> 2 lsl ((i land 7) lsl 1) (* 10 *) | Symbol -> 3 lsl ((i land 7) lsl 1) (* 11 *) + | Unknown -> 0 lsl ((i land 7) lsl 1) (* 00 *) (* Helper to reset 2 bits in a word. *) let reset_mask i = @@ -55,7 +54,7 @@ let lookup x = if v = 1 then Letter else if v = 2 then IdentPart else if v = 3 then Symbol - else raise Unsupported + else Unknown (* [classify] discriminates between 3 different kinds of symbols based on the standard unicode classification (extracted from @@ -147,6 +146,11 @@ let utf8_of_unicode n = s end +(* If [s] is some UTF-8 encoded string + and [i] is a position of some UTF-8 character within [s] + then [next_utf8 s i] returns [(j,n)] where: + - [j] indicates the position of the next UTF-8 character + - [n] represents the UTF-8 character at index [i] *) let next_utf8 s i = let err () = invalid_arg "utf8" in let l = String.length s - i in @@ -168,6 +172,13 @@ let next_utf8 s i = (c land 0x3F) lsl 6 + (d land 0x3F) else err () +let is_utf8 s = + let rec check i = + let (off, _) = next_utf8 s i in + check (i + off) + in + try check 0 with End_of_input -> true | Invalid_argument _ -> false + (* Check the well-formedness of an identifier *) let initial_refutation j n s = @@ -203,7 +214,6 @@ let ident_refutation s = |x -> x with | End_of_input -> Some (true,"The empty string is not an identifier.") - | Unsupported -> Some (true,s^": unsupported character in utf8 sequence.") | Invalid_argument _ -> Some (true,s^": invalid utf8 sequence.") let lowercase_unicode = @@ -228,14 +238,94 @@ let is_basic_ascii s = !ok let ascii_of_ident s = - if is_basic_ascii s then s else - let i = ref 0 and out = ref "" in - begin try while true do + let len = String.length s in + let has_UU i = + i+2 < len && s.[i]='_' && s.[i+1]='U' && s.[i+2]='U' + in + let i = ref 0 in + while !i < len && Char.code s.[!i] < 128 && not (has_UU !i) do + incr i + done; + if !i = len then s else + let out = Buffer.create (2*len) in + Buffer.add_substring out s 0 !i; + while !i < len do let j, n = next_utf8 s !i in - out := - if n >= 128 - then Printf.sprintf "%s__U%04x_" !out n - else Printf.sprintf "%s%c" !out s.[!i]; - i := !i + j - done with End_of_input -> () end; - !out + if n >= 128 then + (Printf.bprintf out "_UU%04x_" n; i := !i + j) + else if has_UU !i then + (Buffer.add_string out "_UUU"; i := !i + 3) + else + (Buffer.add_char out s.[!i]; incr i) + done; + Buffer.contents out + +(* Compute length of an UTF-8 encoded string + Rem 1 : utf8_length <= String.length (equal if pure ascii) + Rem 2 : if used for an iso8859_1 encoded string, the result is + wrong in very rare cases. Such a wrong case corresponds to any + sequence of a character in range 192..253 immediately followed by a + character in range 128..191 (typical case in french is "déçu" which + is counted 3 instead of 4); then no real harm to use always + utf8_length even if using an iso8859_1 encoding *) + +(** FIXME: duplicate code with Pp *) + +let utf8_length s = + let len = String.length s + and cnt = ref 0 + and nc = ref 0 + and p = ref 0 in + while !p < len do + begin + match s.[!p] with + | '\000'..'\127' -> nc := 0 (* ascii char *) + | '\128'..'\191' -> nc := 0 (* cannot start with a continuation byte *) + | '\192'..'\223' -> nc := 1 (* expect 1 continuation byte *) + | '\224'..'\239' -> nc := 2 (* expect 2 continuation bytes *) + | '\240'..'\247' -> nc := 3 (* expect 3 continuation bytes *) + | '\248'..'\251' -> nc := 4 (* expect 4 continuation bytes *) + | '\252'..'\253' -> nc := 5 (* expect 5 continuation bytes *) + | '\254'..'\255' -> nc := 0 (* invalid byte *) + end ; + incr p ; + while !p < len && !nc > 0 do + match s.[!p] with + | '\128'..'\191' (* next continuation byte *) -> incr p ; decr nc + | _ (* not a continuation byte *) -> nc := 0 + done ; + incr cnt + done ; + !cnt + +(* Variant of String.sub for UTF8 character positions *) +let utf8_sub s start_u len_u = + let len_b = String.length s + and end_u = start_u + len_u + and cnt = ref 0 + and nc = ref 0 + and p = ref 0 in + let start_b = ref len_b in + while !p < len_b && !cnt < end_u do + if !cnt <= start_u then start_b := !p ; + begin + match s.[!p] with + | '\000'..'\127' -> nc := 0 (* ascii char *) + | '\128'..'\191' -> nc := 0 (* cannot start with a continuation byte *) + | '\192'..'\223' -> nc := 1 (* expect 1 continuation byte *) + | '\224'..'\239' -> nc := 2 (* expect 2 continuation bytes *) + | '\240'..'\247' -> nc := 3 (* expect 3 continuation bytes *) + | '\248'..'\251' -> nc := 4 (* expect 4 continuation bytes *) + | '\252'..'\253' -> nc := 5 (* expect 5 continuation bytes *) + | '\254'..'\255' -> nc := 0 (* invalid byte *) + end ; + incr p ; + while !p < len_b && !nc > 0 do + match s.[!p] with + | '\128'..'\191' (* next continuation byte *) -> incr p ; decr nc + | _ (* not a continuation byte *) -> nc := 0 + done ; + incr cnt + done ; + let end_b = !p in + String.sub s !start_b (end_b - !start_b) -- cgit v1.2.3