From 9043add656177eeac1491a73d2f3ab92bec0013c Mon Sep 17 00:00:00 2001 From: Benjamin Barenblat Date: Sat, 29 Dec 2018 14:31:27 -0500 Subject: Imported Upstream version 8.8.2 --- clib/unicode.ml | 387 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 387 insertions(+) create mode 100644 clib/unicode.ml (limited to 'clib/unicode.ml') diff --git a/clib/unicode.ml b/clib/unicode.ml new file mode 100644 index 00000000..1e45c0d2 --- /dev/null +++ b/clib/unicode.ml @@ -0,0 +1,387 @@ +(************************************************************************) +(* * The Coq Proof Assistant / The Coq Development Team *) +(* v * INRIA, CNRS and contributors - Copyright 1999-2018 *) +(* 1 lsl ((i land 7) * 3) (* 001 *) + | IdentPart -> 2 lsl ((i land 7) * 3) (* 010 *) + | Symbol -> 3 lsl ((i land 7) * 3) (* 011 *) + | IdentSep -> 4 lsl ((i land 7) * 3) (* 100 *) + | Unknown -> 0 lsl ((i land 7) * 3) (* 000 *) + +(* Helper to reset 3 bits in a word. *) +let reset_mask i = + lnot (7 lsl ((i land 7) * 3)) + +(* Initialize the lookup table from a list of segments, assigning + a status to every character of each segment. The order of these + assignments is relevant: it is possible to assign status [s] to + a segment [(c1, c2)] and later assign [s'] to [c] even if [c] is + between [c1] and [c2]. *) +let mk_lookup_table_from_unicode_tables_for status tables = + List.iter + (List.iter + (fun (c1, c2) -> + for i = c1 to c2 do + table.(i lsr 3) <- + (table.(i lsr 3) land (reset_mask i)) lor (mask i status) + done)) + tables + +(* Look up into the table and interpret the found pattern. *) +let lookup x = + let v = (table.(x lsr 3) lsr ((x land 7) * 3)) land 7 in + if v = 1 then Letter + else if v = 2 then IdentPart + else if v = 3 then Symbol + else if v = 4 then IdentSep + else Unknown + +(* [classify] discriminates between 5 different kinds of + symbols based on the standard unicode classification (extracted from + Camomile). *) +let classify = + let single c = [ (c, c) ] in + (* General tables. *) + mk_lookup_table_from_unicode_tables_for Symbol + [ + Unicodetable.sm; (* Symbol, maths. *) + Unicodetable.sc; (* Symbol, currency. *) + Unicodetable.so; (* Symbol, modifier. *) + Unicodetable.pd; (* Punctuation, dash. *) + Unicodetable.pc; (* Punctuation, connector. *) + Unicodetable.pe; (* Punctuation, open. *) + Unicodetable.ps; (* Punctution, close. *) + Unicodetable.pi; (* Punctuation, initial quote. *) + Unicodetable.pf; (* Punctuation, final quote. *) + Unicodetable.po; (* Punctuation, other. *) + ]; + mk_lookup_table_from_unicode_tables_for Letter + [ + Unicodetable.lu; (* Letter, uppercase. *) + Unicodetable.ll; (* Letter, lowercase. *) + Unicodetable.lt; (* Letter, titlecase. *) + Unicodetable.lo; (* Letter, others. *) + ]; + mk_lookup_table_from_unicode_tables_for IdentPart + [ + Unicodetable.nd; (* Number, decimal digits. *) + Unicodetable.nl; (* Number, letter. *) + Unicodetable.no; (* Number, other. *) + ]; + + (* Workaround. Some characters seems to be missing in + Camomile's category tables. We add them manually. *) + mk_lookup_table_from_unicode_tables_for Letter + [ + [(0x01D00, 0x01D7F)]; (* Phonetic Extensions. *) + [(0x01D80, 0x01DBF)]; (* Phonetic Extensions Suppl. *) + [(0x01DC0, 0x01DFF)]; (* Combining Diacritical Marks Suppl.*) + ]; + + (* Exceptions (from a previous version of this function). *) + mk_lookup_table_from_unicode_tables_for Symbol + [ + [(0x000B2, 0x000B3)]; (* Superscript 2-3. *) + single 0x000B9; (* Superscript 1. *) + single 0x02070; (* Superscript 0. *) + [(0x02074, 0x02079)]; (* Superscript 4-9. *) + single 0x0002E; (* Dot. *) + ]; + mk_lookup_table_from_unicode_tables_for IdentSep + [ + single 0x005F; (* Underscore. *) + single 0x00A0; (* Non breaking space. *) + ]; + mk_lookup_table_from_unicode_tables_for IdentPart + [ + single 0x0027; (* Single quote. *) + ]; + (* Lookup *) + lookup + +exception End_of_input + +let utf8_of_unicode n = + if n < 128 then + String.make 1 (Char.chr n) + else + let (m,s) = if n < 2048 then (2,192) else if n < 65536 then (3,224) else (4,240) in + String.init m (fun i -> + let j = (n lsr ((m - 1 - i) * 6)) land 63 in + Char.chr (j + if i = 0 then s else 128)) + +(* If [s] is some UTF-8 encoded string + and [i] is a position of some UTF-8 character within [s] + then [next_utf8 s i] returns [(j,n)] where: + - [j] indicates the position of the next UTF-8 character + - [n] represents the UTF-8 character at index [i] *) +let next_utf8 s i = + let err () = invalid_arg "utf8" in + let l = String.length s - i in + if l = 0 then raise End_of_input + else let a = Char.code s.[i] in if a <= 0x7F then + 1, a + else if a land 0x40 = 0 || l = 1 then err () + else let b = Char.code s.[i+1] in if b land 0xC0 <> 0x80 then err () + else if a land 0x20 = 0 then + 2, (a land 0x1F) lsl 6 + (b land 0x3F) + else if l = 2 then err () + else let c = Char.code s.[i+2] in if c land 0xC0 <> 0x80 then err () + else if a land 0x10 = 0 then + 3, (a land 0x0F) lsl 12 + (b land 0x3F) lsl 6 + (c land 0x3F) + else if l = 3 then err () + else let d = Char.code s.[i+3] in if d land 0xC0 <> 0x80 then err () + else if a land 0x08 = 0 then + 4, (a land 0x07) lsl 18 + (b land 0x3F) lsl 12 + + (c land 0x3F) lsl 6 + (d land 0x3F) + else err () + +let is_utf8 s = + let rec check i = + let (off, _) = next_utf8 s i in + check (i + off) + in + try check 0 with End_of_input -> true | Invalid_argument _ -> false + +(* Escape string if it contains non-utf8 characters *) + +let escaped_non_utf8 s = + let mk_escape x = Printf.sprintf "%%%X" x in + let buff = Buffer.create (String.length s * 3) in + let rec process_trailing_aux i j = + if i = j then i else + match String.unsafe_get s i with + | '\128'..'\191' -> process_trailing_aux (i+1) j + | _ -> i in + let process_trailing i n = + let j = if i+n-1 >= String.length s then i+1 else process_trailing_aux (i+1) (i+n) in + (if j = i+n then + Buffer.add_string buff (String.sub s i n) + else + let v = Array.init (j-i) (fun k -> mk_escape (Char.code s.[i+k])) in + Buffer.add_string buff (String.concat "" (Array.to_list v))); + j in + let rec process i = + if i >= String.length s then Buffer.contents buff else + let c = String.unsafe_get s i in + match c with + | '\000'..'\127' -> Buffer.add_char buff c; process (i+1) + | '\128'..'\191' | '\248'..'\255' -> Buffer.add_string buff (mk_escape (Char.code c)); process (i+1) + | '\192'..'\223' -> process (process_trailing i 2) + | '\224'..'\239' -> process (process_trailing i 3) + | '\240'..'\247' -> process (process_trailing i 4) + in + process 0 + +let escaped_if_non_utf8 s = + if is_utf8 s then s else escaped_non_utf8 s + +(* Check the well-formedness of an identifier *) + +let is_valid_ident_initial = function + | Letter | IdentSep -> true + | IdentPart | Symbol | Unknown -> false + +let initial_refutation j n s = + if is_valid_ident_initial (classify n) then None + else + let c = String.sub s 0 j in + Some (false, + "Invalid character '"^c^"' at beginning of identifier \""^s^"\".") + +let is_valid_ident_trailing = function + | Letter | IdentSep | IdentPart -> true + | Symbol | Unknown -> false + +let trailing_refutation i j n s = + if is_valid_ident_trailing (classify n) then None + else + let c = String.sub s i j in + Some (false, + "Invalid character '"^c^"' in identifier \""^s^"\".") + +let is_unknown = function + | Unknown -> true + | Letter | IdentSep | IdentPart | Symbol -> false + +let is_ident_part = function + | IdentPart -> true + | Letter | IdentSep | Symbol | Unknown -> false + +let is_ident_sep = function + | IdentSep -> true + | Letter | IdentPart | Symbol | Unknown -> false + +let ident_refutation s = + if s = ".." then None else try + let j, n = next_utf8 s 0 in + match initial_refutation j n s with + |None -> + begin try + let rec aux i = + let j, n = next_utf8 s i in + match trailing_refutation i j n s with + |None -> aux (i + j) + |x -> x + in aux j + with End_of_input -> None + end + |x -> x + with + | End_of_input -> Some (true,"The empty string is not an identifier.") + | Invalid_argument _ -> Some (true,escaped_non_utf8 s^": invalid utf8 sequence.") + +let lowercase_unicode = + let tree = Segmenttree.make Unicodetable.to_lower in + fun unicode -> + try + match Segmenttree.lookup unicode tree with + | `Abs c -> c + | `Delta d -> unicode + d + with Not_found -> unicode + +let lowercase_first_char s = + assert (s <> ""); + let j, n = next_utf8 s 0 in + utf8_of_unicode (lowercase_unicode n) + +let split_at_first_letter s = + let n, v = next_utf8 s 0 in + if ((* optim *) n = 1 && s.[0] != '_') || not (is_ident_sep (classify v)) then None + else begin + let n = ref n in + let p = ref 0 in + while !n < String.length s && + let n', v = next_utf8 s !n in + p := n'; + (* Test if not letter *) + ((* optim *) n' = 1 && (s.[!n] = '_' || s.[!n] = '\'')) + || let st = classify v in + is_ident_sep st || is_ident_part st + do n := !n + !p + done; + let s1 = String.sub s 0 !n in + let s2 = String.sub s !n (String.length s - !n) in + Some (s1,s2) + end + +(** For extraction, we need to encode unicode character into ascii ones *) + +let is_basic_ascii s = + let ok = ref true in + String.iter (fun c -> if Char.code c >= 128 then ok := false) s; + !ok + +let ascii_of_ident s = + let len = String.length s in + let has_UU i = + i+2 < len && s.[i]='_' && s.[i+1]='U' && s.[i+2]='U' + in + let i = ref 0 in + while !i < len && Char.code s.[!i] < 128 && not (has_UU !i) do + incr i + done; + if !i = len then s else + let out = Buffer.create (2*len) in + Buffer.add_substring out s 0 !i; + while !i < len do + let j, n = next_utf8 s !i in + if n >= 128 then + (Printf.bprintf out "_UU%04x_" n; i := !i + j) + else if has_UU !i then + (Buffer.add_string out "_UUU"; i := !i + 3) + else + (Buffer.add_char out s.[!i]; incr i) + done; + Buffer.contents out + +(* Compute length of an UTF-8 encoded string + Rem 1 : utf8_length <= String.length (equal if pure ascii) + Rem 2 : if used for an iso8859_1 encoded string, the result is + wrong in very rare cases. Such a wrong case corresponds to any + sequence of a character in range 192..253 immediately followed by a + character in range 128..191 (typical case in french is "déçu" which + is counted 3 instead of 4); then no real harm to use always + utf8_length even if using an iso8859_1 encoding *) + +(** FIXME: duplicate code with Pp *) + +let utf8_length s = + let len = String.length s + and cnt = ref 0 + and nc = ref 0 + and p = ref 0 in + while !p < len do + begin + match s.[!p] with + | '\000'..'\127' -> nc := 0 (* ascii char *) + | '\128'..'\191' -> nc := 0 (* cannot start with a continuation byte *) + | '\192'..'\223' -> nc := 1 (* expect 1 continuation byte *) + | '\224'..'\239' -> nc := 2 (* expect 2 continuation bytes *) + | '\240'..'\247' -> nc := 3 (* expect 3 continuation bytes *) + | '\248'..'\255' -> nc := 0 (* invalid byte *) + end ; + incr p ; + while !p < len && !nc > 0 do + match s.[!p] with + | '\128'..'\191' (* next continuation byte *) -> incr p ; decr nc + | _ (* not a continuation byte *) -> nc := 0 + done ; + incr cnt + done ; + !cnt + +(* Variant of String.sub for UTF8 character positions *) +let utf8_sub s start_u len_u = + let len_b = String.length s + and end_u = start_u + len_u + and cnt = ref 0 + and nc = ref 0 + and p = ref 0 in + let start_b = ref len_b in + while !p < len_b && !cnt < end_u do + if !cnt <= start_u then start_b := !p ; + begin + match s.[!p] with + | '\000'..'\127' -> nc := 0 (* ascii char *) + | '\128'..'\191' -> nc := 0 (* cannot start with a continuation byte *) + | '\192'..'\223' -> nc := 1 (* expect 1 continuation byte *) + | '\224'..'\239' -> nc := 2 (* expect 2 continuation bytes *) + | '\240'..'\247' -> nc := 3 (* expect 3 continuation bytes *) + | '\248'..'\255' -> nc := 0 (* invalid byte *) + end ; + incr p ; + while !p < len_b && !nc > 0 do + match s.[!p] with + | '\128'..'\191' (* next continuation byte *) -> incr p ; decr nc + | _ (* not a continuation byte *) -> nc := 0 + done ; + incr cnt + done ; + let end_b = !p in + String.sub s !start_b (end_b - !start_b) -- cgit v1.2.3