aboutsummaryrefslogtreecommitdiffhomepage
path: root/clib/unicode.ml
diff options
context:
space:
mode:
authorGravatar Emilio Jesus Gallego Arias <e+git@x80.org>2017-12-15 18:51:45 +0100
committerGravatar Emilio Jesus Gallego Arias <e+git@x80.org>2017-12-23 19:20:30 +0100
commit5ffa147bd2fe548df3ac9053fe497d0871a5f6df (patch)
treecc62882184c34e33e2995a5a4ff4ebfcbd0defe0 /clib/unicode.ml
parentdea75d74c222c25f6aa6c38506ac7a51b339e9c6 (diff)
[lib] Split auxiliary libraries into Coq-specific and general.
Up to this point the `lib` directory contained two different library archives, `clib.cma` and `lib.cma`, which a rough splitting between Coq-specific libraries and general-purpose ones. We know split the directory in two, as to make the distinction clear: - `clib`: contains libraries that are not Coq specific and implement common data structures and programming patterns. These libraries could be eventually replace with external dependencies and the rest of the code base wouldn't notice much. - `lib`: contains Coq-specific common libraries in widespread use along the codebase, but that are not considered part of other components. Examples are printing, error handling, or flags. In some cases we have coupling due to utility files depending on Coq specific flags, however this commit doesn't modify any files, but only moves them around, further cleanup is welcome, as indeed a few files in `lib` should likely be placed in `clib`. Also note that `Deque` is not used ATM.
Diffstat (limited to 'clib/unicode.ml')
-rw-r--r--clib/unicode.ml385
1 files changed, 385 insertions, 0 deletions
diff --git a/clib/unicode.ml b/clib/unicode.ml
new file mode 100644
index 000000000..f193c4e0f
--- /dev/null
+++ b/clib/unicode.ml
@@ -0,0 +1,385 @@
+(***********************************************************************)
+(* v * The Coq Proof Assistant / The Coq Development Team *)
+(* <O___,, * INRIA-Rocquencourt & LRI-CNRS-Orsay *)
+(* \VV/ *************************************************************)
+(* // * This file is distributed under the terms of the *)
+(* * GNU Lesser General Public License Version 2.1 *)
+(***********************************************************************)
+
+(** Unicode utilities *)
+
+type status = Letter | IdentPart | Symbol | IdentSep | Unknown
+
+(* The following table stores classes of Unicode characters that
+ are used by the lexer. There are 5 different classes so 3 bits
+ are allocated for each character. We encode the masks of 8
+ characters per word, thus using 24 bits over the 31 available
+ bits. (This choice seems to be a good trade-off between speed
+ and space after some benchmarks.) *)
+
+(* A 256 KiB table, initially filled with zeros. *)
+let table = Array.make (1 lsl 17) 0
+
+(* Associate a 2-bit pattern to each status at position [i].
+ Only the 3 lowest bits of [i] are taken into account to
+ define the position of the pattern in the word.
+ Notice that pattern "00" means "undefined". *)
+let mask i = function
+ | Letter -> 1 lsl ((i land 7) * 3) (* 001 *)
+ | IdentPart -> 2 lsl ((i land 7) * 3) (* 010 *)
+ | Symbol -> 3 lsl ((i land 7) * 3) (* 011 *)
+ | IdentSep -> 4 lsl ((i land 7) * 3) (* 100 *)
+ | Unknown -> 0 lsl ((i land 7) * 3) (* 000 *)
+
+(* Helper to reset 3 bits in a word. *)
+let reset_mask i =
+ lnot (7 lsl ((i land 7) * 3))
+
+(* Initialize the lookup table from a list of segments, assigning
+ a status to every character of each segment. The order of these
+ assignments is relevant: it is possible to assign status [s] to
+ a segment [(c1, c2)] and later assign [s'] to [c] even if [c] is
+ between [c1] and [c2]. *)
+let mk_lookup_table_from_unicode_tables_for status tables =
+ List.iter
+ (List.iter
+ (fun (c1, c2) ->
+ for i = c1 to c2 do
+ table.(i lsr 3) <-
+ (table.(i lsr 3) land (reset_mask i)) lor (mask i status)
+ done))
+ tables
+
+(* Look up into the table and interpret the found pattern. *)
+let lookup x =
+ let v = (table.(x lsr 3) lsr ((x land 7) * 3)) land 7 in
+ if v = 1 then Letter
+ else if v = 2 then IdentPart
+ else if v = 3 then Symbol
+ else if v = 4 then IdentSep
+ else Unknown
+
+(* [classify] discriminates between 5 different kinds of
+ symbols based on the standard unicode classification (extracted from
+ Camomile). *)
+let classify =
+ let single c = [ (c, c) ] in
+ (* General tables. *)
+ mk_lookup_table_from_unicode_tables_for Symbol
+ [
+ Unicodetable.sm; (* Symbol, maths. *)
+ Unicodetable.sc; (* Symbol, currency. *)
+ Unicodetable.so; (* Symbol, modifier. *)
+ Unicodetable.pd; (* Punctuation, dash. *)
+ Unicodetable.pc; (* Punctuation, connector. *)
+ Unicodetable.pe; (* Punctuation, open. *)
+ Unicodetable.ps; (* Punctution, close. *)
+ Unicodetable.pi; (* Punctuation, initial quote. *)
+ Unicodetable.pf; (* Punctuation, final quote. *)
+ Unicodetable.po; (* Punctuation, other. *)
+ ];
+ mk_lookup_table_from_unicode_tables_for Letter
+ [
+ Unicodetable.lu; (* Letter, uppercase. *)
+ Unicodetable.ll; (* Letter, lowercase. *)
+ Unicodetable.lt; (* Letter, titlecase. *)
+ Unicodetable.lo; (* Letter, others. *)
+ ];
+ mk_lookup_table_from_unicode_tables_for IdentPart
+ [
+ Unicodetable.nd; (* Number, decimal digits. *)
+ Unicodetable.nl; (* Number, letter. *)
+ Unicodetable.no; (* Number, other. *)
+ ];
+
+ (* Workaround. Some characters seems to be missing in
+ Camomile's category tables. We add them manually. *)
+ mk_lookup_table_from_unicode_tables_for Letter
+ [
+ [(0x01D00, 0x01D7F)]; (* Phonetic Extensions. *)
+ [(0x01D80, 0x01DBF)]; (* Phonetic Extensions Suppl. *)
+ [(0x01DC0, 0x01DFF)]; (* Combining Diacritical Marks Suppl.*)
+ ];
+
+ (* Exceptions (from a previous version of this function). *)
+ mk_lookup_table_from_unicode_tables_for Symbol
+ [
+ [(0x000B2, 0x000B3)]; (* Superscript 2-3. *)
+ single 0x000B9; (* Superscript 1. *)
+ single 0x02070; (* Superscript 0. *)
+ [(0x02074, 0x02079)]; (* Superscript 4-9. *)
+ single 0x0002E; (* Dot. *)
+ ];
+ mk_lookup_table_from_unicode_tables_for IdentSep
+ [
+ single 0x005F; (* Underscore. *)
+ single 0x00A0; (* Non breaking space. *)
+ ];
+ mk_lookup_table_from_unicode_tables_for IdentPart
+ [
+ single 0x0027; (* Single quote. *)
+ ];
+ (* Lookup *)
+ lookup
+
+exception End_of_input
+
+let utf8_of_unicode n =
+ if n < 128 then
+ String.make 1 (Char.chr n)
+ else
+ let (m,s) = if n < 2048 then (2,192) else if n < 65536 then (3,224) else (4,240) in
+ String.init m (fun i ->
+ let j = (n lsr ((m - 1 - i) * 6)) land 63 in
+ Char.chr (j + if i = 0 then s else 128))
+
+(* If [s] is some UTF-8 encoded string
+ and [i] is a position of some UTF-8 character within [s]
+ then [next_utf8 s i] returns [(j,n)] where:
+ - [j] indicates the position of the next UTF-8 character
+ - [n] represents the UTF-8 character at index [i] *)
+let next_utf8 s i =
+ let err () = invalid_arg "utf8" in
+ let l = String.length s - i in
+ if l = 0 then raise End_of_input
+ else let a = Char.code s.[i] in if a <= 0x7F then
+ 1, a
+ else if a land 0x40 = 0 || l = 1 then err ()
+ else let b = Char.code s.[i+1] in if b land 0xC0 <> 0x80 then err ()
+ else if a land 0x20 = 0 then
+ 2, (a land 0x1F) lsl 6 + (b land 0x3F)
+ else if l = 2 then err ()
+ else let c = Char.code s.[i+2] in if c land 0xC0 <> 0x80 then err ()
+ else if a land 0x10 = 0 then
+ 3, (a land 0x0F) lsl 12 + (b land 0x3F) lsl 6 + (c land 0x3F)
+ else if l = 3 then err ()
+ else let d = Char.code s.[i+3] in if d land 0xC0 <> 0x80 then err ()
+ else if a land 0x08 = 0 then
+ 4, (a land 0x07) lsl 18 + (b land 0x3F) lsl 12 +
+ (c land 0x3F) lsl 6 + (d land 0x3F)
+ else err ()
+
+let is_utf8 s =
+ let rec check i =
+ let (off, _) = next_utf8 s i in
+ check (i + off)
+ in
+ try check 0 with End_of_input -> true | Invalid_argument _ -> false
+
+(* Escape string if it contains non-utf8 characters *)
+
+let escaped_non_utf8 s =
+ let mk_escape x = Printf.sprintf "%%%X" x in
+ let buff = Buffer.create (String.length s * 3) in
+ let rec process_trailing_aux i j =
+ if i = j then i else
+ match String.unsafe_get s i with
+ | '\128'..'\191' -> process_trailing_aux (i+1) j
+ | _ -> i in
+ let process_trailing i n =
+ let j = if i+n-1 >= String.length s then i+1 else process_trailing_aux (i+1) (i+n) in
+ (if j = i+n then
+ Buffer.add_string buff (String.sub s i n)
+ else
+ let v = Array.init (j-i) (fun k -> mk_escape (Char.code s.[i+k])) in
+ Buffer.add_string buff (String.concat "" (Array.to_list v)));
+ j in
+ let rec process i =
+ if i >= String.length s then Buffer.contents buff else
+ let c = String.unsafe_get s i in
+ match c with
+ | '\000'..'\127' -> Buffer.add_char buff c; process (i+1)
+ | '\128'..'\191' | '\248'..'\255' -> Buffer.add_string buff (mk_escape (Char.code c)); process (i+1)
+ | '\192'..'\223' -> process (process_trailing i 2)
+ | '\224'..'\239' -> process (process_trailing i 3)
+ | '\240'..'\247' -> process (process_trailing i 4)
+ in
+ process 0
+
+let escaped_if_non_utf8 s =
+ if is_utf8 s then s else escaped_non_utf8 s
+
+(* Check the well-formedness of an identifier *)
+
+let is_valid_ident_initial = function
+ | Letter | IdentSep -> true
+ | IdentPart | Symbol | Unknown -> false
+
+let initial_refutation j n s =
+ if is_valid_ident_initial (classify n) then None
+ else
+ let c = String.sub s 0 j in
+ Some (false,
+ "Invalid character '"^c^"' at beginning of identifier \""^s^"\".")
+
+let is_valid_ident_trailing = function
+ | Letter | IdentSep | IdentPart -> true
+ | Symbol | Unknown -> false
+
+let trailing_refutation i j n s =
+ if is_valid_ident_trailing (classify n) then None
+ else
+ let c = String.sub s i j in
+ Some (false,
+ "Invalid character '"^c^"' in identifier \""^s^"\".")
+
+let is_unknown = function
+ | Unknown -> true
+ | Letter | IdentSep | IdentPart | Symbol -> false
+
+let is_ident_part = function
+ | IdentPart -> true
+ | Letter | IdentSep | Symbol | Unknown -> false
+
+let is_ident_sep = function
+ | IdentSep -> true
+ | Letter | IdentPart | Symbol | Unknown -> false
+
+let ident_refutation s =
+ if s = ".." then None else try
+ let j, n = next_utf8 s 0 in
+ match initial_refutation j n s with
+ |None ->
+ begin try
+ let rec aux i =
+ let j, n = next_utf8 s i in
+ match trailing_refutation i j n s with
+ |None -> aux (i + j)
+ |x -> x
+ in aux j
+ with End_of_input -> None
+ end
+ |x -> x
+ with
+ | End_of_input -> Some (true,"The empty string is not an identifier.")
+ | Invalid_argument _ -> Some (true,escaped_non_utf8 s^": invalid utf8 sequence.")
+
+let lowercase_unicode =
+ let tree = Segmenttree.make Unicodetable.to_lower in
+ fun unicode ->
+ try
+ match Segmenttree.lookup unicode tree with
+ | `Abs c -> c
+ | `Delta d -> unicode + d
+ with Not_found -> unicode
+
+let lowercase_first_char s =
+ assert (s <> "");
+ let j, n = next_utf8 s 0 in
+ utf8_of_unicode (lowercase_unicode n)
+
+let split_at_first_letter s =
+ let n, v = next_utf8 s 0 in
+ if ((* optim *) n = 1 && s.[0] != '_') || not (is_ident_sep (classify v)) then None
+ else begin
+ let n = ref n in
+ let p = ref 0 in
+ while !n < String.length s &&
+ let n', v = next_utf8 s !n in
+ p := n';
+ (* Test if not letter *)
+ ((* optim *) n' = 1 && (s.[!n] = '_' || s.[!n] = '\''))
+ || let st = classify v in
+ is_ident_sep st || is_ident_part st
+ do n := !n + !p
+ done;
+ let s1 = String.sub s 0 !n in
+ let s2 = String.sub s !n (String.length s - !n) in
+ Some (s1,s2)
+ end
+
+(** For extraction, we need to encode unicode character into ascii ones *)
+
+let is_basic_ascii s =
+ let ok = ref true in
+ String.iter (fun c -> if Char.code c >= 128 then ok := false) s;
+ !ok
+
+let ascii_of_ident s =
+ let len = String.length s in
+ let has_UU i =
+ i+2 < len && s.[i]='_' && s.[i+1]='U' && s.[i+2]='U'
+ in
+ let i = ref 0 in
+ while !i < len && Char.code s.[!i] < 128 && not (has_UU !i) do
+ incr i
+ done;
+ if !i = len then s else
+ let out = Buffer.create (2*len) in
+ Buffer.add_substring out s 0 !i;
+ while !i < len do
+ let j, n = next_utf8 s !i in
+ if n >= 128 then
+ (Printf.bprintf out "_UU%04x_" n; i := !i + j)
+ else if has_UU !i then
+ (Buffer.add_string out "_UUU"; i := !i + 3)
+ else
+ (Buffer.add_char out s.[!i]; incr i)
+ done;
+ Buffer.contents out
+
+(* Compute length of an UTF-8 encoded string
+ Rem 1 : utf8_length <= String.length (equal if pure ascii)
+ Rem 2 : if used for an iso8859_1 encoded string, the result is
+ wrong in very rare cases. Such a wrong case corresponds to any
+ sequence of a character in range 192..253 immediately followed by a
+ character in range 128..191 (typical case in french is "déçu" which
+ is counted 3 instead of 4); then no real harm to use always
+ utf8_length even if using an iso8859_1 encoding *)
+
+(** FIXME: duplicate code with Pp *)
+
+let utf8_length s =
+ let len = String.length s
+ and cnt = ref 0
+ and nc = ref 0
+ and p = ref 0 in
+ while !p < len do
+ begin
+ match s.[!p] with
+ | '\000'..'\127' -> nc := 0 (* ascii char *)
+ | '\128'..'\191' -> nc := 0 (* cannot start with a continuation byte *)
+ | '\192'..'\223' -> nc := 1 (* expect 1 continuation byte *)
+ | '\224'..'\239' -> nc := 2 (* expect 2 continuation bytes *)
+ | '\240'..'\247' -> nc := 3 (* expect 3 continuation bytes *)
+ | '\248'..'\255' -> nc := 0 (* invalid byte *)
+ end ;
+ incr p ;
+ while !p < len && !nc > 0 do
+ match s.[!p] with
+ | '\128'..'\191' (* next continuation byte *) -> incr p ; decr nc
+ | _ (* not a continuation byte *) -> nc := 0
+ done ;
+ incr cnt
+ done ;
+ !cnt
+
+(* Variant of String.sub for UTF8 character positions *)
+let utf8_sub s start_u len_u =
+ let len_b = String.length s
+ and end_u = start_u + len_u
+ and cnt = ref 0
+ and nc = ref 0
+ and p = ref 0 in
+ let start_b = ref len_b in
+ while !p < len_b && !cnt < end_u do
+ if !cnt <= start_u then start_b := !p ;
+ begin
+ match s.[!p] with
+ | '\000'..'\127' -> nc := 0 (* ascii char *)
+ | '\128'..'\191' -> nc := 0 (* cannot start with a continuation byte *)
+ | '\192'..'\223' -> nc := 1 (* expect 1 continuation byte *)
+ | '\224'..'\239' -> nc := 2 (* expect 2 continuation bytes *)
+ | '\240'..'\247' -> nc := 3 (* expect 3 continuation bytes *)
+ | '\248'..'\255' -> nc := 0 (* invalid byte *)
+ end ;
+ incr p ;
+ while !p < len_b && !nc > 0 do
+ match s.[!p] with
+ | '\128'..'\191' (* next continuation byte *) -> incr p ; decr nc
+ | _ (* not a continuation byte *) -> nc := 0
+ done ;
+ incr cnt
+ done ;
+ let end_b = !p in
+ String.sub s !start_b (end_b - !start_b)