diff options
author | letouzey <letouzey@85f007b7-540e-0410-9357-904b9bb8a0f7> | 2012-09-18 15:06:42 +0000 |
---|---|---|
committer | letouzey <letouzey@85f007b7-540e-0410-9357-904b9bb8a0f7> | 2012-09-18 15:06:42 +0000 |
commit | bf08866eabad4408de975bae92f3b3c1f718322c (patch) | |
tree | faab2a63879ce58c26ed5439a21af47a06226051 /lib | |
parent | 4422e16f529359bb96c7eee214b2b6648958ef48 (diff) |
More cleanup of Util: utf8 aspects moved to a new file unicode.ml
git-svn-id: svn+ssh://scm.gforge.inria.fr/svn/coq/trunk@15818 85f007b7-540e-0410-9357-904b9bb8a0f7
Diffstat (limited to 'lib')
-rw-r--r-- | lib/clib.mllib | 2 | ||||
-rw-r--r-- | lib/lib.mllib | 3 | ||||
-rw-r--r-- | lib/system.ml | 2 | ||||
-rw-r--r-- | lib/unicode.ml | 226 | ||||
-rw-r--r-- | lib/unicode.mli | 27 | ||||
-rw-r--r-- | lib/util.ml | 217 | ||||
-rw-r--r-- | lib/util.mli | 10 |
7 files changed, 257 insertions, 230 deletions
diff --git a/lib/clib.mllib b/lib/clib.mllib index 0dd24d74c..ca0e97bb0 100644 --- a/lib/clib.mllib +++ b/lib/clib.mllib @@ -1,8 +1,6 @@ Pp_control Pp Coq_config -Segmenttree -Unicodetable Deque CObj CList diff --git a/lib/lib.mllib b/lib/lib.mllib index 618ff80b6..21eb7a4a5 100644 --- a/lib/lib.mllib +++ b/lib/lib.mllib @@ -6,6 +6,9 @@ Errors Bigint Hashcons Dyn +Segmenttree +Unicodetable +Unicode System Gmap Fset diff --git a/lib/system.ml b/lib/system.ml index e721b1f65..cf650d6ab 100644 --- a/lib/system.ml +++ b/lib/system.ml @@ -24,7 +24,7 @@ let exclude_search_in_dirname f = skipped_dirnames := f :: !skipped_dirnames let ok_dirname f = f <> "" && f.[0] <> '.' && not (List.mem f !skipped_dirnames) && - match ident_refutation f with |None -> true |_ -> false + (Unicode.ident_refutation f = None) let all_subdirs ~unix_path:root = let l = ref [] in diff --git a/lib/unicode.ml b/lib/unicode.ml new file mode 100644 index 000000000..75b3b5899 --- /dev/null +++ b/lib/unicode.ml @@ -0,0 +1,226 @@ +(***********************************************************************) +(* v * The Coq Proof Assistant / The Coq Development Team *) +(* <O___,, * INRIA-Rocquencourt & LRI-CNRS-Orsay *) +(* \VV/ *************************************************************) +(* // * This file is distributed under the terms of the *) +(* * GNU Lesser General Public License Version 2.1 *) +(***********************************************************************) + +(** Unicode utilities *) + +type status = Letter | IdentPart | Symbol + +exception Unsupported + +(* The following table stores classes of Unicode characters that + are used by the lexer. There are 3 different classes so 2 bits are + allocated for each character. We only use 16 bits over the 31 bits + to simplify the masking process. (This choice seems to be a good + trade-off between speed and space after some benchmarks.) *) + +(* A 256ko table, initially filled with zeros. *) +let table = Array.create (1 lsl 17) 0 + +(* Associate a 2-bit pattern to each status at position [i]. + Only the 3 lowest bits of [i] are taken into account to + define the position of the pattern in the word. + Notice that pattern "00" means "undefined". *) +let mask i = function + | Letter -> 1 lsl ((i land 7) lsl 1) (* 01 *) + | IdentPart -> 2 lsl ((i land 7) lsl 1) (* 10 *) + | Symbol -> 3 lsl ((i land 7) lsl 1) (* 11 *) + +(* Helper to reset 2 bits in a word. *) +let reset_mask i = + lnot (3 lsl ((i land 7) lsl 1)) + +(* Initialize the lookup table from a list of segments, assigning + a status to every character of each segment. The order of these + assignments is relevant: it is possible to assign status [s] to + a segment [(c1, c2)] and later assign [s'] to [c] even if [c] is + between [c1] and [c2]. *) +let mk_lookup_table_from_unicode_tables_for status tables = + List.iter + (List.iter + (fun (c1, c2) -> + for i = c1 to c2 do + table.(i lsr 3) <- + (table.(i lsr 3) land (reset_mask i)) lor (mask i status) + done)) + tables + +(* Look up into the table and interpret the found pattern. *) +let lookup x = + let v = (table.(x lsr 3) lsr ((x land 7) lsl 1)) land 3 in + if v = 1 then Letter + else if v = 2 then IdentPart + else if v = 3 then Symbol + else raise Unsupported + +(* [classify] discriminates between 3 different kinds of + symbols based on the standard unicode classification (extracted from + Camomile). *) +let classify = + let single c = [ (c, c) ] in + (* General tables. *) + mk_lookup_table_from_unicode_tables_for Symbol + [ + Unicodetable.sm; (* Symbol, maths. *) + Unicodetable.sc; (* Symbol, currency. *) + Unicodetable.so; (* Symbol, modifier. *) + Unicodetable.pd; (* Punctation, dash. *) + Unicodetable.pc; (* Punctation, connector. *) + Unicodetable.pe; (* Punctation, open. *) + Unicodetable.ps; (* Punctation, close. *) + Unicodetable.pi; (* Punctation, initial quote. *) + Unicodetable.pf; (* Punctation, final quote. *) + Unicodetable.po; (* Punctation, other. *) + ]; + mk_lookup_table_from_unicode_tables_for Letter + [ + Unicodetable.lu; (* Letter, uppercase. *) + Unicodetable.ll; (* Letter, lowercase. *) + Unicodetable.lt; (* Letter, titlecase. *) + Unicodetable.lo; (* Letter, others. *) + ]; + mk_lookup_table_from_unicode_tables_for IdentPart + [ + Unicodetable.nd; (* Number, decimal digits. *) + Unicodetable.nl; (* Number, letter. *) + Unicodetable.no; (* Number, other. *) + ]; + (* Exceptions (from a previous version of this function). *) + mk_lookup_table_from_unicode_tables_for Symbol + [ + single 0x000B2; (* Squared. *) + single 0x0002E; (* Dot. *) + ]; + mk_lookup_table_from_unicode_tables_for Letter + [ + single 0x005F; (* Underscore. *) + single 0x00A0; (* Non breaking space. *) + ]; + mk_lookup_table_from_unicode_tables_for IdentPart + [ + single 0x0027; (* Special space. *) + ]; + (* Lookup *) + lookup + +exception End_of_input + +let utf8_of_unicode n = + if n < 128 then + String.make 1 (Char.chr n) + else if n < 2048 then + let s = String.make 2 (Char.chr (128 + n mod 64)) in + begin + s.[0] <- Char.chr (192 + n / 64); + s + end + else if n < 65536 then + let s = String.make 3 (Char.chr (128 + n mod 64)) in + begin + s.[1] <- Char.chr (128 + (n / 64) mod 64); + s.[0] <- Char.chr (224 + n / 4096); + s + end + else + let s = String.make 4 (Char.chr (128 + n mod 64)) in + begin + s.[2] <- Char.chr (128 + (n / 64) mod 64); + s.[1] <- Char.chr (128 + (n / 4096) mod 64); + s.[0] <- Char.chr (240 + n / 262144); + s + end + +let next_utf8 s i = + let err () = invalid_arg "utf8" in + let l = String.length s - i in + if l = 0 then raise End_of_input + else let a = Char.code s.[i] in if a <= 0x7F then + 1, a + else if a land 0x40 = 0 or l = 1 then err () + else let b = Char.code s.[i+1] in if b land 0xC0 <> 0x80 then err () + else if a land 0x20 = 0 then + 2, (a land 0x1F) lsl 6 + (b land 0x3F) + else if l = 2 then err () + else let c = Char.code s.[i+2] in if c land 0xC0 <> 0x80 then err () + else if a land 0x10 = 0 then + 3, (a land 0x0F) lsl 12 + (b land 0x3F) lsl 6 + (c land 0x3F) + else if l = 3 then err () + else let d = Char.code s.[i+3] in if d land 0xC0 <> 0x80 then err () + else if a land 0x08 = 0 then + 4, (a land 0x07) lsl 18 + (b land 0x3F) lsl 12 + + (c land 0x3F) lsl 6 + (d land 0x3F) + else err () + +(* Check the well-formedness of an identifier *) + +let initial_refutation j n s = + match classify n with + | Letter -> None + | _ -> + let c = String.sub s 0 j in + Some (false,"Invalid character '"^c^"' at beginning of identifier \""^s^"\".") + +let trailing_refutation i j n s = + match classify n with + | Letter | IdentPart -> None + | _ -> + let c = String.sub s i j in + Some (false,"Invalid character '"^c^"' in identifier \""^s^"\".") + +let ident_refutation s = + if s = ".." then None else try + let j, n = next_utf8 s 0 in + match initial_refutation j n s with + |None -> + begin try + let rec aux i = + let j, n = next_utf8 s i in + match trailing_refutation i j n s with + |None -> aux (i + j) + |x -> x + in aux j + with End_of_input -> None + end + |x -> x + with + | End_of_input -> Some (true,"The empty string is not an identifier.") + | Unsupported -> Some (true,s^": unsupported character in utf8 sequence.") + | Invalid_argument _ -> Some (true,s^": invalid utf8 sequence.") + +let lowercase_unicode = + let tree = Segmenttree.make Unicodetable.to_lower in + fun unicode -> + try + match Segmenttree.lookup unicode tree with + | `Abs c -> c + | `Delta d -> unicode + d + with Not_found -> unicode + +let lowercase_first_char s = + assert (s <> ""); + let j, n = next_utf8 s 0 in + utf8_of_unicode (lowercase_unicode n) + +(** For extraction, we need to encode unicode character into ascii ones *) + +let ascii_of_ident s = + let check_ascii s = + let ok = ref true in + String.iter (fun c -> if Char.code c >= 128 then ok := false) s; + !ok + in + if check_ascii s then s else + let i = ref 0 and out = ref "" in + begin try while true do + let j, n = next_utf8 s !i in + out := + if n >= 128 + then Printf.sprintf "%s__U%04x_" !out n + else Printf.sprintf "%s%c" !out s.[!i]; + i := !i + j + done with End_of_input -> () end; + !out diff --git a/lib/unicode.mli b/lib/unicode.mli new file mode 100644 index 000000000..763123306 --- /dev/null +++ b/lib/unicode.mli @@ -0,0 +1,27 @@ +(************************************************************************) +(* v * The Coq Proof Assistant / The Coq Development Team *) +(* <O___,, * INRIA - CNRS - LIX - LRI - PPS - Copyright 1999-2012 *) +(* \VV/ **************************************************************) +(* // * This file is distributed under the terms of the *) +(* * GNU Lesser General Public License Version 2.1 *) +(************************************************************************) + +(** Unicode utilities *) + +type status = Letter | IdentPart | Symbol + +exception Unsupported + +(** Classify a unicode char into 3 classes, or raise [Unsupported] *) +val classify : int -> status + +(** Check whether a given string be used as a legal identifier. + - [None] means yes + - [Some (b,s)] means no, with explanation [s] and severity [b] *) +val ident_refutation : string -> (bool * string) option + +(** First char of a string, converted to lowercase *) +val lowercase_first_char : string -> string + +(** For extraction, turn a unicode string into an ascii-only one *) +val ascii_of_ident : string -> string diff --git a/lib/util.ml b/lib/util.ml index 6c1fc39b5..624075956 100644 --- a/lib/util.ml +++ b/lib/util.ml @@ -146,223 +146,6 @@ module Stringset = Set.Make(struct type t = string let compare (x:t) (y:t) = com module Stringmap = Map.Make(struct type t = string let compare (x:t) (y:t) = compare x y end) -type utf8_status = UnicodeLetter | UnicodeIdentPart | UnicodeSymbol - -exception UnsupportedUtf8 - -(* The following table stores classes of Unicode characters that - are used by the lexer. There are 3 different classes so 2 bits are - allocated for each character. We only use 16 bits over the 31 bits - to simplify the masking process. (This choice seems to be a good - trade-off between speed and space after some benchmarks.) *) - -(* A 256ko table, initially filled with zeros. *) -let table = Array.create (1 lsl 17) 0 - -(* Associate a 2-bit pattern to each status at position [i]. - Only the 3 lowest bits of [i] are taken into account to - define the position of the pattern in the word. - Notice that pattern "00" means "undefined". *) -let mask i = function - | UnicodeLetter -> 1 lsl ((i land 7) lsl 1) (* 01 *) - | UnicodeIdentPart -> 2 lsl ((i land 7) lsl 1) (* 10 *) - | UnicodeSymbol -> 3 lsl ((i land 7) lsl 1) (* 11 *) - -(* Helper to reset 2 bits in a word. *) -let reset_mask i = - lnot (3 lsl ((i land 7) lsl 1)) - -(* Initialize the lookup table from a list of segments, assigning - a status to every character of each segment. The order of these - assignments is relevant: it is possible to assign status [s] to - a segment [(c1, c2)] and later assign [s'] to [c] even if [c] is - between [c1] and [c2]. *) -let mk_lookup_table_from_unicode_tables_for status tables = - List.iter - (List.iter - (fun (c1, c2) -> - for i = c1 to c2 do - table.(i lsr 3) <- - (table.(i lsr 3) land (reset_mask i)) lor (mask i status) - done)) - tables - -(* Look up into the table and interpret the found pattern. *) -let lookup x = - let v = (table.(x lsr 3) lsr ((x land 7) lsl 1)) land 3 in - if v = 1 then UnicodeLetter - else if v = 2 then UnicodeIdentPart - else if v = 3 then UnicodeSymbol - else raise UnsupportedUtf8 - -(* [classify_unicode] discriminates between 3 different kinds of - symbols based on the standard unicode classification (extracted from - Camomile). *) -let classify_unicode = - let single c = [ (c, c) ] in - (* General tables. *) - mk_lookup_table_from_unicode_tables_for UnicodeSymbol - [ - Unicodetable.sm; (* Symbol, maths. *) - Unicodetable.sc; (* Symbol, currency. *) - Unicodetable.so; (* Symbol, modifier. *) - Unicodetable.pd; (* Punctation, dash. *) - Unicodetable.pc; (* Punctation, connector. *) - Unicodetable.pe; (* Punctation, open. *) - Unicodetable.ps; (* Punctation, close. *) - Unicodetable.pi; (* Punctation, initial quote. *) - Unicodetable.pf; (* Punctation, final quote. *) - Unicodetable.po; (* Punctation, other. *) - ]; - mk_lookup_table_from_unicode_tables_for UnicodeLetter - [ - Unicodetable.lu; (* Letter, uppercase. *) - Unicodetable.ll; (* Letter, lowercase. *) - Unicodetable.lt; (* Letter, titlecase. *) - Unicodetable.lo; (* Letter, others. *) - ]; - mk_lookup_table_from_unicode_tables_for UnicodeIdentPart - [ - Unicodetable.nd; (* Number, decimal digits. *) - Unicodetable.nl; (* Number, letter. *) - Unicodetable.no; (* Number, other. *) - ]; - (* Exceptions (from a previous version of this function). *) - mk_lookup_table_from_unicode_tables_for UnicodeSymbol - [ - single 0x000B2; (* Squared. *) - single 0x0002E; (* Dot. *) - ]; - mk_lookup_table_from_unicode_tables_for UnicodeLetter - [ - single 0x005F; (* Underscore. *) - single 0x00A0; (* Non breaking space. *) - ]; - mk_lookup_table_from_unicode_tables_for UnicodeIdentPart - [ - single 0x0027; (* Special space. *) - ]; - (* Lookup *) - lookup - -exception End_of_input - -let utf8_of_unicode n = - if n < 128 then - String.make 1 (Char.chr n) - else if n < 2048 then - let s = String.make 2 (Char.chr (128 + n mod 64)) in - begin - s.[0] <- Char.chr (192 + n / 64); - s - end - else if n < 65536 then - let s = String.make 3 (Char.chr (128 + n mod 64)) in - begin - s.[1] <- Char.chr (128 + (n / 64) mod 64); - s.[0] <- Char.chr (224 + n / 4096); - s - end - else - let s = String.make 4 (Char.chr (128 + n mod 64)) in - begin - s.[2] <- Char.chr (128 + (n / 64) mod 64); - s.[1] <- Char.chr (128 + (n / 4096) mod 64); - s.[0] <- Char.chr (240 + n / 262144); - s - end - -let next_utf8 s i = - let err () = invalid_arg "utf8" in - let l = String.length s - i in - if l = 0 then raise End_of_input - else let a = Char.code s.[i] in if a <= 0x7F then - 1, a - else if a land 0x40 = 0 or l = 1 then err () - else let b = Char.code s.[i+1] in if b land 0xC0 <> 0x80 then err () - else if a land 0x20 = 0 then - 2, (a land 0x1F) lsl 6 + (b land 0x3F) - else if l = 2 then err () - else let c = Char.code s.[i+2] in if c land 0xC0 <> 0x80 then err () - else if a land 0x10 = 0 then - 3, (a land 0x0F) lsl 12 + (b land 0x3F) lsl 6 + (c land 0x3F) - else if l = 3 then err () - else let d = Char.code s.[i+3] in if d land 0xC0 <> 0x80 then err () - else if a land 0x08 = 0 then - 4, (a land 0x07) lsl 18 + (b land 0x3F) lsl 12 + - (c land 0x3F) lsl 6 + (d land 0x3F) - else err () - -(* Check the well-formedness of an identifier *) - -let initial_refutation j n s = - match classify_unicode n with - | UnicodeLetter -> None - | _ -> - let c = String.sub s 0 j in - Some (false,"Invalid character '"^c^"' at beginning of identifier \""^s^"\".") - -let trailing_refutation i j n s = - match classify_unicode n with - | UnicodeLetter | UnicodeIdentPart -> None - | _ -> - let c = String.sub s i j in - Some (false,"Invalid character '"^c^"' in identifier \""^s^"\".") - -let ident_refutation s = - if s = ".." then None else try - let j, n = next_utf8 s 0 in - match initial_refutation j n s with - |None -> - begin try - let rec aux i = - let j, n = next_utf8 s i in - match trailing_refutation i j n s with - |None -> aux (i + j) - |x -> x - in aux j - with End_of_input -> None - end - |x -> x - with - | End_of_input -> Some (true,"The empty string is not an identifier.") - | UnsupportedUtf8 -> Some (true,s^": unsupported character in utf8 sequence.") - | Invalid_argument _ -> Some (true,s^": invalid utf8 sequence.") - -let lowercase_unicode = - let tree = Segmenttree.make Unicodetable.to_lower in - fun unicode -> - try - match Segmenttree.lookup unicode tree with - | `Abs c -> c - | `Delta d -> unicode + d - with Not_found -> unicode - -let lowercase_first_char_utf8 s = - assert (s <> ""); - let j, n = next_utf8 s 0 in - utf8_of_unicode (lowercase_unicode n) - -(** For extraction, we need to encode unicode character into ascii ones *) - -let ascii_of_ident s = - let check_ascii s = - let ok = ref true in - String.iter (fun c -> if Char.code c >= 128 then ok := false) s; - !ok - in - if check_ascii s then s else - let i = ref 0 and out = ref "" in - begin try while true do - let j, n = next_utf8 s !i in - out := - if n >= 128 - then Printf.sprintf "%s__U%04x_" !out n - else Printf.sprintf "%s%c" !out s.[!i]; - i := !i + j - done with End_of_input -> () end; - !out - (* Lists *) module List : CList.ExtS = CList diff --git a/lib/util.mli b/lib/util.mli index 1333d1854..694e79ce4 100644 --- a/lib/util.mli +++ b/lib/util.mli @@ -33,7 +33,6 @@ val is_letter : char -> bool val is_digit : char -> bool val is_ident_tail : char -> bool val is_blank : char -> bool -val next_utf8 : string -> int -> int * int (** {6 Strings. } *) @@ -55,15 +54,6 @@ val parse_loadpath : string -> string list module Stringset : Set.S with type elt = string module Stringmap : Map.S with type key = string -type utf8_status = UnicodeLetter | UnicodeIdentPart | UnicodeSymbol - -exception UnsupportedUtf8 - -val ident_refutation : string -> (bool * string) option -val classify_unicode : int -> utf8_status -val lowercase_first_char_utf8 : string -> string -val ascii_of_ident : string -> string - (** {6 Lists. } *) module List : CList.ExtS |