More cleanup of Util: utf8 aspects moved to a new file unicode.ml

git-svn-id: svn+ssh://scm.gforge.inria.fr/svn/coq/trunk@15818 85f007b7-540e-0410-9357-904b9bb8a0f7
author: letouzey <letouzey@85f007b7-540e-0410-9357-904b9bb8a0f7> 2012-09-18 15:06:42 +0000
committer: letouzey <letouzey@85f007b7-540e-0410-9357-904b9bb8a0f7> 2012-09-18 15:06:42 +0000
commit: bf08866eabad4408de975bae92f3b3c1f718322c (patch)
tree: faab2a63879ce58c26ed5439a21af47a06226051 /lib
parent: 4422e16f529359bb96c7eee214b2b6648958ef48 (diff)
7 files changed, 257 insertions, 230 deletions
diff --git a/lib/clib.mllib b/lib/clib.mllib
index 0dd24d74c..ca0e97bb0 100644
--- a/lib/clib.mllib
+++ b/lib/clib.mllib
@@ -1,8 +1,6 @@
 Pp_control
 Pp
 Coq_config
-Segmenttree
-Unicodetable
 Deque
 CObj
 CList
diff --git a/lib/lib.mllib b/lib/lib.mllib
index 618ff80b6..21eb7a4a5 100644
--- a/lib/lib.mllib
+++ b/lib/lib.mllib
@@ -6,6 +6,9 @@ Errors
 Bigint
 Hashcons
 Dyn
+Segmenttree
+Unicodetable
+Unicode
 System
 Gmap
 Fset
diff --git a/lib/system.ml b/lib/system.ml
index e721b1f65..cf650d6ab 100644
--- a/lib/system.ml
+++ b/lib/system.ml
@@ -24,7 +24,7 @@ let exclude_search_in_dirname f = skipped_dirnames := f :: !skipped_dirnames
 
 let ok_dirname f =
   f <> "" && f.[0] <> '.' && not (List.mem f !skipped_dirnames) &&
-  match ident_refutation f with |None -> true |_ -> false
+  (Unicode.ident_refutation f = None)
 
 let all_subdirs ~unix_path:root =
   let l = ref [] in
diff --git a/lib/unicode.ml b/lib/unicode.ml
new file mode 100644
index 000000000..75b3b5899
--- /dev/null
+++ b/lib/unicode.ml
@@ -0,0 +1,226 @@
+(***********************************************************************)
+(*  v      *   The Coq Proof Assistant  /  The Coq Development Team    *)
+(* <O___,, *        INRIA-Rocquencourt  &  LRI-CNRS-Orsay              *)
+(*   \VV/  *************************************************************)
+(*    //   *      This file is distributed under the terms of the      *)
+(*         *       GNU Lesser General Public License Version 2.1       *)
+(***********************************************************************)
+
+(** Unicode utilities *)
+
+type status = Letter | IdentPart | Symbol
+
+exception Unsupported
+
+(* The following table stores classes of Unicode characters that
+   are used by the lexer. There are 3 different classes so 2 bits are
+   allocated for each character. We only use 16 bits over the 31 bits
+   to simplify the masking process. (This choice seems to be a good
+   trade-off between speed and space after some benchmarks.) *)
+
+(* A 256ko table, initially filled with zeros. *)
+let table = Array.create (1 lsl 17) 0
+
+(* Associate a 2-bit pattern to each status at position [i].
+   Only the 3 lowest bits of [i] are taken into account to
+   define the position of the pattern in the word.
+   Notice that pattern "00" means "undefined". *)
+let mask i = function
+  | Letter    -> 1 lsl ((i land 7) lsl 1) (* 01 *)
+  | IdentPart -> 2 lsl ((i land 7) lsl 1) (* 10 *)
+  | Symbol    -> 3 lsl ((i land 7) lsl 1) (* 11 *)
+
+(* Helper to reset 2 bits in a word. *)
+let reset_mask i =
+  lnot (3 lsl ((i land 7) lsl 1))
+
+(* Initialize the lookup table from a list of segments, assigning
+   a status to every character of each segment. The order of these
+   assignments is relevant: it is possible to assign status [s] to
+   a segment [(c1, c2)] and later assign [s'] to [c] even if [c] is
+   between [c1] and [c2]. *)
+let mk_lookup_table_from_unicode_tables_for status tables =
+  List.iter
+    (List.iter
+       (fun (c1, c2) ->
+          for i = c1 to c2 do
+            table.(i lsr 3) <-
+              (table.(i lsr 3) land (reset_mask i)) lor (mask i status)
+          done))
+    tables
+
+(* Look up into the table and interpret the found pattern. *)
+let lookup x =
+  let v = (table.(x lsr 3) lsr ((x land 7) lsl 1)) land 3 in
+    if      v = 1 then Letter
+    else if v = 2 then IdentPart
+    else if v = 3 then Symbol
+    else raise Unsupported
+
+(* [classify] discriminates between 3 different kinds of
+   symbols based on the standard unicode classification (extracted from
+   Camomile). *)
+let classify =
+  let single c = [ (c, c) ] in
+    (* General tables. *)
+    mk_lookup_table_from_unicode_tables_for Symbol
+      [
+        Unicodetable.sm;           (* Symbol, maths.             *)
+        Unicodetable.sc;           (* Symbol, currency.          *)
+        Unicodetable.so;           (* Symbol, modifier.          *)
+        Unicodetable.pd;           (* Punctation, dash.          *)
+        Unicodetable.pc;           (* Punctation, connector.     *)
+        Unicodetable.pe;           (* Punctation, open.          *)
+        Unicodetable.ps;           (* Punctation, close.         *)
+        Unicodetable.pi;           (* Punctation, initial quote. *)
+        Unicodetable.pf;           (* Punctation, final quote.   *)
+        Unicodetable.po;           (* Punctation, other.         *)
+      ];
+    mk_lookup_table_from_unicode_tables_for Letter
+      [
+        Unicodetable.lu;           (* Letter, uppercase.         *)
+        Unicodetable.ll;           (* Letter, lowercase.         *)
+        Unicodetable.lt;           (* Letter, titlecase.         *)
+        Unicodetable.lo;           (* Letter, others.            *)
+      ];
+    mk_lookup_table_from_unicode_tables_for IdentPart
+      [
+        Unicodetable.nd;           (* Number, decimal digits.    *)
+        Unicodetable.nl;           (* Number, letter.            *)
+        Unicodetable.no;           (* Number, other.             *)
+      ];
+    (* Exceptions (from a previous version of this function). *)
+    mk_lookup_table_from_unicode_tables_for Symbol
+      [
+        single 0x000B2;            (* Squared.                   *)
+        single 0x0002E;            (* Dot.                       *)
+      ];
+    mk_lookup_table_from_unicode_tables_for Letter
+      [
+        single 0x005F;             (* Underscore.                *)
+        single 0x00A0;             (* Non breaking space.        *)
+      ];
+    mk_lookup_table_from_unicode_tables_for IdentPart
+      [
+        single 0x0027;             (* Special space.             *)
+      ];
+    (* Lookup *)
+    lookup
+
+exception End_of_input
+
+let utf8_of_unicode n =
+  if n < 128 then
+    String.make 1 (Char.chr n)
+  else if n < 2048 then
+    let s = String.make 2 (Char.chr (128 + n mod 64)) in
+    begin
+      s.[0] <- Char.chr (192 + n / 64);
+      s
+    end
+  else if n < 65536 then
+    let s = String.make 3 (Char.chr (128 + n mod 64)) in
+    begin
+      s.[1] <- Char.chr (128 + (n / 64) mod 64);
+      s.[0] <- Char.chr (224 + n / 4096);
+      s
+    end
+  else
+    let s = String.make 4 (Char.chr (128 + n mod 64)) in
+    begin
+      s.[2] <- Char.chr (128 + (n / 64) mod 64);
+      s.[1] <- Char.chr (128 + (n / 4096) mod 64);
+      s.[0] <- Char.chr (240 + n / 262144);
+      s
+    end
+
+let next_utf8 s i =
+  let err () = invalid_arg "utf8" in
+  let l = String.length s - i in
+  if l = 0 then raise End_of_input
+  else let a = Char.code s.[i] in if a <= 0x7F then
+    1, a
+  else if a land 0x40 = 0 or l = 1 then err ()
+  else let b = Char.code s.[i+1] in if b land 0xC0 <> 0x80 then err ()
+  else if a land 0x20 = 0 then
+    2, (a land 0x1F) lsl 6 + (b land 0x3F)
+  else if l = 2 then err ()
+  else let c = Char.code s.[i+2] in if c land 0xC0 <> 0x80 then err ()
+  else if a land 0x10 = 0 then
+    3, (a land 0x0F) lsl 12 + (b land 0x3F) lsl 6 + (c land 0x3F)
+  else if l = 3 then err ()
+  else let d = Char.code s.[i+3] in if d land 0xC0 <> 0x80 then err ()
+  else if a land 0x08 = 0 then
+    4, (a land 0x07) lsl 18 + (b land 0x3F) lsl 12 +
+       (c land 0x3F) lsl 6 + (d land 0x3F)
+  else err ()
+
+(* Check the well-formedness of an identifier *)
+
+let initial_refutation j n s =
+  match classify n with
+  | Letter -> None
+  | _ ->
+      let c = String.sub s 0 j in
+      Some (false,"Invalid character '"^c^"' at beginning of identifier \""^s^"\".")
+
+let trailing_refutation i j n s =
+  match classify n with
+  | Letter | IdentPart -> None
+  | _ ->
+      let c = String.sub s i j in
+      Some (false,"Invalid character '"^c^"' in identifier \""^s^"\".")
+
+let ident_refutation s =
+  if s = ".." then None else try
+    let j, n = next_utf8 s 0 in
+      match initial_refutation j n s with
+	|None ->
+	   begin try
+	     let rec aux i =
+	       let j, n = next_utf8 s i in
+		 match trailing_refutation i j n s with
+		   |None -> aux (i + j)
+		   |x -> x
+	     in aux j
+	   with End_of_input -> None
+	   end
+	|x -> x
+  with
+  | End_of_input -> Some (true,"The empty string is not an identifier.")
+  | Unsupported -> Some (true,s^": unsupported character in utf8 sequence.")
+  | Invalid_argument _ -> Some (true,s^": invalid utf8 sequence.")
+
+let lowercase_unicode =
+  let tree = Segmenttree.make Unicodetable.to_lower in
+  fun unicode ->
+    try
+      match Segmenttree.lookup unicode tree with
+	| `Abs c -> c
+	| `Delta d -> unicode + d
+    with Not_found -> unicode
+
+let lowercase_first_char s =
+  assert (s <> "");
+  let j, n = next_utf8 s 0 in
+  utf8_of_unicode (lowercase_unicode n)
+
+(** For extraction, we need to encode unicode character into ascii ones *)
+
+let ascii_of_ident s =
+  let check_ascii s =
+    let ok = ref true in
+    String.iter (fun c -> if Char.code c >= 128 then ok := false) s;
+    !ok
+  in
+  if check_ascii s then s else
+    let i = ref 0 and out = ref "" in
+    begin try while true do
+      let j, n = next_utf8 s !i in
+      out :=
+	if n >= 128
+	then Printf.sprintf "%s__U%04x_" !out n
+	else Printf.sprintf "%s%c" !out s.[!i];
+      i := !i + j
+    done with End_of_input -> () end;
+    !out
diff --git a/lib/unicode.mli b/lib/unicode.mli
new file mode 100644
index 000000000..763123306
--- /dev/null
+++ b/lib/unicode.mli
@@ -0,0 +1,27 @@
+(************************************************************************)
+(*  v      *   The Coq Proof Assistant  /  The Coq Development Team     *)
+(* <O___,, *   INRIA - CNRS - LIX - LRI - PPS - Copyright 1999-2012     *)
+(*   \VV/  **************************************************************)
+(*    //   *      This file is distributed under the terms of the       *)
+(*         *       GNU Lesser General Public License Version 2.1        *)
+(************************************************************************)
+
+(** Unicode utilities *)
+
+type status = Letter | IdentPart | Symbol
+
+exception Unsupported
+
+(** Classify a unicode char into 3 classes, or raise [Unsupported] *)
+val classify : int -> status
+
+(** Check whether a given string be used as a legal identifier.
+    - [None] means yes
+    - [Some (b,s)] means no, with explanation [s] and severity [b] *)
+val ident_refutation : string -> (bool * string) option
+
+(** First char of a string, converted to lowercase *)
+val lowercase_first_char : string -> string
+
+(** For extraction, turn a unicode string into an ascii-only one *)
+val ascii_of_ident : string -> string
diff --git a/lib/util.ml b/lib/util.ml
index 6c1fc39b5..624075956 100644
--- a/lib/util.ml
+++ b/lib/util.ml
@@ -146,223 +146,6 @@ module Stringset = Set.Make(struct type t = string let compare (x:t) (y:t) = com
 
 module Stringmap = Map.Make(struct type t = string let compare (x:t) (y:t) = compare x y end)
 
-type utf8_status = UnicodeLetter | UnicodeIdentPart | UnicodeSymbol
-
-exception UnsupportedUtf8
-
-(* The following table stores classes of Unicode characters that
-   are used by the lexer. There are 3 different classes so 2 bits are
-   allocated for each character. We only use 16 bits over the 31 bits
-   to simplify the masking process. (This choice seems to be a good
-   trade-off between speed and space after some benchmarks.) *)
-
-(* A 256ko table, initially filled with zeros. *)
-let table = Array.create (1 lsl 17) 0
-
-(* Associate a 2-bit pattern to each status at position [i]. 
-   Only the 3 lowest bits of [i] are taken into account to 
-   define the position of the pattern in the word. 
-   Notice that pattern "00" means "undefined". *)
-let mask i = function
-  | UnicodeLetter    -> 1 lsl ((i land 7) lsl 1) (* 01 *)
-  | UnicodeIdentPart -> 2 lsl ((i land 7) lsl 1) (* 10 *)
-  | UnicodeSymbol    -> 3 lsl ((i land 7) lsl 1) (* 11 *)
-
-(* Helper to reset 2 bits in a word. *)
-let reset_mask i =
-  lnot (3 lsl ((i land 7) lsl 1))
-
-(* Initialize the lookup table from a list of segments, assigning
-   a status to every character of each segment. The order of these
-   assignments is relevant: it is possible to assign status [s] to
-   a segment [(c1, c2)] and later assign [s'] to [c] even if [c] is
-   between [c1] and [c2]. *)
-let mk_lookup_table_from_unicode_tables_for status tables =
-  List.iter
-    (List.iter
-       (fun (c1, c2) ->
-          for i = c1 to c2 do
-            table.(i lsr 3) <-
-              (table.(i lsr 3) land (reset_mask i)) lor (mask i status)
-          done))
-    tables
-
-(* Look up into the table and interpret the found pattern. *)
-let lookup x =
-  let v = (table.(x lsr 3) lsr ((x land 7) lsl 1)) land 3 in
-    if      v = 1 then UnicodeLetter
-    else if v = 2 then UnicodeIdentPart
-    else if v = 3 then UnicodeSymbol
-    else raise UnsupportedUtf8
-
-(* [classify_unicode] discriminates between 3 different kinds of
-   symbols based on the standard unicode classification (extracted from
-   Camomile). *)
-let classify_unicode =
-  let single c = [ (c, c) ] in
-    (* General tables. *)
-    mk_lookup_table_from_unicode_tables_for UnicodeSymbol
-      [
-        Unicodetable.sm;           (* Symbol, maths.             *)
-        Unicodetable.sc;           (* Symbol, currency.          *)
-        Unicodetable.so;           (* Symbol, modifier.          *)
-        Unicodetable.pd;           (* Punctation, dash.          *)
-        Unicodetable.pc;           (* Punctation, connector.     *)
-        Unicodetable.pe;           (* Punctation, open.          *)
-        Unicodetable.ps;           (* Punctation, close.         *)
-        Unicodetable.pi;           (* Punctation, initial quote. *)
-        Unicodetable.pf;           (* Punctation, final quote.   *)
-        Unicodetable.po;           (* Punctation, other.         *)
-      ];
-    mk_lookup_table_from_unicode_tables_for UnicodeLetter
-      [
-        Unicodetable.lu;           (* Letter, uppercase.         *)
-        Unicodetable.ll;           (* Letter, lowercase.         *)
-        Unicodetable.lt;           (* Letter, titlecase.         *)
-        Unicodetable.lo;           (* Letter, others.            *)
-      ];
-    mk_lookup_table_from_unicode_tables_for UnicodeIdentPart
-      [
-        Unicodetable.nd;           (* Number, decimal digits.    *)
-        Unicodetable.nl;           (* Number, letter.            *)
-        Unicodetable.no;           (* Number, other.             *)
-      ];
-    (* Exceptions (from a previous version of this function). *)
-    mk_lookup_table_from_unicode_tables_for UnicodeSymbol
-      [
-        single 0x000B2;            (* Squared.                   *)
-        single 0x0002E;            (* Dot.                       *)
-      ];
-    mk_lookup_table_from_unicode_tables_for UnicodeLetter
-      [
-        single 0x005F;             (* Underscore.                *)
-        single 0x00A0;             (* Non breaking space.        *)
-      ];
-    mk_lookup_table_from_unicode_tables_for UnicodeIdentPart
-      [
-        single 0x0027;             (* Special space.             *)
-      ];
-    (* Lookup *)
-    lookup
-
-exception End_of_input
-
-let utf8_of_unicode n =
-  if n < 128 then
-    String.make 1 (Char.chr n)
-  else if n < 2048 then
-    let s = String.make 2 (Char.chr (128 + n mod 64)) in
-    begin
-      s.[0] <- Char.chr (192 + n / 64);
-      s
-    end
-  else if n < 65536 then
-    let s = String.make 3 (Char.chr (128 + n mod 64)) in
-    begin
-      s.[1] <- Char.chr (128 + (n / 64) mod 64);
-      s.[0] <- Char.chr (224 + n / 4096);
-      s
-    end
-  else
-    let s = String.make 4 (Char.chr (128 + n mod 64)) in
-    begin
-      s.[2] <- Char.chr (128 + (n / 64) mod 64);
-      s.[1] <- Char.chr (128 + (n / 4096) mod 64);
-      s.[0] <- Char.chr (240 + n / 262144);
-      s
-    end
-
-let next_utf8 s i =
-  let err () = invalid_arg "utf8" in
-  let l = String.length s - i in
-  if l = 0 then raise End_of_input
-  else let a = Char.code s.[i] in if a <= 0x7F then
-    1, a
-  else if a land 0x40 = 0 or l = 1 then err ()
-  else let b = Char.code s.[i+1] in if b land 0xC0 <> 0x80 then err ()
-  else if a land 0x20 = 0 then
-    2, (a land 0x1F) lsl 6 + (b land 0x3F)
-  else if l = 2 then err ()
-  else let c = Char.code s.[i+2] in if c land 0xC0 <> 0x80 then err ()
-  else if a land 0x10 = 0 then
-    3, (a land 0x0F) lsl 12 + (b land 0x3F) lsl 6 + (c land 0x3F)
-  else if l = 3 then err ()
-  else let d = Char.code s.[i+3] in if d land 0xC0 <> 0x80 then err ()
-  else if a land 0x08 = 0 then
-    4, (a land 0x07) lsl 18 + (b land 0x3F) lsl 12 +
-       (c land 0x3F) lsl 6 + (d land 0x3F)
-  else err ()
-
-(* Check the well-formedness of an identifier *)
-
-let initial_refutation j n s =
-  match classify_unicode n with
-  | UnicodeLetter -> None
-  | _ ->
-      let c = String.sub s 0 j in
-      Some (false,"Invalid character '"^c^"' at beginning of identifier \""^s^"\".")
-
-let trailing_refutation i j n s =
-  match classify_unicode n with
-  | UnicodeLetter | UnicodeIdentPart -> None
-  | _ ->
-      let c = String.sub s i j in
-      Some (false,"Invalid character '"^c^"' in identifier \""^s^"\".")
-
-let ident_refutation s =
-  if s = ".." then None else try
-    let j, n = next_utf8 s 0 in
-      match initial_refutation j n s with
-	|None ->
-	   begin try
-	     let rec aux i =
-	       let j, n = next_utf8 s i in
-		 match trailing_refutation i j n s with
-		   |None -> aux (i + j)
-		   |x -> x
-	     in aux j
-	   with End_of_input -> None
-	   end
-	|x -> x
-  with
-  | End_of_input -> Some (true,"The empty string is not an identifier.")
-  | UnsupportedUtf8 -> Some (true,s^": unsupported character in utf8 sequence.")
-  | Invalid_argument _ -> Some (true,s^": invalid utf8 sequence.")
-
-let lowercase_unicode =
-  let tree = Segmenttree.make Unicodetable.to_lower in
-  fun unicode ->
-    try
-      match Segmenttree.lookup unicode tree with
-	| `Abs c -> c
-	| `Delta d -> unicode + d
-    with Not_found -> unicode
-
-let lowercase_first_char_utf8 s =
-  assert (s <> "");
-  let j, n = next_utf8 s 0 in
-  utf8_of_unicode (lowercase_unicode n)
-
-(** For extraction, we need to encode unicode character into ascii ones *)
-
-let ascii_of_ident s =
-  let check_ascii s =
-    let ok = ref true in
-    String.iter (fun c -> if Char.code c >= 128 then ok := false) s;
-    !ok
-  in
-  if check_ascii s then s else
-    let i = ref 0 and out = ref "" in
-    begin try while true do
-      let j, n = next_utf8 s !i in
-      out :=
-	if n >= 128
-	then Printf.sprintf "%s__U%04x_" !out n
-	else Printf.sprintf "%s%c" !out s.[!i];
-      i := !i + j
-    done with End_of_input -> () end;
-    !out
-
 (* Lists *)
 
 module List : CList.ExtS = CList
diff --git a/lib/util.mli b/lib/util.mli
index 1333d1854..694e79ce4 100644
--- a/lib/util.mli
+++ b/lib/util.mli
@@ -33,7 +33,6 @@ val is_letter : char -> bool
 val is_digit : char -> bool
 val is_ident_tail : char -> bool
 val is_blank : char -> bool
-val next_utf8 : string -> int -> int * int
 
 (** {6 Strings. } *)
 
@@ -55,15 +54,6 @@ val parse_loadpath : string -> string list
 module Stringset : Set.S with type elt = string
 module Stringmap : Map.S with type key = string
 
-type utf8_status = UnicodeLetter | UnicodeIdentPart | UnicodeSymbol
-
-exception UnsupportedUtf8
-
-val ident_refutation : string -> (bool * string) option
-val classify_unicode : int -> utf8_status
-val lowercase_first_char_utf8 : string -> string
-val ascii_of_ident : string -> string
-
 (** {6 Lists. } *)
 
 module List : CList.ExtS
author	letouzey <letouzey@85f007b7-540e-0410-9357-904b9bb8a0f7>	2012-09-18 15:06:42 +0000
committer	letouzey <letouzey@85f007b7-540e-0410-9357-904b9bb8a0f7>	2012-09-18 15:06:42 +0000
commit	bf08866eabad4408de975bae92f3b3c1f718322c (patch)
tree	faab2a63879ce58c26ed5439a21af47a06226051 /lib
parent	4422e16f529359bb96c7eee214b2b6648958ef48 (diff)