More cleanup of Util: utf8 aspects moved to a new file unicode.ml

git-svn-id: svn+ssh://scm.gforge.inria.fr/svn/coq/trunk@15818 85f007b7-540e-0410-9357-904b9bb8a0f7
author: letouzey <letouzey@85f007b7-540e-0410-9357-904b9bb8a0f7> 2012-09-18 15:06:42 +0000
committer: letouzey <letouzey@85f007b7-540e-0410-9357-904b9bb8a0f7> 2012-09-18 15:06:42 +0000
commit: bf08866eabad4408de975bae92f3b3c1f718322c (patch)
tree: faab2a63879ce58c26ed5439a21af47a06226051 /lib/util.ml
parent: 4422e16f529359bb96c7eee214b2b6648958ef48 (diff)
1 files changed, 0 insertions, 217 deletions
diff --git a/lib/util.ml b/lib/util.ml
index 6c1fc39b5..624075956 100644
--- a/lib/util.ml
+++ b/lib/util.ml
@@ -146,223 +146,6 @@ module Stringset = Set.Make(struct type t = string let compare (x:t) (y:t) = com
 
 module Stringmap = Map.Make(struct type t = string let compare (x:t) (y:t) = compare x y end)
 
-type utf8_status = UnicodeLetter | UnicodeIdentPart | UnicodeSymbol
-
-exception UnsupportedUtf8
-
-(* The following table stores classes of Unicode characters that
-   are used by the lexer. There are 3 different classes so 2 bits are
-   allocated for each character. We only use 16 bits over the 31 bits
-   to simplify the masking process. (This choice seems to be a good
-   trade-off between speed and space after some benchmarks.) *)
-
-(* A 256ko table, initially filled with zeros. *)
-let table = Array.create (1 lsl 17) 0
-
-(* Associate a 2-bit pattern to each status at position [i]. 
-   Only the 3 lowest bits of [i] are taken into account to 
-   define the position of the pattern in the word. 
-   Notice that pattern "00" means "undefined". *)
-let mask i = function
-  | UnicodeLetter    -> 1 lsl ((i land 7) lsl 1) (* 01 *)
-  | UnicodeIdentPart -> 2 lsl ((i land 7) lsl 1) (* 10 *)
-  | UnicodeSymbol    -> 3 lsl ((i land 7) lsl 1) (* 11 *)
-
-(* Helper to reset 2 bits in a word. *)
-let reset_mask i =
-  lnot (3 lsl ((i land 7) lsl 1))
-
-(* Initialize the lookup table from a list of segments, assigning
-   a status to every character of each segment. The order of these
-   assignments is relevant: it is possible to assign status [s] to
-   a segment [(c1, c2)] and later assign [s'] to [c] even if [c] is
-   between [c1] and [c2]. *)
-let mk_lookup_table_from_unicode_tables_for status tables =
-  List.iter
-    (List.iter
-       (fun (c1, c2) ->
-          for i = c1 to c2 do
-            table.(i lsr 3) <-
-              (table.(i lsr 3) land (reset_mask i)) lor (mask i status)
-          done))
-    tables
-
-(* Look up into the table and interpret the found pattern. *)
-let lookup x =
-  let v = (table.(x lsr 3) lsr ((x land 7) lsl 1)) land 3 in
-    if      v = 1 then UnicodeLetter
-    else if v = 2 then UnicodeIdentPart
-    else if v = 3 then UnicodeSymbol
-    else raise UnsupportedUtf8
-
-(* [classify_unicode] discriminates between 3 different kinds of
-   symbols based on the standard unicode classification (extracted from
-   Camomile). *)
-let classify_unicode =
-  let single c = [ (c, c) ] in
-    (* General tables. *)
-    mk_lookup_table_from_unicode_tables_for UnicodeSymbol
-      [
-        Unicodetable.sm;           (* Symbol, maths.             *)
-        Unicodetable.sc;           (* Symbol, currency.          *)
-        Unicodetable.so;           (* Symbol, modifier.          *)
-        Unicodetable.pd;           (* Punctation, dash.          *)
-        Unicodetable.pc;           (* Punctation, connector.     *)
-        Unicodetable.pe;           (* Punctation, open.          *)
-        Unicodetable.ps;           (* Punctation, close.         *)
-        Unicodetable.pi;           (* Punctation, initial quote. *)
-        Unicodetable.pf;           (* Punctation, final quote.   *)
-        Unicodetable.po;           (* Punctation, other.         *)
-      ];
-    mk_lookup_table_from_unicode_tables_for UnicodeLetter
-      [
-        Unicodetable.lu;           (* Letter, uppercase.         *)
-        Unicodetable.ll;           (* Letter, lowercase.         *)
-        Unicodetable.lt;           (* Letter, titlecase.         *)
-        Unicodetable.lo;           (* Letter, others.            *)
-      ];
-    mk_lookup_table_from_unicode_tables_for UnicodeIdentPart
-      [
-        Unicodetable.nd;           (* Number, decimal digits.    *)
-        Unicodetable.nl;           (* Number, letter.            *)
-        Unicodetable.no;           (* Number, other.             *)
-      ];
-    (* Exceptions (from a previous version of this function). *)
-    mk_lookup_table_from_unicode_tables_for UnicodeSymbol
-      [
-        single 0x000B2;            (* Squared.                   *)
-        single 0x0002E;            (* Dot.                       *)
-      ];
-    mk_lookup_table_from_unicode_tables_for UnicodeLetter
-      [
-        single 0x005F;             (* Underscore.                *)
-        single 0x00A0;             (* Non breaking space.        *)
-      ];
-    mk_lookup_table_from_unicode_tables_for UnicodeIdentPart
-      [
-        single 0x0027;             (* Special space.             *)
-      ];
-    (* Lookup *)
-    lookup
-
-exception End_of_input
-
-let utf8_of_unicode n =
-  if n < 128 then
-    String.make 1 (Char.chr n)
-  else if n < 2048 then
-    let s = String.make 2 (Char.chr (128 + n mod 64)) in
-    begin
-      s.[0] <- Char.chr (192 + n / 64);
-      s
-    end
-  else if n < 65536 then
-    let s = String.make 3 (Char.chr (128 + n mod 64)) in
-    begin
-      s.[1] <- Char.chr (128 + (n / 64) mod 64);
-      s.[0] <- Char.chr (224 + n / 4096);
-      s
-    end
-  else
-    let s = String.make 4 (Char.chr (128 + n mod 64)) in
-    begin
-      s.[2] <- Char.chr (128 + (n / 64) mod 64);
-      s.[1] <- Char.chr (128 + (n / 4096) mod 64);
-      s.[0] <- Char.chr (240 + n / 262144);
-      s
-    end
-
-let next_utf8 s i =
-  let err () = invalid_arg "utf8" in
-  let l = String.length s - i in
-  if l = 0 then raise End_of_input
-  else let a = Char.code s.[i] in if a <= 0x7F then
-    1, a
-  else if a land 0x40 = 0 or l = 1 then err ()
-  else let b = Char.code s.[i+1] in if b land 0xC0 <> 0x80 then err ()
-  else if a land 0x20 = 0 then
-    2, (a land 0x1F) lsl 6 + (b land 0x3F)
-  else if l = 2 then err ()
-  else let c = Char.code s.[i+2] in if c land 0xC0 <> 0x80 then err ()
-  else if a land 0x10 = 0 then
-    3, (a land 0x0F) lsl 12 + (b land 0x3F) lsl 6 + (c land 0x3F)
-  else if l = 3 then err ()
-  else let d = Char.code s.[i+3] in if d land 0xC0 <> 0x80 then err ()
-  else if a land 0x08 = 0 then
-    4, (a land 0x07) lsl 18 + (b land 0x3F) lsl 12 +
-       (c land 0x3F) lsl 6 + (d land 0x3F)
-  else err ()
-
-(* Check the well-formedness of an identifier *)
-
-let initial_refutation j n s =
-  match classify_unicode n with
-  | UnicodeLetter -> None
-  | _ ->
-      let c = String.sub s 0 j in
-      Some (false,"Invalid character '"^c^"' at beginning of identifier \""^s^"\".")
-
-let trailing_refutation i j n s =
-  match classify_unicode n with
-  | UnicodeLetter | UnicodeIdentPart -> None
-  | _ ->
-      let c = String.sub s i j in
-      Some (false,"Invalid character '"^c^"' in identifier \""^s^"\".")
-
-let ident_refutation s =
-  if s = ".." then None else try
-    let j, n = next_utf8 s 0 in
-      match initial_refutation j n s with
-	|None ->
-	   begin try
-	     let rec aux i =
-	       let j, n = next_utf8 s i in
-		 match trailing_refutation i j n s with
-		   |None -> aux (i + j)
-		   |x -> x
-	     in aux j
-	   with End_of_input -> None
-	   end
-	|x -> x
-  with
-  | End_of_input -> Some (true,"The empty string is not an identifier.")
-  | UnsupportedUtf8 -> Some (true,s^": unsupported character in utf8 sequence.")
-  | Invalid_argument _ -> Some (true,s^": invalid utf8 sequence.")
-
-let lowercase_unicode =
-  let tree = Segmenttree.make Unicodetable.to_lower in
-  fun unicode ->
-    try
-      match Segmenttree.lookup unicode tree with
-	| `Abs c -> c
-	| `Delta d -> unicode + d
-    with Not_found -> unicode
-
-let lowercase_first_char_utf8 s =
-  assert (s <> "");
-  let j, n = next_utf8 s 0 in
-  utf8_of_unicode (lowercase_unicode n)
-
-(** For extraction, we need to encode unicode character into ascii ones *)
-
-let ascii_of_ident s =
-  let check_ascii s =
-    let ok = ref true in
-    String.iter (fun c -> if Char.code c >= 128 then ok := false) s;
-    !ok
-  in
-  if check_ascii s then s else
-    let i = ref 0 and out = ref "" in
-    begin try while true do
-      let j, n = next_utf8 s !i in
-      out :=
-	if n >= 128
-	then Printf.sprintf "%s__U%04x_" !out n
-	else Printf.sprintf "%s%c" !out s.[!i];
-      i := !i + j
-    done with End_of_input -> () end;
-    !out
-
 (* Lists *)
 
 module List : CList.ExtS = CList
author	letouzey <letouzey@85f007b7-540e-0410-9357-904b9bb8a0f7>	2012-09-18 15:06:42 +0000
committer	letouzey <letouzey@85f007b7-540e-0410-9357-904b9bb8a0f7>	2012-09-18 15:06:42 +0000
commit	bf08866eabad4408de975bae92f3b3c1f718322c (patch)
tree	faab2a63879ce58c26ed5439a21af47a06226051 /lib/util.ml
parent	4422e16f529359bb96c7eee214b2b6648958ef48 (diff)