aboutsummaryrefslogtreecommitdiffhomepage
diff options
context:
space:
mode:
authorGravatar letouzey <letouzey@85f007b7-540e-0410-9357-904b9bb8a0f7>2012-09-18 15:06:42 +0000
committerGravatar letouzey <letouzey@85f007b7-540e-0410-9357-904b9bb8a0f7>2012-09-18 15:06:42 +0000
commitbf08866eabad4408de975bae92f3b3c1f718322c (patch)
treefaab2a63879ce58c26ed5439a21af47a06226051
parent4422e16f529359bb96c7eee214b2b6648958ef48 (diff)
More cleanup of Util: utf8 aspects moved to a new file unicode.ml
git-svn-id: svn+ssh://scm.gforge.inria.fr/svn/coq/trunk@15818 85f007b7-540e-0410-9357-904b9bb8a0f7
-rw-r--r--checker/check.mllib1
-rw-r--r--dev/printers.mllib1
-rw-r--r--grammar/grammar.mllib5
-rw-r--r--kernel/names.ml4
-rw-r--r--lib/clib.mllib2
-rw-r--r--lib/lib.mllib3
-rw-r--r--lib/system.ml2
-rw-r--r--lib/unicode.ml226
-rw-r--r--lib/unicode.mli27
-rw-r--r--lib/util.ml217
-rw-r--r--lib/util.mli10
-rw-r--r--parsing/lexer.ml420
-rw-r--r--plugins/extraction/common.ml2
-rw-r--r--pretyping/namegen.ml2
14 files changed, 276 insertions, 246 deletions
diff --git a/checker/check.mllib b/checker/check.mllib
index 2a6486ddd..6203c1fbb 100644
--- a/checker/check.mllib
+++ b/checker/check.mllib
@@ -6,6 +6,7 @@ Pp
Loc
Segmenttree
Unicodetable
+Unicode
Errors
CObj
CList
diff --git a/dev/printers.mllib b/dev/printers.mllib
index 286c6faea..c2a77f215 100644
--- a/dev/printers.mllib
+++ b/dev/printers.mllib
@@ -7,6 +7,7 @@ Pp
Loc
Segmenttree
Unicodetable
+Unicode
Errors
CObj
CList
diff --git a/grammar/grammar.mllib b/grammar/grammar.mllib
index dc63525d5..960dcb6de 100644
--- a/grammar/grammar.mllib
+++ b/grammar/grammar.mllib
@@ -5,8 +5,6 @@ Compat
Flags
Pp
Loc
-Segmenttree
-Unicodetable
Errors
CList
CArray
@@ -15,6 +13,9 @@ Bigint
Hashcons
Predicate
Option
+Segmenttree
+Unicodetable
+Unicode
Names
Libnames
diff --git a/kernel/names.ml b/kernel/names.ml
index aa5313842..599e8fb12 100644
--- a/kernel/names.ml
+++ b/kernel/names.ml
@@ -29,9 +29,9 @@ type identifier = string
let check_ident_soft x =
Option.iter (fun (fatal,x) ->
if fatal then error x else Pp.msg_warning (str x))
- (ident_refutation x)
+ (Unicode.ident_refutation x)
let check_ident x =
- Option.iter (fun (_,x) -> Errors.error x) (ident_refutation x)
+ Option.iter (fun (_,x) -> Errors.error x) (Unicode.ident_refutation x)
let id_of_string s = check_ident_soft s; String.copy s
let string_of_id id = String.copy id
diff --git a/lib/clib.mllib b/lib/clib.mllib
index 0dd24d74c..ca0e97bb0 100644
--- a/lib/clib.mllib
+++ b/lib/clib.mllib
@@ -1,8 +1,6 @@
Pp_control
Pp
Coq_config
-Segmenttree
-Unicodetable
Deque
CObj
CList
diff --git a/lib/lib.mllib b/lib/lib.mllib
index 618ff80b6..21eb7a4a5 100644
--- a/lib/lib.mllib
+++ b/lib/lib.mllib
@@ -6,6 +6,9 @@ Errors
Bigint
Hashcons
Dyn
+Segmenttree
+Unicodetable
+Unicode
System
Gmap
Fset
diff --git a/lib/system.ml b/lib/system.ml
index e721b1f65..cf650d6ab 100644
--- a/lib/system.ml
+++ b/lib/system.ml
@@ -24,7 +24,7 @@ let exclude_search_in_dirname f = skipped_dirnames := f :: !skipped_dirnames
let ok_dirname f =
f <> "" && f.[0] <> '.' && not (List.mem f !skipped_dirnames) &&
- match ident_refutation f with |None -> true |_ -> false
+ (Unicode.ident_refutation f = None)
let all_subdirs ~unix_path:root =
let l = ref [] in
diff --git a/lib/unicode.ml b/lib/unicode.ml
new file mode 100644
index 000000000..75b3b5899
--- /dev/null
+++ b/lib/unicode.ml
@@ -0,0 +1,226 @@
+(***********************************************************************)
+(* v * The Coq Proof Assistant / The Coq Development Team *)
+(* <O___,, * INRIA-Rocquencourt & LRI-CNRS-Orsay *)
+(* \VV/ *************************************************************)
+(* // * This file is distributed under the terms of the *)
+(* * GNU Lesser General Public License Version 2.1 *)
+(***********************************************************************)
+
+(** Unicode utilities *)
+
+type status = Letter | IdentPart | Symbol
+
+exception Unsupported
+
+(* The following table stores classes of Unicode characters that
+ are used by the lexer. There are 3 different classes so 2 bits are
+ allocated for each character. We only use 16 bits over the 31 bits
+ to simplify the masking process. (This choice seems to be a good
+ trade-off between speed and space after some benchmarks.) *)
+
+(* A 256ko table, initially filled with zeros. *)
+let table = Array.create (1 lsl 17) 0
+
+(* Associate a 2-bit pattern to each status at position [i].
+ Only the 3 lowest bits of [i] are taken into account to
+ define the position of the pattern in the word.
+ Notice that pattern "00" means "undefined". *)
+let mask i = function
+ | Letter -> 1 lsl ((i land 7) lsl 1) (* 01 *)
+ | IdentPart -> 2 lsl ((i land 7) lsl 1) (* 10 *)
+ | Symbol -> 3 lsl ((i land 7) lsl 1) (* 11 *)
+
+(* Helper to reset 2 bits in a word. *)
+let reset_mask i =
+ lnot (3 lsl ((i land 7) lsl 1))
+
+(* Initialize the lookup table from a list of segments, assigning
+ a status to every character of each segment. The order of these
+ assignments is relevant: it is possible to assign status [s] to
+ a segment [(c1, c2)] and later assign [s'] to [c] even if [c] is
+ between [c1] and [c2]. *)
+let mk_lookup_table_from_unicode_tables_for status tables =
+ List.iter
+ (List.iter
+ (fun (c1, c2) ->
+ for i = c1 to c2 do
+ table.(i lsr 3) <-
+ (table.(i lsr 3) land (reset_mask i)) lor (mask i status)
+ done))
+ tables
+
+(* Look up into the table and interpret the found pattern. *)
+let lookup x =
+ let v = (table.(x lsr 3) lsr ((x land 7) lsl 1)) land 3 in
+ if v = 1 then Letter
+ else if v = 2 then IdentPart
+ else if v = 3 then Symbol
+ else raise Unsupported
+
+(* [classify] discriminates between 3 different kinds of
+ symbols based on the standard unicode classification (extracted from
+ Camomile). *)
+let classify =
+ let single c = [ (c, c) ] in
+ (* General tables. *)
+ mk_lookup_table_from_unicode_tables_for Symbol
+ [
+ Unicodetable.sm; (* Symbol, maths. *)
+ Unicodetable.sc; (* Symbol, currency. *)
+ Unicodetable.so; (* Symbol, modifier. *)
+ Unicodetable.pd; (* Punctation, dash. *)
+ Unicodetable.pc; (* Punctation, connector. *)
+ Unicodetable.pe; (* Punctation, open. *)
+ Unicodetable.ps; (* Punctation, close. *)
+ Unicodetable.pi; (* Punctation, initial quote. *)
+ Unicodetable.pf; (* Punctation, final quote. *)
+ Unicodetable.po; (* Punctation, other. *)
+ ];
+ mk_lookup_table_from_unicode_tables_for Letter
+ [
+ Unicodetable.lu; (* Letter, uppercase. *)
+ Unicodetable.ll; (* Letter, lowercase. *)
+ Unicodetable.lt; (* Letter, titlecase. *)
+ Unicodetable.lo; (* Letter, others. *)
+ ];
+ mk_lookup_table_from_unicode_tables_for IdentPart
+ [
+ Unicodetable.nd; (* Number, decimal digits. *)
+ Unicodetable.nl; (* Number, letter. *)
+ Unicodetable.no; (* Number, other. *)
+ ];
+ (* Exceptions (from a previous version of this function). *)
+ mk_lookup_table_from_unicode_tables_for Symbol
+ [
+ single 0x000B2; (* Squared. *)
+ single 0x0002E; (* Dot. *)
+ ];
+ mk_lookup_table_from_unicode_tables_for Letter
+ [
+ single 0x005F; (* Underscore. *)
+ single 0x00A0; (* Non breaking space. *)
+ ];
+ mk_lookup_table_from_unicode_tables_for IdentPart
+ [
+ single 0x0027; (* Special space. *)
+ ];
+ (* Lookup *)
+ lookup
+
+exception End_of_input
+
+let utf8_of_unicode n =
+ if n < 128 then
+ String.make 1 (Char.chr n)
+ else if n < 2048 then
+ let s = String.make 2 (Char.chr (128 + n mod 64)) in
+ begin
+ s.[0] <- Char.chr (192 + n / 64);
+ s
+ end
+ else if n < 65536 then
+ let s = String.make 3 (Char.chr (128 + n mod 64)) in
+ begin
+ s.[1] <- Char.chr (128 + (n / 64) mod 64);
+ s.[0] <- Char.chr (224 + n / 4096);
+ s
+ end
+ else
+ let s = String.make 4 (Char.chr (128 + n mod 64)) in
+ begin
+ s.[2] <- Char.chr (128 + (n / 64) mod 64);
+ s.[1] <- Char.chr (128 + (n / 4096) mod 64);
+ s.[0] <- Char.chr (240 + n / 262144);
+ s
+ end
+
+let next_utf8 s i =
+ let err () = invalid_arg "utf8" in
+ let l = String.length s - i in
+ if l = 0 then raise End_of_input
+ else let a = Char.code s.[i] in if a <= 0x7F then
+ 1, a
+ else if a land 0x40 = 0 or l = 1 then err ()
+ else let b = Char.code s.[i+1] in if b land 0xC0 <> 0x80 then err ()
+ else if a land 0x20 = 0 then
+ 2, (a land 0x1F) lsl 6 + (b land 0x3F)
+ else if l = 2 then err ()
+ else let c = Char.code s.[i+2] in if c land 0xC0 <> 0x80 then err ()
+ else if a land 0x10 = 0 then
+ 3, (a land 0x0F) lsl 12 + (b land 0x3F) lsl 6 + (c land 0x3F)
+ else if l = 3 then err ()
+ else let d = Char.code s.[i+3] in if d land 0xC0 <> 0x80 then err ()
+ else if a land 0x08 = 0 then
+ 4, (a land 0x07) lsl 18 + (b land 0x3F) lsl 12 +
+ (c land 0x3F) lsl 6 + (d land 0x3F)
+ else err ()
+
+(* Check the well-formedness of an identifier *)
+
+let initial_refutation j n s =
+ match classify n with
+ | Letter -> None
+ | _ ->
+ let c = String.sub s 0 j in
+ Some (false,"Invalid character '"^c^"' at beginning of identifier \""^s^"\".")
+
+let trailing_refutation i j n s =
+ match classify n with
+ | Letter | IdentPart -> None
+ | _ ->
+ let c = String.sub s i j in
+ Some (false,"Invalid character '"^c^"' in identifier \""^s^"\".")
+
+let ident_refutation s =
+ if s = ".." then None else try
+ let j, n = next_utf8 s 0 in
+ match initial_refutation j n s with
+ |None ->
+ begin try
+ let rec aux i =
+ let j, n = next_utf8 s i in
+ match trailing_refutation i j n s with
+ |None -> aux (i + j)
+ |x -> x
+ in aux j
+ with End_of_input -> None
+ end
+ |x -> x
+ with
+ | End_of_input -> Some (true,"The empty string is not an identifier.")
+ | Unsupported -> Some (true,s^": unsupported character in utf8 sequence.")
+ | Invalid_argument _ -> Some (true,s^": invalid utf8 sequence.")
+
+let lowercase_unicode =
+ let tree = Segmenttree.make Unicodetable.to_lower in
+ fun unicode ->
+ try
+ match Segmenttree.lookup unicode tree with
+ | `Abs c -> c
+ | `Delta d -> unicode + d
+ with Not_found -> unicode
+
+let lowercase_first_char s =
+ assert (s <> "");
+ let j, n = next_utf8 s 0 in
+ utf8_of_unicode (lowercase_unicode n)
+
+(** For extraction, we need to encode unicode character into ascii ones *)
+
+let ascii_of_ident s =
+ let check_ascii s =
+ let ok = ref true in
+ String.iter (fun c -> if Char.code c >= 128 then ok := false) s;
+ !ok
+ in
+ if check_ascii s then s else
+ let i = ref 0 and out = ref "" in
+ begin try while true do
+ let j, n = next_utf8 s !i in
+ out :=
+ if n >= 128
+ then Printf.sprintf "%s__U%04x_" !out n
+ else Printf.sprintf "%s%c" !out s.[!i];
+ i := !i + j
+ done with End_of_input -> () end;
+ !out
diff --git a/lib/unicode.mli b/lib/unicode.mli
new file mode 100644
index 000000000..763123306
--- /dev/null
+++ b/lib/unicode.mli
@@ -0,0 +1,27 @@
+(************************************************************************)
+(* v * The Coq Proof Assistant / The Coq Development Team *)
+(* <O___,, * INRIA - CNRS - LIX - LRI - PPS - Copyright 1999-2012 *)
+(* \VV/ **************************************************************)
+(* // * This file is distributed under the terms of the *)
+(* * GNU Lesser General Public License Version 2.1 *)
+(************************************************************************)
+
+(** Unicode utilities *)
+
+type status = Letter | IdentPart | Symbol
+
+exception Unsupported
+
+(** Classify a unicode char into 3 classes, or raise [Unsupported] *)
+val classify : int -> status
+
+(** Check whether a given string be used as a legal identifier.
+ - [None] means yes
+ - [Some (b,s)] means no, with explanation [s] and severity [b] *)
+val ident_refutation : string -> (bool * string) option
+
+(** First char of a string, converted to lowercase *)
+val lowercase_first_char : string -> string
+
+(** For extraction, turn a unicode string into an ascii-only one *)
+val ascii_of_ident : string -> string
diff --git a/lib/util.ml b/lib/util.ml
index 6c1fc39b5..624075956 100644
--- a/lib/util.ml
+++ b/lib/util.ml
@@ -146,223 +146,6 @@ module Stringset = Set.Make(struct type t = string let compare (x:t) (y:t) = com
module Stringmap = Map.Make(struct type t = string let compare (x:t) (y:t) = compare x y end)
-type utf8_status = UnicodeLetter | UnicodeIdentPart | UnicodeSymbol
-
-exception UnsupportedUtf8
-
-(* The following table stores classes of Unicode characters that
- are used by the lexer. There are 3 different classes so 2 bits are
- allocated for each character. We only use 16 bits over the 31 bits
- to simplify the masking process. (This choice seems to be a good
- trade-off between speed and space after some benchmarks.) *)
-
-(* A 256ko table, initially filled with zeros. *)
-let table = Array.create (1 lsl 17) 0
-
-(* Associate a 2-bit pattern to each status at position [i].
- Only the 3 lowest bits of [i] are taken into account to
- define the position of the pattern in the word.
- Notice that pattern "00" means "undefined". *)
-let mask i = function
- | UnicodeLetter -> 1 lsl ((i land 7) lsl 1) (* 01 *)
- | UnicodeIdentPart -> 2 lsl ((i land 7) lsl 1) (* 10 *)
- | UnicodeSymbol -> 3 lsl ((i land 7) lsl 1) (* 11 *)
-
-(* Helper to reset 2 bits in a word. *)
-let reset_mask i =
- lnot (3 lsl ((i land 7) lsl 1))
-
-(* Initialize the lookup table from a list of segments, assigning
- a status to every character of each segment. The order of these
- assignments is relevant: it is possible to assign status [s] to
- a segment [(c1, c2)] and later assign [s'] to [c] even if [c] is
- between [c1] and [c2]. *)
-let mk_lookup_table_from_unicode_tables_for status tables =
- List.iter
- (List.iter
- (fun (c1, c2) ->
- for i = c1 to c2 do
- table.(i lsr 3) <-
- (table.(i lsr 3) land (reset_mask i)) lor (mask i status)
- done))
- tables
-
-(* Look up into the table and interpret the found pattern. *)
-let lookup x =
- let v = (table.(x lsr 3) lsr ((x land 7) lsl 1)) land 3 in
- if v = 1 then UnicodeLetter
- else if v = 2 then UnicodeIdentPart
- else if v = 3 then UnicodeSymbol
- else raise UnsupportedUtf8
-
-(* [classify_unicode] discriminates between 3 different kinds of
- symbols based on the standard unicode classification (extracted from
- Camomile). *)
-let classify_unicode =
- let single c = [ (c, c) ] in
- (* General tables. *)
- mk_lookup_table_from_unicode_tables_for UnicodeSymbol
- [
- Unicodetable.sm; (* Symbol, maths. *)
- Unicodetable.sc; (* Symbol, currency. *)
- Unicodetable.so; (* Symbol, modifier. *)
- Unicodetable.pd; (* Punctation, dash. *)
- Unicodetable.pc; (* Punctation, connector. *)
- Unicodetable.pe; (* Punctation, open. *)
- Unicodetable.ps; (* Punctation, close. *)
- Unicodetable.pi; (* Punctation, initial quote. *)
- Unicodetable.pf; (* Punctation, final quote. *)
- Unicodetable.po; (* Punctation, other. *)
- ];
- mk_lookup_table_from_unicode_tables_for UnicodeLetter
- [
- Unicodetable.lu; (* Letter, uppercase. *)
- Unicodetable.ll; (* Letter, lowercase. *)
- Unicodetable.lt; (* Letter, titlecase. *)
- Unicodetable.lo; (* Letter, others. *)
- ];
- mk_lookup_table_from_unicode_tables_for UnicodeIdentPart
- [
- Unicodetable.nd; (* Number, decimal digits. *)
- Unicodetable.nl; (* Number, letter. *)
- Unicodetable.no; (* Number, other. *)
- ];
- (* Exceptions (from a previous version of this function). *)
- mk_lookup_table_from_unicode_tables_for UnicodeSymbol
- [
- single 0x000B2; (* Squared. *)
- single 0x0002E; (* Dot. *)
- ];
- mk_lookup_table_from_unicode_tables_for UnicodeLetter
- [
- single 0x005F; (* Underscore. *)
- single 0x00A0; (* Non breaking space. *)
- ];
- mk_lookup_table_from_unicode_tables_for UnicodeIdentPart
- [
- single 0x0027; (* Special space. *)
- ];
- (* Lookup *)
- lookup
-
-exception End_of_input
-
-let utf8_of_unicode n =
- if n < 128 then
- String.make 1 (Char.chr n)
- else if n < 2048 then
- let s = String.make 2 (Char.chr (128 + n mod 64)) in
- begin
- s.[0] <- Char.chr (192 + n / 64);
- s
- end
- else if n < 65536 then
- let s = String.make 3 (Char.chr (128 + n mod 64)) in
- begin
- s.[1] <- Char.chr (128 + (n / 64) mod 64);
- s.[0] <- Char.chr (224 + n / 4096);
- s
- end
- else
- let s = String.make 4 (Char.chr (128 + n mod 64)) in
- begin
- s.[2] <- Char.chr (128 + (n / 64) mod 64);
- s.[1] <- Char.chr (128 + (n / 4096) mod 64);
- s.[0] <- Char.chr (240 + n / 262144);
- s
- end
-
-let next_utf8 s i =
- let err () = invalid_arg "utf8" in
- let l = String.length s - i in
- if l = 0 then raise End_of_input
- else let a = Char.code s.[i] in if a <= 0x7F then
- 1, a
- else if a land 0x40 = 0 or l = 1 then err ()
- else let b = Char.code s.[i+1] in if b land 0xC0 <> 0x80 then err ()
- else if a land 0x20 = 0 then
- 2, (a land 0x1F) lsl 6 + (b land 0x3F)
- else if l = 2 then err ()
- else let c = Char.code s.[i+2] in if c land 0xC0 <> 0x80 then err ()
- else if a land 0x10 = 0 then
- 3, (a land 0x0F) lsl 12 + (b land 0x3F) lsl 6 + (c land 0x3F)
- else if l = 3 then err ()
- else let d = Char.code s.[i+3] in if d land 0xC0 <> 0x80 then err ()
- else if a land 0x08 = 0 then
- 4, (a land 0x07) lsl 18 + (b land 0x3F) lsl 12 +
- (c land 0x3F) lsl 6 + (d land 0x3F)
- else err ()
-
-(* Check the well-formedness of an identifier *)
-
-let initial_refutation j n s =
- match classify_unicode n with
- | UnicodeLetter -> None
- | _ ->
- let c = String.sub s 0 j in
- Some (false,"Invalid character '"^c^"' at beginning of identifier \""^s^"\".")
-
-let trailing_refutation i j n s =
- match classify_unicode n with
- | UnicodeLetter | UnicodeIdentPart -> None
- | _ ->
- let c = String.sub s i j in
- Some (false,"Invalid character '"^c^"' in identifier \""^s^"\".")
-
-let ident_refutation s =
- if s = ".." then None else try
- let j, n = next_utf8 s 0 in
- match initial_refutation j n s with
- |None ->
- begin try
- let rec aux i =
- let j, n = next_utf8 s i in
- match trailing_refutation i j n s with
- |None -> aux (i + j)
- |x -> x
- in aux j
- with End_of_input -> None
- end
- |x -> x
- with
- | End_of_input -> Some (true,"The empty string is not an identifier.")
- | UnsupportedUtf8 -> Some (true,s^": unsupported character in utf8 sequence.")
- | Invalid_argument _ -> Some (true,s^": invalid utf8 sequence.")
-
-let lowercase_unicode =
- let tree = Segmenttree.make Unicodetable.to_lower in
- fun unicode ->
- try
- match Segmenttree.lookup unicode tree with
- | `Abs c -> c
- | `Delta d -> unicode + d
- with Not_found -> unicode
-
-let lowercase_first_char_utf8 s =
- assert (s <> "");
- let j, n = next_utf8 s 0 in
- utf8_of_unicode (lowercase_unicode n)
-
-(** For extraction, we need to encode unicode character into ascii ones *)
-
-let ascii_of_ident s =
- let check_ascii s =
- let ok = ref true in
- String.iter (fun c -> if Char.code c >= 128 then ok := false) s;
- !ok
- in
- if check_ascii s then s else
- let i = ref 0 and out = ref "" in
- begin try while true do
- let j, n = next_utf8 s !i in
- out :=
- if n >= 128
- then Printf.sprintf "%s__U%04x_" !out n
- else Printf.sprintf "%s%c" !out s.[!i];
- i := !i + j
- done with End_of_input -> () end;
- !out
-
(* Lists *)
module List : CList.ExtS = CList
diff --git a/lib/util.mli b/lib/util.mli
index 1333d1854..694e79ce4 100644
--- a/lib/util.mli
+++ b/lib/util.mli
@@ -33,7 +33,6 @@ val is_letter : char -> bool
val is_digit : char -> bool
val is_ident_tail : char -> bool
val is_blank : char -> bool
-val next_utf8 : string -> int -> int * int
(** {6 Strings. } *)
@@ -55,15 +54,6 @@ val parse_loadpath : string -> string list
module Stringset : Set.S with type elt = string
module Stringmap : Map.S with type key = string
-type utf8_status = UnicodeLetter | UnicodeIdentPart | UnicodeSymbol
-
-exception UnsupportedUtf8
-
-val ident_refutation : string -> (bool * string) option
-val classify_unicode : int -> utf8_status
-val lowercase_first_char_utf8 : string -> string
-val ascii_of_ident : string -> string
-
(** {6 Lists. } *)
module List : CList.ExtS
diff --git a/parsing/lexer.ml4 b/parsing/lexer.ml4
index f277acfd1..dc120bd6d 100644
--- a/parsing/lexer.ml4
+++ b/parsing/lexer.ml4
@@ -107,7 +107,7 @@ let bad_token str = raise (Error.E (Bad_token str))
(* Lexer conventions on tokens *)
type token_kind =
- | Utf8Token of (utf8_status * int)
+ | Utf8Token of (Unicode.status * int)
| AsciiChar
| EmptyStream
@@ -161,8 +161,8 @@ let lookup_utf8_tail c cs =
(Char.code c3 land 0x3F) lsl 6 + (Char.code c4 land 0x3F)
| _ -> error_utf8 cs
in
- try classify_unicode unicode, n
- with UnsupportedUtf8 ->
+ try Unicode.classify unicode, n
+ with Unicode.Unsupported ->
njunk n cs; error_unsupported_unicode_character n unicode cs
let lookup_utf8 cs =
@@ -199,8 +199,8 @@ let check_ident str =
loop_id true s
| [< s >] ->
match unlocated lookup_utf8 s with
- | Utf8Token (UnicodeLetter, n) -> njunk n s; loop_id true s
- | Utf8Token (UnicodeIdentPart, n) when intail -> njunk n s; loop_id true s
+ | Utf8Token (Unicode.Letter, n) -> njunk n s; loop_id true s
+ | Utf8Token (Unicode.IdentPart, n) when intail -> njunk n s; loop_id true s
| EmptyStream -> ()
| Utf8Token _ | AsciiChar -> bad_token str
in
@@ -261,7 +261,7 @@ let rec ident_tail len = parser
ident_tail (store len c) s
| [< s >] ->
match lookup_utf8 s with
- | Utf8Token ((UnicodeIdentPart | UnicodeLetter), n) ->
+ | Utf8Token ((Unicode.IdentPart | Unicode.Letter), n) ->
ident_tail (nstore n len s) s
| _ -> len
@@ -445,7 +445,7 @@ let parse_after_special c bp =
token_of_special c (get_buff len)
| [< s >] ->
match lookup_utf8 s with
- | Utf8Token (UnicodeLetter, n) ->
+ | Utf8Token (Unicode.Letter, n) ->
token_of_special c (get_buff (ident_tail (nstore n 0 s) s))
| AsciiChar | Utf8Token _ | EmptyStream -> fst (process_chars bp c s)
@@ -457,7 +457,7 @@ let parse_after_qmark bp s =
| None -> KEYWORD "?"
| _ ->
match lookup_utf8 s with
- | Utf8Token (UnicodeLetter, _) -> LEFTQMARK
+ | Utf8Token (Unicode.Letter, _) -> LEFTQMARK
| AsciiChar | Utf8Token _ | EmptyStream -> fst (process_chars bp '?' s)
let blank_or_eof cs =
@@ -506,13 +506,13 @@ let rec next_token = parser bp
t
| [< s >] ->
match lookup_utf8 s with
- | Utf8Token (UnicodeLetter, n) ->
+ | Utf8Token (Unicode.Letter, n) ->
let len = ident_tail (nstore n 0 s) s in
let id = get_buff len in
let ep = Stream.count s in
comment_stop bp;
(try find_keyword id s with Not_found -> IDENT id), (bp, ep)
- | AsciiChar | Utf8Token ((UnicodeSymbol | UnicodeIdentPart), _) ->
+ | AsciiChar | Utf8Token ((Unicode.Symbol | Unicode.IdentPart), _) ->
let t = process_chars bp (Stream.next s) s in
comment_stop bp; t
| EmptyStream ->
diff --git a/plugins/extraction/common.ml b/plugins/extraction/common.ml
index 8cceb2a11..04cc167a8 100644
--- a/plugins/extraction/common.ml
+++ b/plugins/extraction/common.ml
@@ -22,7 +22,7 @@ let string_of_id id =
for i = 0 to String.length s - 2 do
if s.[i] = '_' && s.[i+1] = '_' then warning_id s
done;
- ascii_of_ident s
+ Unicode.ascii_of_ident s
let is_mp_bound = function MPbound _ -> true | _ -> false
diff --git a/pretyping/namegen.ml b/pretyping/namegen.ml
index 9d5777358..36355e130 100644
--- a/pretyping/namegen.ml
+++ b/pretyping/namegen.ml
@@ -63,7 +63,7 @@ let is_constructor id =
(* Generating "intuitive" names from its type *)
let lowercase_first_char id = (* First character of a constr *)
- lowercase_first_char_utf8 (string_of_id id)
+ Unicode.lowercase_first_char (string_of_id id)
let sort_hdchar = function
| Prop(_) -> "P"