aboutsummaryrefslogtreecommitdiffhomepage
path: root/lib/util.ml
diff options
context:
space:
mode:
authorGravatar regisgia <regisgia@85f007b7-540e-0410-9357-904b9bb8a0f7>2009-12-20 21:39:20 +0000
committerGravatar regisgia <regisgia@85f007b7-540e-0410-9357-904b9bb8a0f7>2009-12-20 21:39:20 +0000
commitc4ed1e18335c74750a55b22fc1d9c824fa32dc11 (patch)
treef25a5b7808f4bdbf8b929d5f3a58023204b057d3 /lib/util.ml
parentfc3b7b356e68dba0bbabd8bf755c3a836f6a6396 (diff)
* Rewrite [classify_unicode] using standard unicode tables.
(This should be a conservative extension of the old version.) git-svn-id: svn+ssh://scm.gforge.inria.fr/svn/coq/trunk@12601 85f007b7-540e-0410-9357-904b9bb8a0f7
Diffstat (limited to 'lib/util.ml')
-rw-r--r--lib/util.ml222
1 files changed, 94 insertions, 128 deletions
diff --git a/lib/util.ml b/lib/util.ml
index 5e2a071a9..69a1e9f8b 100644
--- a/lib/util.ml
+++ b/lib/util.ml
@@ -167,133 +167,100 @@ type utf8_status = UnicodeLetter | UnicodeIdentPart | UnicodeSymbol
exception UnsupportedUtf8
-let classify_unicode unicode =
- match unicode land 0x1F000 with
- | 0x0 ->
- begin match unicode with
- (* utf-8 Basic Latin underscore *)
- | x when x = 0x005F -> UnicodeLetter
- (* utf-8 Basic Latin letters *)
- | x when 0x0041 <= x & x <= 0x005A -> UnicodeLetter
- | x when 0x0061 <= x & x <= 0x007A -> UnicodeLetter
- (* utf-8 Basic Latin digits and quote *)
- | x when 0x0030 <= x & x <= 0x0039 or x = 0x0027 -> UnicodeIdentPart
- (* utf-8 Basic Latin symbols *)
- | x when x <= 0x007F -> UnicodeSymbol
- (* utf-8 Latin-1 non breaking space U00A0 *)
- | 0x00A0 -> UnicodeLetter
- (* utf-8 Latin-1 symbols U00A1-00BF *)
- | x when 0x00A0 <= x & x <= 0x00BF -> UnicodeSymbol
- (* utf-8 Latin-1 letters U00C0-00D6 *)
- | x when 0x00C0 <= x & x <= 0x00D6 -> UnicodeLetter
- (* utf-8 Latin-1 symbol U00D7 *)
- | 0x00D7 -> UnicodeSymbol
- (* utf-8 Latin-1 letters U00D8-00F6 *)
- | x when 0x00D8 <= x & x <= 0x00F6 -> UnicodeLetter
- (* utf-8 Latin-1 symbol U00F7 *)
- | 0x00F7 -> UnicodeSymbol
- (* utf-8 Latin-1 letters U00F8-00FF *)
- | x when 0x00F8 <= x & x <= 0x00FF -> UnicodeLetter
- (* utf-8 Latin Extended A U0100-017F and Latin Extended B U0180-U0241 *)
- | x when 0x0100 <= x & x <= 0x0241 -> UnicodeLetter
- (* utf-8 Phonetic letters U0250-02AF *)
- | x when 0x0250 <= x & x <= 0x02AF -> UnicodeLetter
- (* utf-8 what do to with diacritics U0300-U036F ? *)
- (* utf-8 Greek letters U0380-03FF *)
- | x when 0x0380 <= x & x <= 0x03FF -> UnicodeLetter
- (* utf-8 Cyrillic letters U0400-0481 *)
- | x when 0x0400 <= x & x <= 0x0481 -> UnicodeLetter
- (* utf-8 Cyrillic symbol U0482 *)
- | 0x0482 -> UnicodeSymbol
- (* utf-8 what do to with diacritics U0483-U0489 \ U0487 ? *)
- (* utf-8 Cyrillic letters U048A-U4F9 (Warning: 04CF) *)
- | x when 0x048A <= x & x <= 0x04F9 -> UnicodeLetter
- (* utf-8 Cyrillic supplement letters U0500-U050F *)
- | x when 0x0500 <= x & x <= 0x050F -> UnicodeLetter
- (* utf-8 Hebrew letters U05D0-05EA *)
- | x when 0x05D0 <= x & x <= 0x05EA -> UnicodeLetter
- (* utf-8 Arabic letters U0621-064A *)
- | x when 0x0621 <= x & x <= 0x064A -> UnicodeLetter
- (* utf-8 Arabic supplement letters U0750-076D *)
- | x when 0x0750 <= x & x <= 0x076D -> UnicodeLetter
- | _ -> raise UnsupportedUtf8
- end
- | 0x1000 ->
- begin match unicode with
- (* utf-8 Georgian U10A0-10FF (has holes) *)
- | x when 0x10A0 <= x & x <= 0x10FF -> UnicodeLetter
- (* utf-8 Hangul Jamo U1100-11FF (has holes) *)
- | x when 0x1100 <= x & x <= 0x11FF -> UnicodeLetter
- (* utf-8 Latin additional letters U1E00-1E9B and U1EA0-1EF9 *)
- | x when 0x1E00 <= x & x <= 0x1E9B -> UnicodeLetter
- | x when 0x1EA0 <= x & x <= 0x1EF9 -> UnicodeLetter
- | _ -> raise UnsupportedUtf8
- end
- | 0x2000 ->
- begin match unicode with
- (* utf-8 general punctuation U2080-2089 *)
- (* Hyphens *)
- | x when 0x2010 <= x & x <= 0x2011 -> UnicodeLetter
- (* Dashes and other symbols *)
- | x when 0x2012 <= x & x <= 0x2027 -> UnicodeSymbol
- (* Per mille and per ten thousand signs *)
- | x when 0x2030 <= x & x <= 0x2031 -> UnicodeSymbol
- (* Prime letters *)
- | x when 0x2032 <= x & x <= 0x2034 or x = 0x2057 -> UnicodeIdentPart
- (* Miscellaneous punctuation *)
- | x when 0x2039 <= x & x <= 0x2056 -> UnicodeSymbol
- | x when 0x2058 <= x & x <= 0x205E -> UnicodeSymbol
- (* Invisible mathematical operators *)
- | x when 0x2061 <= x & x <= 0x2063 -> UnicodeSymbol
- (* utf-8 superscript U2070-207C *)
- | x when 0x2070 <= x & x <= 0x207C -> UnicodeSymbol
- (* utf-8 subscript U2080-2089 *)
- | x when 0x2080 <= x & x <= 0x2089 -> UnicodeIdentPart
- (* utf-8 letter-like U2100-214F *)
- | x when 0x2100 <= x & x <= 0x214F -> UnicodeLetter
- (* utf-8 number-forms U2153-2183 *)
- | x when 0x2153 <= x & x <= 0x2183 -> UnicodeSymbol
- (* utf-8 arrows A U2190-21FF *)
- (* utf-8 mathematical operators U2200-22FF *)
- (* utf-8 miscellaneous technical U2300-23FF *)
- | x when 0x2190 <= x & x <= 0x23FF -> UnicodeSymbol
- (* utf-8 box drawing U2500-257F has ceiling, etc. *)
- (* utf-8 block elements U2580-259F *)
- (* utf-8 geom. shapes U25A0-25FF (has triangles, losange, etc) *)
- (* utf-8 miscellaneous symbols U2600-26FF *)
- | x when 0x2500 <= x & x <= 0x26FF -> UnicodeSymbol
- (* utf-8 arrows B U2900-297F *)
- | x when 0x2900 <= x & x <= 0x297F -> UnicodeSymbol
- (* utf-8 mathematical operators U2A00-2AFF *)
- | x when 0x2A00 <= x & x <= 0x2AFF -> UnicodeSymbol
- (* utf-8 bold symbols U2768-U2775 *)
- | x when 0x2768 <= x & x <= 0x2775 -> UnicodeSymbol
- (* utf-8 arrows and brackets U27E0-U27FF *)
- | x when 0x27E0 <= x & x <= 0x27FF -> UnicodeSymbol
- (* utf-8 brackets, braces and parentheses *)
- | x when 0x2980 <= x & x <= 0x29FF -> UnicodeSymbol
- (* utf-8 miscellaneous including double-plus U29F0-U29FF *)
- | x when 0x29F0 <= x & x <= 0x29FF -> UnicodeSymbol
- | _ -> raise UnsupportedUtf8
- end
- | _ ->
- begin match unicode with
- (* utf-8 CJC Symbols and Punctuation *)
- | x when 0x3008 <= x & x <= 0x3020 -> UnicodeSymbol
- (* utf-8 Hiragana U3040-309F and Katakana U30A0-30FF *)
- | x when 0x3040 <= x & x <= 0x30FF -> UnicodeLetter
- (* utf-8 Unified CJK Ideographs U4E00-9FA5 *)
- | x when 0x4E00 <= x & x <= 0x9FA5 -> UnicodeLetter
- (* utf-8 Hangul syllables UAC00-D7AF *)
- | x when 0xAC00 <= x & x <= 0xD7AF -> UnicodeLetter
- (* utf-8 Gothic U10330-1034A *)
- | x when 0x10330 <= x & x <= 0x1034A -> UnicodeLetter
- (* utf-8 Math Alphanumeric Symbols U1D400-1D7FF (letters) (has holes) *)
- | x when 0x1D400 <= x & x <= 0x1D7CB -> UnicodeLetter
- (* utf-8 Math Alphanumeric Symbols U1D400-1D7FF (digits) *)
- | x when 0x1D7CE <= x & x <= 0x1D7FF -> UnicodeIdentPart
- | _ -> raise UnsupportedUtf8
- end
+(* The following table stores classes of Unicode characters that
+ are used by the lexer. There are 3 different classes so 2 bits are
+ allocated for each character. We only use 16 bits over the 31 bits
+ to simplify the masking process. (This choice seems to be a good
+ trade-off between speed and space after some benchmarks.) *)
+
+(* A 256ko table, initially filled with zeros. *)
+let table = Array.create (1 lsl 17) 0
+
+(* Associate a 2-bit pattern to each status at position [i].
+ Only the 3 lowest bits of [i] are taken into account to
+ define the position of the pattern in the word.
+ Notice that pattern "00" means "undefined". *)
+let mask i = function
+ | UnicodeLetter -> 1 lsl ((i land 7) lsl 1) (* 01 *)
+ | UnicodeIdentPart -> 2 lsl ((i land 7) lsl 1) (* 10 *)
+ | UnicodeSymbol -> 3 lsl ((i land 7) lsl 1) (* 11 *)
+
+(* Helper to reset 2 bits in a word. *)
+let reset_mask i =
+ lnot (3 lsl ((i land 7) lsl 1))
+
+(* Initialize the lookup table from a list of segments, assigning
+ a status to every character of each segment. The order of these
+ assignments is relevant: it is possible to assign status [s] to
+ a segment [(c1, c2)] and later assign [s'] to [c] even if [c] is
+ between [c1] and [c2]. *)
+let mk_lookup_table_from_unicode_tables_for status tables =
+ List.iter
+ (List.iter
+ (fun (c1, c2) ->
+ for i = c1 to c2 do
+ table.(i lsr 3) <-
+ (table.(i lsr 3) land (reset_mask i)) lor (mask i status)
+ done))
+ tables
+
+(* Look up into the table and interpret the found pattern. *)
+let lookup x =
+ let v = (table.(x lsr 3) lsr ((x land 7) lsl 1)) land 3 in
+ if v = 1 then UnicodeLetter
+ else if v = 2 then UnicodeIdentPart
+ else if v = 3 then UnicodeSymbol
+ else raise UnsupportedUtf8
+
+(* [classify_unicode] discriminates between 3 different kinds of
+ symbols based on the standard unicode classification (extracted from
+ Camomile). *)
+let classify_unicode =
+ let single c = [ (c, c) ] in
+ (* General tables. *)
+ mk_lookup_table_from_unicode_tables_for UnicodeSymbol
+ [
+ Unicodetable.sm; (* Symbol, maths. *)
+ Unicodetable.sc; (* Symbol, currency. *)
+ Unicodetable.so; (* Symbol, modifier. *)
+ Unicodetable.pd; (* Punctation, dash. *)
+ Unicodetable.pc; (* Punctation, connector. *)
+ Unicodetable.pe; (* Punctation, open. *)
+ Unicodetable.ps; (* Punctation, close. *)
+ Unicodetable.pi; (* Punctation, initial quote. *)
+ Unicodetable.pf; (* Punctation, final quote. *)
+ Unicodetable.po; (* Punctation, other. *)
+ ];
+ mk_lookup_table_from_unicode_tables_for UnicodeLetter
+ [
+ Unicodetable.lu; (* Letter, uppercase. *)
+ Unicodetable.ll; (* Letter, lowercase. *)
+ Unicodetable.lt; (* Letter, titlecase. *)
+ Unicodetable.lo; (* Letter, others. *)
+ ];
+ mk_lookup_table_from_unicode_tables_for UnicodeIdentPart
+ [
+ Unicodetable.nd; (* Number, decimal digits. *)
+ Unicodetable.nl; (* Number, letter. *)
+ Unicodetable.no; (* Number, other. *)
+ ];
+ (* Exceptions (from a previous version of this function). *)
+ mk_lookup_table_from_unicode_tables_for UnicodeSymbol
+ [
+ single 0x000B2; (* Squared. *)
+ single 0x0002E; (* Dot. *)
+ ];
+ mk_lookup_table_from_unicode_tables_for UnicodeLetter
+ [
+ single 0x005F; (* Underscore. *)
+ single 0x00A0; (* Non breaking space. *)
+ ];
+ mk_lookup_table_from_unicode_tables_for UnicodeIdentPart
+ [
+ single 0x0027; (* Special space. *)
+ ];
+ (* Lookup *)
+ lookup
exception End_of_input
@@ -1524,4 +1491,3 @@ let heap_size_kb () = (heap_size () + 1023) / 1024
let interrupt = ref false
let check_for_interrupt () =
if !interrupt then begin interrupt := false; raise Sys.Break end
-