diff options
author | regisgia <regisgia@85f007b7-540e-0410-9357-904b9bb8a0f7> | 2009-12-20 21:39:20 +0000 |
---|---|---|
committer | regisgia <regisgia@85f007b7-540e-0410-9357-904b9bb8a0f7> | 2009-12-20 21:39:20 +0000 |
commit | c4ed1e18335c74750a55b22fc1d9c824fa32dc11 (patch) | |
tree | f25a5b7808f4bdbf8b929d5f3a58023204b057d3 /lib/util.ml | |
parent | fc3b7b356e68dba0bbabd8bf755c3a836f6a6396 (diff) |
* Rewrite [classify_unicode] using standard unicode tables.
(This should be a conservative extension of the old version.)
git-svn-id: svn+ssh://scm.gforge.inria.fr/svn/coq/trunk@12601 85f007b7-540e-0410-9357-904b9bb8a0f7
Diffstat (limited to 'lib/util.ml')
-rw-r--r-- | lib/util.ml | 222 |
1 files changed, 94 insertions, 128 deletions
diff --git a/lib/util.ml b/lib/util.ml index 5e2a071a9..69a1e9f8b 100644 --- a/lib/util.ml +++ b/lib/util.ml @@ -167,133 +167,100 @@ type utf8_status = UnicodeLetter | UnicodeIdentPart | UnicodeSymbol exception UnsupportedUtf8 -let classify_unicode unicode = - match unicode land 0x1F000 with - | 0x0 -> - begin match unicode with - (* utf-8 Basic Latin underscore *) - | x when x = 0x005F -> UnicodeLetter - (* utf-8 Basic Latin letters *) - | x when 0x0041 <= x & x <= 0x005A -> UnicodeLetter - | x when 0x0061 <= x & x <= 0x007A -> UnicodeLetter - (* utf-8 Basic Latin digits and quote *) - | x when 0x0030 <= x & x <= 0x0039 or x = 0x0027 -> UnicodeIdentPart - (* utf-8 Basic Latin symbols *) - | x when x <= 0x007F -> UnicodeSymbol - (* utf-8 Latin-1 non breaking space U00A0 *) - | 0x00A0 -> UnicodeLetter - (* utf-8 Latin-1 symbols U00A1-00BF *) - | x when 0x00A0 <= x & x <= 0x00BF -> UnicodeSymbol - (* utf-8 Latin-1 letters U00C0-00D6 *) - | x when 0x00C0 <= x & x <= 0x00D6 -> UnicodeLetter - (* utf-8 Latin-1 symbol U00D7 *) - | 0x00D7 -> UnicodeSymbol - (* utf-8 Latin-1 letters U00D8-00F6 *) - | x when 0x00D8 <= x & x <= 0x00F6 -> UnicodeLetter - (* utf-8 Latin-1 symbol U00F7 *) - | 0x00F7 -> UnicodeSymbol - (* utf-8 Latin-1 letters U00F8-00FF *) - | x when 0x00F8 <= x & x <= 0x00FF -> UnicodeLetter - (* utf-8 Latin Extended A U0100-017F and Latin Extended B U0180-U0241 *) - | x when 0x0100 <= x & x <= 0x0241 -> UnicodeLetter - (* utf-8 Phonetic letters U0250-02AF *) - | x when 0x0250 <= x & x <= 0x02AF -> UnicodeLetter - (* utf-8 what do to with diacritics U0300-U036F ? *) - (* utf-8 Greek letters U0380-03FF *) - | x when 0x0380 <= x & x <= 0x03FF -> UnicodeLetter - (* utf-8 Cyrillic letters U0400-0481 *) - | x when 0x0400 <= x & x <= 0x0481 -> UnicodeLetter - (* utf-8 Cyrillic symbol U0482 *) - | 0x0482 -> UnicodeSymbol - (* utf-8 what do to with diacritics U0483-U0489 \ U0487 ? *) - (* utf-8 Cyrillic letters U048A-U4F9 (Warning: 04CF) *) - | x when 0x048A <= x & x <= 0x04F9 -> UnicodeLetter - (* utf-8 Cyrillic supplement letters U0500-U050F *) - | x when 0x0500 <= x & x <= 0x050F -> UnicodeLetter - (* utf-8 Hebrew letters U05D0-05EA *) - | x when 0x05D0 <= x & x <= 0x05EA -> UnicodeLetter - (* utf-8 Arabic letters U0621-064A *) - | x when 0x0621 <= x & x <= 0x064A -> UnicodeLetter - (* utf-8 Arabic supplement letters U0750-076D *) - | x when 0x0750 <= x & x <= 0x076D -> UnicodeLetter - | _ -> raise UnsupportedUtf8 - end - | 0x1000 -> - begin match unicode with - (* utf-8 Georgian U10A0-10FF (has holes) *) - | x when 0x10A0 <= x & x <= 0x10FF -> UnicodeLetter - (* utf-8 Hangul Jamo U1100-11FF (has holes) *) - | x when 0x1100 <= x & x <= 0x11FF -> UnicodeLetter - (* utf-8 Latin additional letters U1E00-1E9B and U1EA0-1EF9 *) - | x when 0x1E00 <= x & x <= 0x1E9B -> UnicodeLetter - | x when 0x1EA0 <= x & x <= 0x1EF9 -> UnicodeLetter - | _ -> raise UnsupportedUtf8 - end - | 0x2000 -> - begin match unicode with - (* utf-8 general punctuation U2080-2089 *) - (* Hyphens *) - | x when 0x2010 <= x & x <= 0x2011 -> UnicodeLetter - (* Dashes and other symbols *) - | x when 0x2012 <= x & x <= 0x2027 -> UnicodeSymbol - (* Per mille and per ten thousand signs *) - | x when 0x2030 <= x & x <= 0x2031 -> UnicodeSymbol - (* Prime letters *) - | x when 0x2032 <= x & x <= 0x2034 or x = 0x2057 -> UnicodeIdentPart - (* Miscellaneous punctuation *) - | x when 0x2039 <= x & x <= 0x2056 -> UnicodeSymbol - | x when 0x2058 <= x & x <= 0x205E -> UnicodeSymbol - (* Invisible mathematical operators *) - | x when 0x2061 <= x & x <= 0x2063 -> UnicodeSymbol - (* utf-8 superscript U2070-207C *) - | x when 0x2070 <= x & x <= 0x207C -> UnicodeSymbol - (* utf-8 subscript U2080-2089 *) - | x when 0x2080 <= x & x <= 0x2089 -> UnicodeIdentPart - (* utf-8 letter-like U2100-214F *) - | x when 0x2100 <= x & x <= 0x214F -> UnicodeLetter - (* utf-8 number-forms U2153-2183 *) - | x when 0x2153 <= x & x <= 0x2183 -> UnicodeSymbol - (* utf-8 arrows A U2190-21FF *) - (* utf-8 mathematical operators U2200-22FF *) - (* utf-8 miscellaneous technical U2300-23FF *) - | x when 0x2190 <= x & x <= 0x23FF -> UnicodeSymbol - (* utf-8 box drawing U2500-257F has ceiling, etc. *) - (* utf-8 block elements U2580-259F *) - (* utf-8 geom. shapes U25A0-25FF (has triangles, losange, etc) *) - (* utf-8 miscellaneous symbols U2600-26FF *) - | x when 0x2500 <= x & x <= 0x26FF -> UnicodeSymbol - (* utf-8 arrows B U2900-297F *) - | x when 0x2900 <= x & x <= 0x297F -> UnicodeSymbol - (* utf-8 mathematical operators U2A00-2AFF *) - | x when 0x2A00 <= x & x <= 0x2AFF -> UnicodeSymbol - (* utf-8 bold symbols U2768-U2775 *) - | x when 0x2768 <= x & x <= 0x2775 -> UnicodeSymbol - (* utf-8 arrows and brackets U27E0-U27FF *) - | x when 0x27E0 <= x & x <= 0x27FF -> UnicodeSymbol - (* utf-8 brackets, braces and parentheses *) - | x when 0x2980 <= x & x <= 0x29FF -> UnicodeSymbol - (* utf-8 miscellaneous including double-plus U29F0-U29FF *) - | x when 0x29F0 <= x & x <= 0x29FF -> UnicodeSymbol - | _ -> raise UnsupportedUtf8 - end - | _ -> - begin match unicode with - (* utf-8 CJC Symbols and Punctuation *) - | x when 0x3008 <= x & x <= 0x3020 -> UnicodeSymbol - (* utf-8 Hiragana U3040-309F and Katakana U30A0-30FF *) - | x when 0x3040 <= x & x <= 0x30FF -> UnicodeLetter - (* utf-8 Unified CJK Ideographs U4E00-9FA5 *) - | x when 0x4E00 <= x & x <= 0x9FA5 -> UnicodeLetter - (* utf-8 Hangul syllables UAC00-D7AF *) - | x when 0xAC00 <= x & x <= 0xD7AF -> UnicodeLetter - (* utf-8 Gothic U10330-1034A *) - | x when 0x10330 <= x & x <= 0x1034A -> UnicodeLetter - (* utf-8 Math Alphanumeric Symbols U1D400-1D7FF (letters) (has holes) *) - | x when 0x1D400 <= x & x <= 0x1D7CB -> UnicodeLetter - (* utf-8 Math Alphanumeric Symbols U1D400-1D7FF (digits) *) - | x when 0x1D7CE <= x & x <= 0x1D7FF -> UnicodeIdentPart - | _ -> raise UnsupportedUtf8 - end +(* The following table stores classes of Unicode characters that + are used by the lexer. There are 3 different classes so 2 bits are + allocated for each character. We only use 16 bits over the 31 bits + to simplify the masking process. (This choice seems to be a good + trade-off between speed and space after some benchmarks.) *) + +(* A 256ko table, initially filled with zeros. *) +let table = Array.create (1 lsl 17) 0 + +(* Associate a 2-bit pattern to each status at position [i]. + Only the 3 lowest bits of [i] are taken into account to + define the position of the pattern in the word. + Notice that pattern "00" means "undefined". *) +let mask i = function + | UnicodeLetter -> 1 lsl ((i land 7) lsl 1) (* 01 *) + | UnicodeIdentPart -> 2 lsl ((i land 7) lsl 1) (* 10 *) + | UnicodeSymbol -> 3 lsl ((i land 7) lsl 1) (* 11 *) + +(* Helper to reset 2 bits in a word. *) +let reset_mask i = + lnot (3 lsl ((i land 7) lsl 1)) + +(* Initialize the lookup table from a list of segments, assigning + a status to every character of each segment. The order of these + assignments is relevant: it is possible to assign status [s] to + a segment [(c1, c2)] and later assign [s'] to [c] even if [c] is + between [c1] and [c2]. *) +let mk_lookup_table_from_unicode_tables_for status tables = + List.iter + (List.iter + (fun (c1, c2) -> + for i = c1 to c2 do + table.(i lsr 3) <- + (table.(i lsr 3) land (reset_mask i)) lor (mask i status) + done)) + tables + +(* Look up into the table and interpret the found pattern. *) +let lookup x = + let v = (table.(x lsr 3) lsr ((x land 7) lsl 1)) land 3 in + if v = 1 then UnicodeLetter + else if v = 2 then UnicodeIdentPart + else if v = 3 then UnicodeSymbol + else raise UnsupportedUtf8 + +(* [classify_unicode] discriminates between 3 different kinds of + symbols based on the standard unicode classification (extracted from + Camomile). *) +let classify_unicode = + let single c = [ (c, c) ] in + (* General tables. *) + mk_lookup_table_from_unicode_tables_for UnicodeSymbol + [ + Unicodetable.sm; (* Symbol, maths. *) + Unicodetable.sc; (* Symbol, currency. *) + Unicodetable.so; (* Symbol, modifier. *) + Unicodetable.pd; (* Punctation, dash. *) + Unicodetable.pc; (* Punctation, connector. *) + Unicodetable.pe; (* Punctation, open. *) + Unicodetable.ps; (* Punctation, close. *) + Unicodetable.pi; (* Punctation, initial quote. *) + Unicodetable.pf; (* Punctation, final quote. *) + Unicodetable.po; (* Punctation, other. *) + ]; + mk_lookup_table_from_unicode_tables_for UnicodeLetter + [ + Unicodetable.lu; (* Letter, uppercase. *) + Unicodetable.ll; (* Letter, lowercase. *) + Unicodetable.lt; (* Letter, titlecase. *) + Unicodetable.lo; (* Letter, others. *) + ]; + mk_lookup_table_from_unicode_tables_for UnicodeIdentPart + [ + Unicodetable.nd; (* Number, decimal digits. *) + Unicodetable.nl; (* Number, letter. *) + Unicodetable.no; (* Number, other. *) + ]; + (* Exceptions (from a previous version of this function). *) + mk_lookup_table_from_unicode_tables_for UnicodeSymbol + [ + single 0x000B2; (* Squared. *) + single 0x0002E; (* Dot. *) + ]; + mk_lookup_table_from_unicode_tables_for UnicodeLetter + [ + single 0x005F; (* Underscore. *) + single 0x00A0; (* Non breaking space. *) + ]; + mk_lookup_table_from_unicode_tables_for UnicodeIdentPart + [ + single 0x0027; (* Special space. *) + ]; + (* Lookup *) + lookup exception End_of_input @@ -1524,4 +1491,3 @@ let heap_size_kb () = (heap_size () + 1023) / 1024 let interrupt = ref false let check_for_interrupt () = if !interrupt then begin interrupt := false; raise Sys.Break end - |