From 9a2bb3a6d12a082c61dfda62be53c195fe3cb57c Mon Sep 17 00:00:00 2001 From: Hugo Herbelin Date: Wed, 4 Oct 2017 22:55:51 +0200 Subject: Fixing typos in comments of unicode.ml. --- lib/unicode.ml | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'lib') diff --git a/lib/unicode.ml b/lib/unicode.ml index 959ccaf73..9e79f1048 100644 --- a/lib/unicode.ml +++ b/lib/unicode.ml @@ -67,13 +67,13 @@ let classify = Unicodetable.sm; (* Symbol, maths. *) Unicodetable.sc; (* Symbol, currency. *) Unicodetable.so; (* Symbol, modifier. *) - Unicodetable.pd; (* Punctation, dash. *) - Unicodetable.pc; (* Punctation, connector. *) - Unicodetable.pe; (* Punctation, open. *) - Unicodetable.ps; (* Punctation, close. *) - Unicodetable.pi; (* Punctation, initial quote. *) - Unicodetable.pf; (* Punctation, final quote. *) - Unicodetable.po; (* Punctation, other. *) + Unicodetable.pd; (* Punctuation, dash. *) + Unicodetable.pc; (* Punctuation, connector. *) + Unicodetable.pe; (* Punctuation, open. *) + Unicodetable.ps; (* Punctution, close. *) + Unicodetable.pi; (* Punctuation, initial quote. *) + Unicodetable.pf; (* Punctuation, final quote. *) + Unicodetable.po; (* Punctuation, other. *) ]; mk_lookup_table_from_unicode_tables_for Letter [ @@ -114,7 +114,7 @@ let classify = ]; mk_lookup_table_from_unicode_tables_for IdentPart [ - single 0x0027; (* Special space. *) + single 0x0027; (* Single quote. *) ]; (* Lookup *) lookup -- cgit v1.2.3 From 526791d917f9b0804376eae02a462a3b32dd7cba Mon Sep 17 00:00:00 2001 From: Hugo Herbelin Date: Wed, 4 Oct 2017 22:00:19 +0200 Subject: Distinguishing pseudo-letters out of the set of unicode letters. This includes _ and insecable space which can be used in idents and this allows more precise heuristics. --- lib/unicode.ml | 53 +++++++++++++++++++++++++++++++++-------------------- lib/unicode.mli | 11 ++++++++++- parsing/cLexer.ml4 | 16 ++++++++-------- 3 files changed, 51 insertions(+), 29 deletions(-) (limited to 'lib') diff --git a/lib/unicode.ml b/lib/unicode.ml index 9e79f1048..528d6434c 100644 --- a/lib/unicode.ml +++ b/lib/unicode.ml @@ -8,13 +8,14 @@ (** Unicode utilities *) -type status = Letter | IdentPart | Symbol | Unknown +type status = Letter | IdentPart | Symbol | IdentSep | Unknown (* The following table stores classes of Unicode characters that - are used by the lexer. There are 3 different classes so 2 bits are - allocated for each character. We only use 16 bits over the 31 bits - to simplify the masking process. (This choice seems to be a good - trade-off between speed and space after some benchmarks.) *) + are used by the lexer. There are 5 different classes so 3 bits + are allocated for each character. We encode the masks of 8 + characters per word, thus using 24 bits over the 31 available + bits. (This choice seems to be a good trade-off between speed + and space after some benchmarks.) *) (* A 256 KiB table, initially filled with zeros. *) let table = Array.make (1 lsl 17) 0 @@ -24,14 +25,15 @@ let table = Array.make (1 lsl 17) 0 define the position of the pattern in the word. Notice that pattern "00" means "undefined". *) let mask i = function - | Letter -> 1 lsl ((i land 7) lsl 1) (* 01 *) - | IdentPart -> 2 lsl ((i land 7) lsl 1) (* 10 *) - | Symbol -> 3 lsl ((i land 7) lsl 1) (* 11 *) - | Unknown -> 0 lsl ((i land 7) lsl 1) (* 00 *) + | Letter -> 1 lsl ((i land 7) * 3) (* 001 *) + | IdentPart -> 2 lsl ((i land 7) * 3) (* 010 *) + | Symbol -> 3 lsl ((i land 7) * 3) (* 011 *) + | IdentSep -> 4 lsl ((i land 7) * 3) (* 100 *) + | Unknown -> 0 lsl ((i land 7) * 3) (* 000 *) -(* Helper to reset 2 bits in a word. *) +(* Helper to reset 3 bits in a word. *) let reset_mask i = - lnot (3 lsl ((i land 7) lsl 1)) + lnot (7 lsl ((i land 7) * 3)) (* Initialize the lookup table from a list of segments, assigning a status to every character of each segment. The order of these @@ -50,13 +52,14 @@ let mk_lookup_table_from_unicode_tables_for status tables = (* Look up into the table and interpret the found pattern. *) let lookup x = - let v = (table.(x lsr 3) lsr ((x land 7) lsl 1)) land 3 in + let v = (table.(x lsr 3) lsr ((x land 7) * 3)) land 7 in if v = 1 then Letter else if v = 2 then IdentPart else if v = 3 then Symbol + else if v = 4 then IdentSep else Unknown -(* [classify] discriminates between 3 different kinds of +(* [classify] discriminates between 5 different kinds of symbols based on the standard unicode classification (extracted from Camomile). *) let classify = @@ -107,7 +110,7 @@ let classify = [(0x02074, 0x02079)]; (* Superscript 4-9. *) single 0x0002E; (* Dot. *) ]; - mk_lookup_table_from_unicode_tables_for Letter + mk_lookup_table_from_unicode_tables_for IdentSep [ single 0x005F; (* Underscore. *) single 0x00A0; (* Non breaking space. *) @@ -165,22 +168,32 @@ let is_utf8 s = (* Check the well-formedness of an identifier *) +let is_valid_ident_initial = function + | Letter | IdentSep -> true + | IdentPart | Symbol | Unknown -> false + let initial_refutation j n s = - match classify n with - | Letter -> None - | _ -> + if is_valid_ident_initial (classify n) then None + else let c = String.sub s 0 j in Some (false, "Invalid character '"^c^"' at beginning of identifier \""^s^"\".") +let is_valid_ident_trailing = function + | Letter | IdentSep | IdentPart -> true + | Symbol | Unknown -> false + let trailing_refutation i j n s = - match classify n with - | Letter | IdentPart -> None - | _ -> + if is_valid_ident_trailing (classify n) then None + else let c = String.sub s i j in Some (false, "Invalid character '"^c^"' in identifier \""^s^"\".") +let is_unknown = function + | Unknown -> true + | Letter | IdentSep | IdentPart | Symbol -> false + let ident_refutation s = if s = ".." then None else try let j, n = next_utf8 s 0 in diff --git a/lib/unicode.mli b/lib/unicode.mli index c7d742480..a608d5f02 100644 --- a/lib/unicode.mli +++ b/lib/unicode.mli @@ -8,7 +8,7 @@ (** Unicode utilities *) -type status = Letter | IdentPart | Symbol | Unknown +type status (** Classify a unicode char into 3 classes or unknown. *) val classify : int -> status @@ -17,6 +17,15 @@ val classify : int -> status Return [Some (b,s)] otherwise, where [s] is an explanation and [b] is severity. *) val ident_refutation : string -> (bool * string) option +(** Tells if a valid initial character for an identifier *) +val is_valid_ident_initial : status -> bool + +(** Tells if a valid non-initial character for an identifier *) +val is_valid_ident_trailing : status -> bool + +(** Tells if a character is unclassified *) +val is_unknown : status -> bool + (** First char of a string, converted to lowercase @raise Assert_failure if the input string is empty. *) val lowercase_first_char : string -> string diff --git a/parsing/cLexer.ml4 b/parsing/cLexer.ml4 index 9c9189ffe..f26398fa9 100644 --- a/parsing/cLexer.ml4 +++ b/parsing/cLexer.ml4 @@ -245,8 +245,8 @@ let check_ident str = loop_id true s | [< s >] -> match unlocated lookup_utf8 Ploc.dummy s with - | Utf8Token (Unicode.Letter, n) -> njunk n s; loop_id true s - | Utf8Token (Unicode.IdentPart, n) when intail -> + | Utf8Token (st, n) when not intail && Unicode.is_valid_ident_initial st -> njunk n s; loop_id true s + | Utf8Token (st, n) when intail && Unicode.is_valid_ident_trailing st -> njunk n s; loop_id true s | EmptyStream -> () @@ -311,9 +311,9 @@ let rec ident_tail loc len = parser ident_tail loc (store len c) s | [< s >] -> match lookup_utf8 loc s with - | Utf8Token ((Unicode.IdentPart | Unicode.Letter), n) -> + | Utf8Token (st, n) when Unicode.is_valid_ident_trailing st -> ident_tail loc (nstore n len s) s - | Utf8Token (Unicode.Unknown, n) -> + | Utf8Token (st, n) when Unicode.is_unknown st -> let id = get_buff len in let u = String.concat "" (List.map (String.make 1) (Stream.npeek n s)) in warn_unrecognized_unicode ~loc:!@loc (u,id); len @@ -539,7 +539,7 @@ let parse_after_dot loc c bp = (try find_keyword loc ("."^field) s with Not_found -> FIELD field) | [< s >] -> match lookup_utf8 loc s with - | Utf8Token (Unicode.Letter, n) -> + | Utf8Token (st, n) when Unicode.is_valid_ident_initial st -> let len = ident_tail loc (nstore n 0 s) s in let field = get_buff len in (try find_keyword loc ("."^field) s with Not_found -> FIELD field) @@ -553,7 +553,7 @@ let parse_after_qmark loc bp s = | None -> KEYWORD "?" | _ -> match lookup_utf8 loc s with - | Utf8Token (Unicode.Letter, _) -> LEFTQMARK + | Utf8Token (st, _) when Unicode.is_valid_ident_initial st -> LEFTQMARK | AsciiChar | Utf8Token _ | EmptyStream -> fst (process_chars loc bp '?' s) @@ -618,13 +618,13 @@ let rec next_token loc = parser bp comment_stop bp; between_commands := new_between_commands; t | [< s >] -> match lookup_utf8 loc s with - | Utf8Token (Unicode.Letter, n) -> + | Utf8Token (st, n) when Unicode.is_valid_ident_initial st -> let len = ident_tail loc (nstore n 0 s) s in let id = get_buff len in let ep = Stream.count s in comment_stop bp; (try find_keyword loc id s with Not_found -> IDENT id), set_loc_pos loc bp ep - | AsciiChar | Utf8Token ((Unicode.Symbol | Unicode.IdentPart | Unicode.Unknown), _) -> + | AsciiChar | Utf8Token _ -> let t = process_chars loc bp (Stream.next s) s in comment_stop bp; t | EmptyStream -> -- cgit v1.2.3 From 40260a31cd197f655e6d3e0570a88d96fc1a9cac Mon Sep 17 00:00:00 2001 From: Hugo Herbelin Date: Wed, 4 Oct 2017 15:37:36 +0200 Subject: Fixing BZ#5769 (variable of type "_something" was named after invalid "_"). --- engine/namegen.ml | 14 +++++++++++++- lib/unicode.ml | 28 ++++++++++++++++++++++++++++ lib/unicode.mli | 4 ++++ test-suite/bugs/closed/5769.v | 20 ++++++++++++++++++++ 4 files changed, 65 insertions(+), 1 deletion(-) create mode 100644 test-suite/bugs/closed/5769.v (limited to 'lib') diff --git a/engine/namegen.ml b/engine/namegen.ml index a75fe721f..6c6a54127 100644 --- a/engine/namegen.ml +++ b/engine/namegen.ml @@ -43,6 +43,8 @@ let default_non_dependent_ident = Id.of_string default_non_dependent_string let default_dependent_ident = Id.of_string "x" +let default_generated_non_letter_string = "x" + (**********************************************************************) (* Globality of identifiers *) @@ -107,7 +109,17 @@ let head_name sigma c = (* Find the head constant of a constr if any *) hdrec c let lowercase_first_char id = (* First character of a constr *) - Unicode.lowercase_first_char (Id.to_string id) + let s = Id.to_string id in + match Unicode.split_at_first_letter s with + | None -> + (* General case: nat -> n *) + Unicode.lowercase_first_char s + | Some (s,s') -> + if String.length s' = 0 then + (* No letter, e.g. __, or __'_, etc. *) + default_generated_non_letter_string + else + s ^ Unicode.lowercase_first_char s' let sort_hdchar = function | Prop(_) -> "P" diff --git a/lib/unicode.ml b/lib/unicode.ml index 528d6434c..7f3743900 100644 --- a/lib/unicode.ml +++ b/lib/unicode.ml @@ -194,6 +194,14 @@ let is_unknown = function | Unknown -> true | Letter | IdentSep | IdentPart | Symbol -> false +let is_ident_part = function + | IdentPart -> true + | Letter | IdentSep | Symbol | Unknown -> false + +let is_ident_sep = function + | IdentSep -> true + | Letter | IdentPart | Symbol | Unknown -> false + let ident_refutation s = if s = ".." then None else try let j, n = next_utf8 s 0 in @@ -227,6 +235,26 @@ let lowercase_first_char s = let j, n = next_utf8 s 0 in utf8_of_unicode (lowercase_unicode n) +let split_at_first_letter s = + let n, v = next_utf8 s 0 in + if ((* optim *) n = 1 && s.[0] != '_') || not (is_ident_sep (classify v)) then None + else begin + let n = ref n in + let p = ref 0 in + while !n < String.length s && + let n', v = next_utf8 s !n in + p := n'; + (* Test if not letter *) + ((* optim *) n' = 1 && (s.[!n] = '_' || s.[!n] = '\'')) + || let st = classify v in + is_ident_sep st || is_ident_part st + do n := !n + !p + done; + let s1 = String.sub s 0 !n in + let s2 = String.sub s !n (String.length s - !n) in + Some (s1,s2) + end + (** For extraction, we need to encode unicode character into ascii ones *) let is_basic_ascii s = diff --git a/lib/unicode.mli b/lib/unicode.mli index a608d5f02..49b844493 100644 --- a/lib/unicode.mli +++ b/lib/unicode.mli @@ -30,6 +30,10 @@ val is_unknown : status -> bool @raise Assert_failure if the input string is empty. *) val lowercase_first_char : string -> string +(** Split a string supposed to be an ident at the first letter; + as an optimization, return None if the first character is a letter *) +val split_at_first_letter : string -> (string * string) option + (** Return [true] if all UTF-8 characters in the input string are just plain ASCII characters. Returns [false] otherwise. *) val is_basic_ascii : string -> bool diff --git a/test-suite/bugs/closed/5769.v b/test-suite/bugs/closed/5769.v new file mode 100644 index 000000000..42573aad8 --- /dev/null +++ b/test-suite/bugs/closed/5769.v @@ -0,0 +1,20 @@ +(* Check a few naming heuristics based on types *) +(* was buggy for types names _something *) + +Inductive _foo :=. +Lemma bob : (sigT (fun x : nat => _foo)) -> _foo. +destruct 1. +exact _f. +Abort. + +Inductive _'Foo :=. +Lemma bob : (sigT (fun x : nat => _'Foo)) -> _'Foo. +destruct 1. +exact _'f. +Abort. + +Inductive ____ :=. +Lemma bob : (sigT (fun x : nat => ____)) -> ____. +destruct 1. +exact x0. +Abort. -- cgit v1.2.3