diff options
author | Maxime Dénès <mail@maximedenes.fr> | 2017-10-05 18:49:12 +0200 |
---|---|---|
committer | Maxime Dénès <mail@maximedenes.fr> | 2017-10-05 18:49:12 +0200 |
commit | 73620255a7a62703a89026336f97a4bc8a913afd (patch) | |
tree | 0b28d62d27b3fe6e94a07845ca8900cc9b3c9a71 | |
parent | f78a39499ef210456dd96ecb9896874c9ce300ae (diff) | |
parent | a47a249e8c31dece9b816bc49d6c5a2aa50fb21b (diff) |
Merge PR #1041: Miscellaneous fixes about UTF-8 (including a fix to BZ#5715 to escape non-UTF-8 file names)
-rw-r--r-- | Makefile.build | 2 | ||||
-rw-r--r-- | doc/refman/RefMan-com.tex | 9 | ||||
-rw-r--r-- | lib/system.ml | 2 | ||||
-rw-r--r-- | lib/unicode.ml | 43 | ||||
-rw-r--r-- | lib/unicode.mli | 3 | ||||
-rw-r--r-- | library/library.ml | 20 | ||||
-rwxr-xr-x | test-suite/misc/deps-utf8.sh | 17 | ||||
-rw-r--r-- | test-suite/misc/deps/αβ/γδ.v | 4 | ||||
-rw-r--r-- | test-suite/misc/deps/αβ/εζ.v | 1 | ||||
-rw-r--r-- | tools/coqdep_lexer.mll | 42 | ||||
-rw-r--r-- | vernac/mltop.ml | 1 |
11 files changed, 94 insertions, 50 deletions
diff --git a/Makefile.build b/Makefile.build index 10d314f0d..ecaaccaaf 100644 --- a/Makefile.build +++ b/Makefile.build @@ -429,6 +429,7 @@ tools: $(TOOLS) $(OCAMLLIBDEP) $(COQDEPBOOT) # is being built. COQDEPBOOTSRC := lib/minisys.cmo \ + lib/segmenttree.cmo lib/unicodetable.cmo lib/unicode.cmo \ tools/coqdep_lexer.cmo tools/coqdep_common.cmo tools/coqdep_boot.cmo tools/coqdep_lexer.cmo : tools/coqdep_lexer.cmi @@ -449,6 +450,7 @@ $(OCAMLLIBDEP): $(call bestobj, tools/ocamllibdep.cmo) # The full coqdep (unused by this build, but distributed by make install) COQDEPCMO:=lib/clib.cma lib/cErrors.cmo lib/cWarnings.cmo lib/minisys.cmo \ + lib/segmenttree.cmo lib/unicodetable.cmo lib/unicode.cmo \ lib/system.cmo tools/coqdep_lexer.cmo tools/coqdep_common.cmo \ tools/coqdep.cmo diff --git a/doc/refman/RefMan-com.tex b/doc/refman/RefMan-com.tex index 9790111f1..892c9931b 100644 --- a/doc/refman/RefMan-com.tex +++ b/doc/refman/RefMan-com.tex @@ -107,6 +107,15 @@ The following command-line options are recognized by the commands {\tt recursively available from {\Coq} using absolute names (extending the {\dirpath} prefix) (see Section~\ref{LongNames}). + Note that only those subdirectories and files which obey the lexical + conventions of what is an {\ident} (see Section~\ref{lexical}) + are taken into account. Conversely, the underlying file systems or + operating systems may be more restrictive than {\Coq}. While Linux's + ext4 file system supports any {\Coq} recursive layout + (within the limit of 255 bytes per file name), the default on NTFS + (Windows) or HFS+ (MacOS X) file systems is on the contrary to + disallow two files differing only in the case in the same directory. + \SeeAlso Section~\ref{Libraries}. \item[{\tt -R} {\em directory} {\dirpath}]\ % diff --git a/lib/system.ml b/lib/system.ml index 12eacf2ea..0b64b237d 100644 --- a/lib/system.ml +++ b/lib/system.ml @@ -52,7 +52,7 @@ let dirmap = ref StrMap.empty let make_dir_table dir = let filter_dotfiles s f = if f.[0] = '.' then s else StrSet.add f s in - Array.fold_left filter_dotfiles StrSet.empty (readdir dir) + Array.fold_left filter_dotfiles StrSet.empty (Sys.readdir dir) let exists_in_dir_respecting_case dir bf = let cache_dir dir = diff --git a/lib/unicode.ml b/lib/unicode.ml index 959ccaf73..8eb2eb45d 100644 --- a/lib/unicode.ml +++ b/lib/unicode.ml @@ -163,6 +163,39 @@ let is_utf8 s = in try check 0 with End_of_input -> true | Invalid_argument _ -> false +(* Escape string if it contains non-utf8 characters *) + +let escaped_non_utf8 s = + let mk_escape x = Printf.sprintf "%%%X" x in + let buff = Buffer.create (String.length s * 3) in + let rec process_trailing_aux i j = + if i = j then i else + match String.unsafe_get s i with + | '\128'..'\191' -> process_trailing_aux (i+1) j + | _ -> i in + let process_trailing i n = + let j = if i+n-1 >= String.length s then i+1 else process_trailing_aux (i+1) (i+n) in + (if j = i+n then + Buffer.add_string buff (String.sub s i n) + else + let v = Array.init (j-i) (fun k -> mk_escape (Char.code s.[i+k])) in + Buffer.add_string buff (String.concat "" (Array.to_list v))); + j in + let rec process i = + if i >= String.length s then Buffer.contents buff else + let c = String.unsafe_get s i in + match c with + | '\000'..'\127' -> Buffer.add_char buff c; process (i+1) + | '\128'..'\191' | '\248'..'\255' -> Buffer.add_string buff (mk_escape (Char.code c)); process (i+1) + | '\192'..'\223' -> process (process_trailing i 2) + | '\224'..'\239' -> process (process_trailing i 3) + | '\240'..'\247' -> process (process_trailing i 4) + in + process 0 + +let escaped_if_non_utf8 s = + if is_utf8 s then s else escaped_non_utf8 s + (* Check the well-formedness of an identifier *) let initial_refutation j n s = @@ -198,7 +231,7 @@ let ident_refutation s = |x -> x with | End_of_input -> Some (true,"The empty string is not an identifier.") - | Invalid_argument _ -> Some (true,s^": invalid utf8 sequence.") + | Invalid_argument _ -> Some (true,escaped_non_utf8 s^": invalid utf8 sequence.") let lowercase_unicode = let tree = Segmenttree.make Unicodetable.to_lower in @@ -268,9 +301,7 @@ let utf8_length s = | '\192'..'\223' -> nc := 1 (* expect 1 continuation byte *) | '\224'..'\239' -> nc := 2 (* expect 2 continuation bytes *) | '\240'..'\247' -> nc := 3 (* expect 3 continuation bytes *) - | '\248'..'\251' -> nc := 4 (* expect 4 continuation bytes *) - | '\252'..'\253' -> nc := 5 (* expect 5 continuation bytes *) - | '\254'..'\255' -> nc := 0 (* invalid byte *) + | '\248'..'\255' -> nc := 0 (* invalid byte *) end ; incr p ; while !p < len && !nc > 0 do @@ -299,9 +330,7 @@ let utf8_sub s start_u len_u = | '\192'..'\223' -> nc := 1 (* expect 1 continuation byte *) | '\224'..'\239' -> nc := 2 (* expect 2 continuation bytes *) | '\240'..'\247' -> nc := 3 (* expect 3 continuation bytes *) - | '\248'..'\251' -> nc := 4 (* expect 4 continuation bytes *) - | '\252'..'\253' -> nc := 5 (* expect 5 continuation bytes *) - | '\254'..'\255' -> nc := 0 (* invalid byte *) + | '\248'..'\255' -> nc := 0 (* invalid byte *) end ; incr p ; while !p < len_b && !nc > 0 do diff --git a/lib/unicode.mli b/lib/unicode.mli index c7d742480..b8e7c33ad 100644 --- a/lib/unicode.mli +++ b/lib/unicode.mli @@ -40,3 +40,6 @@ val utf8_length : string -> int (** Variant of {!String.sub} for UTF-8 strings. *) val utf8_sub : string -> int -> int -> string + +(** Return a "%XX"-escaped string if it contains non UTF-8 characters. *) +val escaped_if_non_utf8 : string -> string diff --git a/library/library.ml b/library/library.ml index 28afa054e..1da2c591d 100644 --- a/library/library.ml +++ b/library/library.ml @@ -620,25 +620,6 @@ let check_coq_overwriting p id = (str "Cannot build module " ++ pr_dirpath p ++ str "." ++ pr_id id ++ str "." ++ spc () ++ str "it starts with prefix \"Coq\" which is reserved for the Coq library.") -(* Verifies that a string starts by a letter and do not contain - others caracters than letters, digits, or `_` *) - -let check_module_name s = - let msg c = - strbrk "Invalid module name: " ++ str s ++ strbrk " character " ++ - (if c = '\'' then str "\"'\"" else (str "'" ++ str (String.make 1 c) ++ str "'")) ++ - strbrk " is not allowed in module names\n" - in - let err c = user_err (msg c) in - match String.get s 0 with - | 'a' .. 'z' | 'A' .. 'Z' -> - for i = 1 to (String.length s)-1 do - match String.get s i with - | 'a' .. 'z' | 'A' .. 'Z' | '0' .. '9' | '_' -> () - | c -> err c - done - | c -> err c - let start_library fo = let ldir0 = try @@ -648,7 +629,6 @@ let start_library fo = in let file = Filename.chop_extension (Filename.basename fo) in let id = Id.of_string file in - check_module_name file; check_coq_overwriting ldir0 id; let ldir = add_dirpath_suffix ldir0 id in Declaremods.start_library ldir; diff --git a/test-suite/misc/deps-utf8.sh b/test-suite/misc/deps-utf8.sh new file mode 100755 index 000000000..13e264c09 --- /dev/null +++ b/test-suite/misc/deps-utf8.sh @@ -0,0 +1,17 @@ +# Check reading directories matching non pure ascii idents +# See bug #5715 (utf-8 working on macos X and linux) +# Windows is still not compliant +a=`uname` +if [ "$a" = "Darwin" -o "$a" = "Linux" ]; then +rm -f misc/deps/théorèmes/*.v +tmpoutput=`mktemp /tmp/coqcheck.XXXXXX` +$coqc -R misc/deps AlphaBêta misc/deps/αβ/γδ.v +R=$? +$coqtop -R misc/deps AlphaBêta -load-vernac-source misc/deps/αβ/εζ.v +S=$? +if [ $R = 0 -a $S = 0 ]; then + exit 0 +else + exit 1 +fi +fi diff --git a/test-suite/misc/deps/αβ/γδ.v b/test-suite/misc/deps/αβ/γδ.v new file mode 100644 index 000000000..f43a2d657 --- /dev/null +++ b/test-suite/misc/deps/αβ/γδ.v @@ -0,0 +1,4 @@ +Theorem simple : forall A, A -> A. +Proof. +auto. +Qed. diff --git a/test-suite/misc/deps/αβ/εζ.v b/test-suite/misc/deps/αβ/εζ.v new file mode 100644 index 000000000..e7fd25c0d --- /dev/null +++ b/test-suite/misc/deps/αβ/εζ.v @@ -0,0 +1 @@ +Require Import γδ. diff --git a/tools/coqdep_lexer.mll b/tools/coqdep_lexer.mll index 8eeb59898..564e20d0e 100644 --- a/tools/coqdep_lexer.mll +++ b/tools/coqdep_lexer.mll @@ -25,8 +25,6 @@ exception Fin_fichier exception Syntax_error of int*int - let field_name s = String.sub s 1 (String.length s - 1) - let unquote_string s = String.sub s 1 (String.length s - 2) @@ -40,6 +38,18 @@ let syntax_error lexbuf = raise (Syntax_error (Lexing.lexeme_start lexbuf, Lexing.lexeme_end lexbuf)) + let check_valid lexbuf s = + match Unicode.ident_refutation s with + | None -> s + | Some _ -> syntax_error lexbuf + + let get_ident lexbuf = + let s = Lexing.lexeme lexbuf in check_valid lexbuf s + + let get_field_name lexbuf = + let s = Lexing.lexeme lexbuf in + check_valid lexbuf (String.sub s 1 (String.length s - 1)) + [@@@ocaml.warning "-3"] (* String.uncapitalize_ascii since 4.03.0 GPR#124 *) let uncapitalize = String.uncapitalize [@@@ocaml.warning "+3"] @@ -52,20 +62,8 @@ let identchar = ['A'-'Z' 'a'-'z' '_' '\'' '0'-'9'] let caml_up_ident = uppercase identchar* let caml_low_ident = lowercase identchar* -let coq_firstchar = - (* This is only an approximation, refer to lib/util.ml for correct def *) - ['A'-'Z' 'a'-'z' '_'] | - (* superscript 1 *) - '\194' '\185' | - (* utf-8 latin 1 supplement *) - '\195' ['\128'-'\150'] | '\195' ['\152'-'\182'] | '\195' ['\184'-'\191'] | - (* utf-8 letters *) - '\206' (['\145'-'\161'] | ['\163'-'\187']) - '\226' ('\130' [ '\128'-'\137' ] (* subscripts *) - | '\129' [ '\176'-'\187' ] (* superscripts *) - | '\132' ['\128'-'\191'] | '\133' ['\128'-'\143']) -let coq_identchar = coq_firstchar | ['\'' '0'-'9'] -let coq_ident = coq_firstchar coq_identchar* +(* This is an overapproximation, we check correctness afterwards *) +let coq_ident = ['A'-'Z' 'a'-'z' '_' '\128'-'\255'] ['A'-'Z' 'a'-'z' '_' '\'' '0'-'9' '\128'-'\255']* let coq_field = '.' coq_ident let dot = '.' ( space+ | eof) @@ -102,7 +100,7 @@ and from_rule = parse | space+ { from_rule lexbuf } | coq_ident - { let from = coq_qual_id_tail [Lexing.lexeme lexbuf] lexbuf in + { let from = coq_qual_id_tail [get_ident lexbuf] lexbuf in consume_require (Some from) lexbuf } | eof { syntax_error lexbuf } @@ -241,7 +239,7 @@ and load_file = parse parse_dot lexbuf; Load (unquote_vfile_string s) } | coq_ident - { let s = lexeme lexbuf in skip_to_dot lexbuf; Load s } + { let s = get_ident lexbuf in skip_to_dot lexbuf; Load s } | eof { syntax_error lexbuf } | _ @@ -253,7 +251,7 @@ and require_file from = parse | space+ { require_file from lexbuf } | coq_ident - { let name = coq_qual_id_tail [Lexing.lexeme lexbuf] lexbuf in + { let name = coq_qual_id_tail [get_ident lexbuf] lexbuf in let qid = coq_qual_id_list [name] lexbuf in parse_dot lexbuf; Require (from, qid) } @@ -278,7 +276,7 @@ and coq_qual_id = parse | space+ { coq_qual_id lexbuf } | coq_ident - { coq_qual_id_tail [Lexing.lexeme lexbuf] lexbuf } + { coq_qual_id_tail [get_ident lexbuf] lexbuf } | _ { syntax_error lexbuf } @@ -288,7 +286,7 @@ and coq_qual_id_tail module_name = parse | space+ { coq_qual_id_tail module_name lexbuf } | coq_field - { coq_qual_id_tail (field_name (Lexing.lexeme lexbuf) :: module_name) lexbuf } + { coq_qual_id_tail (get_field_name lexbuf :: module_name) lexbuf } | eof { syntax_error lexbuf } | _ @@ -301,7 +299,7 @@ and coq_qual_id_list module_names = parse | space+ { coq_qual_id_list module_names lexbuf } | coq_ident - { let name = coq_qual_id_tail [Lexing.lexeme lexbuf] lexbuf in + { let name = coq_qual_id_tail [get_ident lexbuf] lexbuf in coq_qual_id_list (name :: module_names) lexbuf } | eof diff --git a/vernac/mltop.ml b/vernac/mltop.ml index 1edbd1a85..d3de10235 100644 --- a/vernac/mltop.ml +++ b/vernac/mltop.ml @@ -174,6 +174,7 @@ let warn_cannot_use_directory = let convert_string d = try Names.Id.of_string d with UserError _ -> + let d = Unicode.escaped_if_non_utf8 d in warn_cannot_use_directory d; raise Exit |