summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorGravatar Joey Hess <joey@kitenet.net>2014-03-19 14:49:01 -0400
committerGravatar Joey Hess <joey@kitenet.net>2014-03-19 15:57:56 -0400
commit0bc0364c5cd75695bc66181cc3bd52a4d26c4c87 (patch)
tree56bce6cf12e7cfdc7c591d3b36f56221c5b5a2d1
parent3b0e263a342cf8a369fcea6b1e41e0533ba2cc7f (diff)
Windows: Fix some filename encoding bugs.
-rw-r--r--Annex/CatFile.hs2
-rw-r--r--Command/Unused.hs3
-rw-r--r--Git/CatFile.hs2
-rw-r--r--Test.hs3
-rw-r--r--Utility/FileSystemEncoding.hs43
-rw-r--r--debian/changelog1
-rw-r--r--doc/bugs/Unicode_file_names_ignored_on_Windows.mdwn4
-rw-r--r--doc/todo/windows_support.mdwn36
8 files changed, 86 insertions, 8 deletions
diff --git a/Annex/CatFile.hs b/Annex/CatFile.hs
index fc722c8e7..6a778db03 100644
--- a/Annex/CatFile.hs
+++ b/Annex/CatFile.hs
@@ -80,7 +80,7 @@ catKey = catKey' True
catKey' :: Bool -> Ref -> FileMode -> Annex (Maybe Key)
catKey' modeguaranteed ref mode
| isSymLink mode = do
- l <- fromInternalGitPath . encodeW8 . L.unpack <$> get
+ l <- fromInternalGitPath . decodeBS <$> get
return $ if isLinkToAnnex l
then fileKey $ takeFileName l
else Nothing
diff --git a/Command/Unused.hs b/Command/Unused.hs
index c174cd256..3e844e5a8 100644
--- a/Command/Unused.hs
+++ b/Command/Unused.hs
@@ -10,7 +10,6 @@
module Command.Unused where
import qualified Data.Set as S
-import qualified Data.ByteString.Lazy as L
import Data.BloomFilter
import Data.BloomFilter.Easy
import Data.BloomFilter.Hash
@@ -296,7 +295,7 @@ withKeysReferencedInGitRef a ref = do
liftIO $ void clean
where
tKey True = fmap fst <$$> Backend.lookupFile . getTopFilePath . DiffTree.file
- tKey False = fileKey . takeFileName . encodeW8 . L.unpack <$$>
+ tKey False = fileKey . takeFileName . decodeBS <$$>
catFile ref . getTopFilePath . DiffTree.file
{- Looks in the specified directory for bad/tmp keys, and returns a list
diff --git a/Git/CatFile.hs b/Git/CatFile.hs
index c7c51b894..8e64fc558 100644
--- a/Git/CatFile.hs
+++ b/Git/CatFile.hs
@@ -108,6 +108,6 @@ catTree h treeref = go <$> catObjectDetails h treeref
dropsha = L.drop 21
parsemodefile b =
- let (modestr, file) = separate (== ' ') (encodeW8 $ L.unpack b)
+ let (modestr, file) = separate (== ' ') (decodeBS b)
in (file, readmode modestr)
readmode = fst . fromMaybe (0, undefined) . headMaybe . readOct
diff --git a/Test.hs b/Test.hs
index 1c448b357..209242fb3 100644
--- a/Test.hs
+++ b/Test.hs
@@ -22,7 +22,6 @@ import qualified Data.Map as M
import System.IO.HVFS (SystemFS(..))
import qualified Text.JSON
import System.Path
-import qualified Data.ByteString.Lazy as L
import Common
@@ -1272,7 +1271,7 @@ test_add_subdirs env = intmpclonerepo env $ do
{- Regression test for Windows bug where symlinks were not
- calculated correctly for files in subdirs. -}
git_annex env "sync" [] @? "sync failed"
- l <- annexeval $ encodeW8 . L.unpack <$> Annex.CatFile.catObject (Git.Types.Ref "HEAD:dir/foo")
+ l <- annexeval $ decodeBS <$> Annex.CatFile.catObject (Git.Types.Ref "HEAD:dir/foo")
"../.git/annex/" `isPrefixOf` l @? ("symlink from subdir to .git/annex is wrong: " ++ l)
createDirectory "dir2"
diff --git a/Utility/FileSystemEncoding.hs b/Utility/FileSystemEncoding.hs
index ac105e73d..690942cba 100644
--- a/Utility/FileSystemEncoding.hs
+++ b/Utility/FileSystemEncoding.hs
@@ -1,14 +1,17 @@
{- GHC File system encoding handling.
-
- - Copyright 2012-2013 Joey Hess <joey@kitenet.net>
+ - Copyright 2012-2014 Joey Hess <joey@kitenet.net>
-
- Licensed under the GNU GPL version 3 or higher.
-}
+{-# LANGUAGE CPP #-}
+
module Utility.FileSystemEncoding (
fileEncoding,
withFilePath,
md5FilePath,
+ decodeBS,
decodeW8,
encodeW8,
truncateFilePath,
@@ -22,13 +25,24 @@ import System.IO.Unsafe
import qualified Data.Hash.MD5 as MD5
import Data.Word
import Data.Bits.Utils
+import qualified Data.ByteString.Lazy as L
+#ifdef mingw32_HOST_OS
+import qualified Data.ByteString.Lazy.UTF8 as L8
+#endif
{- Sets a Handle to use the filesystem encoding. This causes data
- written or read from it to be encoded/decoded the same
- as ghc 7.4 does to filenames etc. This special encoding
- - allows "arbitrary undecodable bytes to be round-tripped through it". -}
+ - allows "arbitrary undecodable bytes to be round-tripped through it".
+ -}
fileEncoding :: Handle -> IO ()
+#ifndef mingw32_HOST_OS
fileEncoding h = hSetEncoding h =<< Encoding.getFileSystemEncoding
+#else
+{- The file system encoding does not work well on Windows,
+ - and Windows only has utf FilePaths anyway. -}
+fileEncoding h = hSetEncoding h Encoding.utf8
+#endif
{- Marshal a Haskell FilePath into a NUL terminated C string using temporary
- storage. The FilePath is encoded using the filesystem encoding,
@@ -60,6 +74,16 @@ _encodeFilePath fp = unsafePerformIO $ do
md5FilePath :: FilePath -> MD5.Str
md5FilePath = MD5.Str . _encodeFilePath
+{- Decodes a ByteString into a FilePath, applying the filesystem encoding. -}
+decodeBS :: L.ByteString -> FilePath
+#ifndef mingw32_HOST_OS
+decodeBS = encodeW8 . L.unpack
+#else
+{- On Windows, we assume that the ByteString is utf-8, since Windows
+ - only uses unicode for filenames. -}
+decodeBS = L8.toString
+#endif
+
{- Converts a [Word8] to a FilePath, encoding using the filesystem encoding.
-
- w82c produces a String, which may contain Chars that are invalid
@@ -84,6 +108,7 @@ decodeW8 = s2w8 . _encodeFilePath
- cost of efficiency when running on a large FilePath.
-}
truncateFilePath :: Int -> FilePath -> FilePath
+#ifndef mingw32_HOST_OS
truncateFilePath n = go . reverse
where
go f =
@@ -91,3 +116,17 @@ truncateFilePath n = go . reverse
in if length bytes <= n
then reverse f
else go (drop 1 f)
+#else
+{- On Windows, count the number of bytes used by each utf8 character. -}
+truncateFilePath n = reverse . go [] n . L8.fromString
+ where
+ go coll cnt bs
+ | cnt <= 0 = coll
+ | otherwise = case L8.decode bs of
+ Just (c, x) | c /= L8.replacement_char ->
+ let x' = fromIntegral x
+ in if cnt - x' < 0
+ then coll
+ else go (c:coll) (cnt - x') (L8.drop 1 bs)
+ _ -> coll
+#endif
diff --git a/debian/changelog b/debian/changelog
index 6ef4e5e84..f082cbbfb 100644
--- a/debian/changelog
+++ b/debian/changelog
@@ -30,6 +30,7 @@ git-annex (5.20140307) UNRELEASED; urgency=medium
* Each for each metadata field, there's now an automatically maintained
"$field-lastchanged" that gives the timestamp of the last change to that
field.
+ * Windows: Fix some filename encoding bugs.
-- Joey Hess <joeyh@debian.org> Thu, 06 Mar 2014 16:17:01 -0400
diff --git a/doc/bugs/Unicode_file_names_ignored_on_Windows.mdwn b/doc/bugs/Unicode_file_names_ignored_on_Windows.mdwn
index af3877dbe..b58cf2571 100644
--- a/doc/bugs/Unicode_file_names_ignored_on_Windows.mdwn
+++ b/doc/bugs/Unicode_file_names_ignored_on_Windows.mdwn
@@ -35,3 +35,7 @@ According to https://github.com/msysgit/msysgit/wiki/Git-for-Windows-Unicode-Sup
[2014-03-18 14:28:03 Central Europe Standard Time] read: git ["--git-dir=D:\\anntest\\.git","--work-tree=D:\\anntest","-c","core.bare=false","ls-files","--modified","-z","--","h\225\269ky.txt"]
I can provide additional information, just tell me what you need.
+
+> [[fixed|done]], although this is not the end of encoding issues
+> on Windows. Updating [[windows_support]] to discuss some other ones.
+> --[[Joey]]
diff --git a/doc/todo/windows_support.mdwn b/doc/todo/windows_support.mdwn
index 17accd62e..af78d517f 100644
--- a/doc/todo/windows_support.mdwn
+++ b/doc/todo/windows_support.mdwn
@@ -29,6 +29,42 @@ now! --[[Joey]]
* Deleting a git repository from inside the webapp fails "RemoveDirectory
permision denied ... file is being used by another process"
+## potential encoding problems
+
+[[bugs/Unicode_file_names_ignored_on_Windows]] is fixed, but some potential
+problems remain, since the FileSystemEncoding that git-annex relies on
+seems unreliable/broken on Windows.
+
+* When git-annex displays a filename that it's acting on, there
+ can be mojibake on Windows. For example, "háčky.txt" displays
+ the accented characters as instead the pairs of bytes making
+ up the utf-8. Tried doing various things to the stdout handle
+ to avoid this, but only ended up with encoding crashes, or worse
+ mojibake than this.
+
+* `md5FilePath` still uses the filesystem encoding, and so may produce the
+ wrong value on Windows. This would impact keys that contain problem characters
+ (probably coming from the filename extension), and might cause
+ interoperability problems when git-annex generates the hash directories of a
+ remote, for example a rsync remote.
+
+* `encodeW8` is used in Git.UnionMerge, and while I fixed the other calls to
+ encodeW8, which all involved ByteStrings reading from git and so can just
+ treat it as utf-8 on Windows (via `decodeBS`), in the union merge case,
+ the ByteString has no defined encoding. It may have been written on Unix
+ and contain keys with invalid unicode in them. On windows, the union
+ merge code should probably check if it's valid utf-8, and if not,
+ abort the merge.
+
+* If interoperating with a git-annex repository from a unix system, it's
+ possible for a key to contain some invalid utf-8, which means its filename
+ cannot even be represented on Windows, so who knows what will happen in that
+ case -- probably it will fail in some way when adding the object file
+ to the Windows repo.
+
+* If data from the git repo does not have a unicode encoding, it will be
+ mangled in various places on Windows, which can lead to undefined behavior.
+
## minor problems
* rsync special remotes with a rsyncurl of a local directory are known