aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorGravatar Joey Hess <joey@kitenet.net>2011-02-10 14:21:44 -0400
committerGravatar Joey Hess <joey@kitenet.net>2011-02-10 14:21:44 -0400
commitfe55b4644e67bba60b35e07abcdd312b65c9d6f3 (patch)
tree4631f428f86f72d614f9b5388772b6ec58a3fb8d
parente7a3475704f5366e89aebe78cefbeb58ff5ab181 (diff)
Fix display of unicode filenames.
Internally, the filenames are stored as un-decoded unicode. I tried decoding them, but then haskell tries to access the wrong files. Hmm. So, I've unhappily chosen option "B", which is to decode filenames before they are displayed.
-rw-r--r--Backend/File.hs6
-rw-r--r--Backend/SHA1.hs2
-rw-r--r--Backend/WORM.hs2
-rw-r--r--Command/Find.hs3
-rw-r--r--Command/PreCommit.hs2
-rw-r--r--Command/Unused.hs2
-rw-r--r--Content.hs2
-rw-r--r--Messages.hs9
-rw-r--r--debian/changelog1
-rw-r--r--doc/bugs/problems_with_utf8_names.mdwn22
-rw-r--r--doc/bugs/unhappy_without_UTF8_locale.mdwn33
11 files changed, 63 insertions, 21 deletions
diff --git a/Backend/File.hs b/Backend/File.hs
index d76cd2939..fca385a1e 100644
--- a/Backend/File.hs
+++ b/Backend/File.hs
@@ -193,14 +193,14 @@ checkKeyNumCopies key file numcopies = do
missingNote :: String -> Int -> Int -> String -> String
missingNote file 0 _ [] =
- "** No known copies of " ++ file ++ " exist!"
+ "** No known copies of " ++ showFile file ++ " exist!"
missingNote file 0 _ untrusted =
- "Only these untrusted locations may have copies of " ++ file ++
+ "Only these untrusted locations may have copies of " ++ showFile file ++
"\n" ++ untrusted ++
"Back it up to trusted locations with git-annex copy."
missingNote file present needed [] =
"Only " ++ show present ++ " of " ++ show needed ++
- " trustworthy copies of " ++ file ++ " exist." ++
+ " trustworthy copies of " ++ showFile file ++ " exist." ++
"\nBack it up with git-annex copy."
missingNote file present needed untrusted =
missingNote file present needed [] ++
diff --git a/Backend/SHA1.hs b/Backend/SHA1.hs
index 3d868dbd1..f1092492e 100644
--- a/Backend/SHA1.hs
+++ b/Backend/SHA1.hs
@@ -58,5 +58,5 @@ checkKeySHA1 key = do
then return True
else do
dest <- moveBad key
- warning $ "Bad file content; moved to "++dest
+ warning $ "Bad file content; moved to " ++ showFile dest
return False
diff --git a/Backend/WORM.hs b/Backend/WORM.hs
index 20a81d841..7f40a2acb 100644
--- a/Backend/WORM.hs
+++ b/Backend/WORM.hs
@@ -67,5 +67,5 @@ checkKeySize key = do
then return True
else do
dest <- moveBad key
- warning $ "Bad file size; moved to "++dest
+ warning $ "Bad file size; moved to " ++ showFile dest
return False
diff --git a/Command/Find.hs b/Command/Find.hs
index 3ed15c153..45156af05 100644
--- a/Command/Find.hs
+++ b/Command/Find.hs
@@ -12,6 +12,7 @@ import Control.Monad.State (liftIO)
import Command
import Content
+import Messages
command :: [Command]
command = [Command "find" (paramOptional $ paramRepeating paramPath) seek
@@ -24,5 +25,5 @@ seek = [withFilesInGit start]
start :: CommandStartString
start file = isAnnexed file $ \(key, _) -> do
exists <- inAnnex key
- when exists $ liftIO $ putStrLn file
+ when exists $ liftIO $ putStrLn $ showFile file
return Nothing
diff --git a/Command/PreCommit.hs b/Command/PreCommit.hs
index 12e5ed806..f22300a03 100644
--- a/Command/PreCommit.hs
+++ b/Command/PreCommit.hs
@@ -32,7 +32,7 @@ perform pair@(file, _) = do
ok <- doCommand $ Command.Add.start pair
if ok
then return $ Just $ cleanup file
- else error $ "failed to add " ++ file ++ "; canceling commit"
+ else error $ "failed to add " ++ showFile file ++ "; canceling commit"
cleanup :: FilePath -> CommandCleanup
cleanup file = do
diff --git a/Command/Unused.hs b/Command/Unused.hs
index d9f4e3978..2b390b956 100644
--- a/Command/Unused.hs
+++ b/Command/Unused.hs
@@ -68,7 +68,7 @@ checkUnused = do
dropmsg = ["(To remove unwanted data: git-annex dropunused NUMBER)"]
table l = [" NUMBER KEY"] ++ map cols l
- cols (n,k) = " " ++ pad 6 (show n) ++ " " ++ show k
+ cols (n,k) = " " ++ pad 6 (show n) ++ " " ++ (showFile . show) k
pad n s = s ++ replicate (n - length s) ' '
number :: Int -> [a] -> [(Int, a)]
diff --git a/Content.hs b/Content.hs
index e16ad883c..fae980bae 100644
--- a/Content.hs
+++ b/Content.hs
@@ -49,7 +49,7 @@ calcGitLink file key = do
cwd <- liftIO $ getCurrentDirectory
let absfile = case absNormPath cwd file of
Just f -> f
- Nothing -> error $ "unable to normalize " ++ file
+ Nothing -> error $ "unable to normalize " ++ showFile file
return $ relPathDirToDir (parentDir absfile) (Git.workTree g) ++
annexLocation key
diff --git a/Messages.hs b/Messages.hs
index 6f4ec1e62..12a836d3c 100644
--- a/Messages.hs
+++ b/Messages.hs
@@ -11,6 +11,7 @@ import Control.Monad.State (liftIO)
import System.IO
import Control.Monad (unless)
import Data.String.Utils
+import Codec.Binary.UTF8.String as UTF8
import Types
import qualified Annex
@@ -25,7 +26,7 @@ showSideAction s = verbose $ liftIO $ putStrLn $ "(" ++ s ++ ")"
showStart :: String -> String -> Annex ()
showStart command file = verbose $ do
- liftIO $ putStr $ command ++ " " ++ file ++ " "
+ liftIO $ putStr $ command ++ " " ++ showFile file ++ " "
liftIO $ hFlush stdout
showNote :: String -> Annex ()
@@ -45,7 +46,6 @@ showEndOk = verbose $ liftIO $ putStrLn "ok"
showEndFail :: Annex ()
showEndFail = verbose $ liftIO $ putStrLn "\nfailed"
-{- Exception pretty-printing. -}
showErr :: (Show a) => a -> Annex ()
showErr e = warning $ "git-annex: " ++ show e
@@ -57,3 +57,8 @@ warning w = do
indent :: String -> String
indent s = join "\n" $ map (\l -> " " ++ l) $ lines s
+
+{- Prepares a filename for display. This is needed because strings are
+ - internally represented in git-annex in non-decoded form. -}
+showFile :: String -> String
+showFile = decodeString
diff --git a/debian/changelog b/debian/changelog
index 2e5b97c0f..277869b02 100644
--- a/debian/changelog
+++ b/debian/changelog
@@ -4,6 +4,7 @@ git-annex (0.21) UNRELEASED; urgency=low
* unannex: Fix recently introduced bug when attempting to unannex more
than one file at a time.
* test: Set git user name and email in case git can't guess values.
+ * Fix display of unicode filenames.
-- Joey Hess <joeyh@debian.org> Wed, 09 Feb 2011 00:12:11 -0400
diff --git a/doc/bugs/problems_with_utf8_names.mdwn b/doc/bugs/problems_with_utf8_names.mdwn
index 30f3495f4..257f8dff2 100644
--- a/doc/bugs/problems_with_utf8_names.mdwn
+++ b/doc/bugs/problems_with_utf8_names.mdwn
@@ -37,10 +37,22 @@ It looks like the common latin1-to-UTF8 encoding. Functionality other than otupu
> encoded in utf-8 (an archive could have historical filenames using
> varying encodings), and you don't want which files are accessed to
> depend on locale settings.
+> > I tried to do this by making parts of GitRepo call
+> > Codec.Binary.UTF8.String.decodeString when reading filenames from
+> > git. This seemed to break attempts to operate on the files;
+> > weirdly encoded strings were seen in syscalls in strace.
> 1. Keep input and internal data un-decoded, but decode it when
> outputting a filename (assuming the filename is encoded using the
> user's configured encoding), and allow haskell's output encoding to then
> encode it according to the user's locale configuration.
+> > This is now [[implemented|done]]. I'm not very happy that I have to watch
+> > out for any place that a filename is output and call `showFile`
+> > on it, but there are really not too many such places in git-annex.
+> >
+> > Note that this apparently only affects filenames.
+> > (Names of files in the annex, and also some places where names
+> > of keys are displayed.) Utf-8 in the uuid.map file etc seems
+> > to be handled cleanly.
> 1. Avoid encodings entirely. Mostly what I'm doing now; probably
> could find a way to disable encoding of console output. Then the raw
> filename would be displayed, which should work ok. git-annex does
@@ -50,13 +62,3 @@ It looks like the common latin1-to-UTF8 encoding. Functionality other than otupu
> One other possible
> issue would be that this could cause problems if git-annex were
> translated.
->
-> BTW, for more fun, try unsetting LANG, and then you can see
-> stuff like this:
-
- joey@gnu:~/tmp/aa>git annex add ./Üa
- add add add add git-annex: <stdout>: commitAndReleaseBuffer: invalid
- argument (Invalid or incomplete multibyte or wide character)
-
-> (Add -q to work around this; once it doesn't need to print the filename,
-> it can act on it ok!)
diff --git a/doc/bugs/unhappy_without_UTF8_locale.mdwn b/doc/bugs/unhappy_without_UTF8_locale.mdwn
new file mode 100644
index 000000000..6f1df4fab
--- /dev/null
+++ b/doc/bugs/unhappy_without_UTF8_locale.mdwn
@@ -0,0 +1,33 @@
+Try unsetting LANG and passing git-annex unicode filenames.
+
+ joey@gnu:~/tmp/aa>git annex add ./Üa
+ add add add add git-annex: <stdout>: commitAndReleaseBuffer: invalid
+ argument (Invalid or incomplete multibyte or wide character)
+
+The same problem can be seen with a simple haskell program:
+
+ import System.Environment
+ import Codec.Binary.UTF8.String
+ main = do
+ args <- getArgs
+ putStrLn $ decodeString $ args !! 0
+
+ joey@gnu:~/src/git-annex>LANG= runghc ~/foo.hs Ü
+ foo.hs: <stdout>: hPutChar: invalid argument (Invalid or incomplete multibyte or wide character)
+
+(The call to `decodeString` is necessary to make the input
+unicode string be displayed properly in a utf8 locale, but
+does not contribute to this problem.)
+
+I guess that haskell is setting the IO encoding to latin1, which
+is [documented](http://haskell.org/ghc/docs/latest/html/libraries/base/System-IO.html#v:latin1)
+to error out on characters > 255.
+
+So this program (which also needs the two imports from the example above)
+doesn't have the problem -- but may output garbage on non-utf-8 capable terminals:
+
+ import System.IO
+ main = do
+ hSetEncoding stdout utf8
+ args <- getArgs
+ putStrLn $ decodeString $ args !! 0