summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--Remote/Helper/Encryptable.hs24
-rw-r--r--Test.hs2
-rw-r--r--Types/MetaData.hs1
-rw-r--r--Utility/Base64.hs18
-rw-r--r--Utility/FileSystemEncoding.hs17
-rw-r--r--debian/changelog5
-rw-r--r--doc/bugs/Unicode_characters_lost__47__converted_in_metadata.mdwn2
-rw-r--r--doc/bugs/Unicode_characters_lost__47__converted_in_metadata/comment_1_799fd2dc62dc51a6a8fa8a6d15d7a1f9._comment32
8 files changed, 88 insertions, 13 deletions
diff --git a/Remote/Helper/Encryptable.hs b/Remote/Helper/Encryptable.hs
index c1243a518..2c1935ba9 100644
--- a/Remote/Helper/Encryptable.hs
+++ b/Remote/Helper/Encryptable.hs
@@ -20,13 +20,14 @@ module Remote.Helper.Encryptable (
) where
import qualified Data.Map as M
+import qualified "dataenc" Codec.Binary.Base64 as B64
+import Data.Bits.Utils
import Common.Annex
import Types.Remote
import Crypto
import Types.Crypto
import qualified Annex
-import Utility.Base64
-- Used to ensure that encryption has been set up before trying to
-- eg, store creds in the remote config that would need to use the
@@ -137,9 +138,9 @@ cipherKey c = fmap make <$> remoteCipher c
{- Stores an StorableCipher in a remote's configuration. -}
storeCipher :: RemoteConfig -> StorableCipher -> RemoteConfig
-storeCipher c (SharedCipher t) = M.insert "cipher" (toB64 t) c
+storeCipher c (SharedCipher t) = M.insert "cipher" (toB64bs t) c
storeCipher c (EncryptedCipher t _ ks) =
- M.insert "cipher" (toB64 t) $ M.insert "cipherkeys" (showkeys ks) c
+ M.insert "cipher" (toB64bs t) $ M.insert "cipherkeys" (showkeys ks) c
where
showkeys (KeyIds l) = intercalate "," l
@@ -149,11 +150,11 @@ extractCipher c = case (M.lookup "cipher" c,
M.lookup "cipherkeys" c,
M.lookup "encryption" c) of
(Just t, Just ks, encryption) | maybe True (== "hybrid") encryption ->
- Just $ EncryptedCipher (fromB64 t) Hybrid (readkeys ks)
+ Just $ EncryptedCipher (fromB64bs t) Hybrid (readkeys ks)
(Just t, Just ks, Just "pubkey") ->
- Just $ EncryptedCipher (fromB64 t) PubKey (readkeys ks)
+ Just $ EncryptedCipher (fromB64bs t) PubKey (readkeys ks)
(Just t, Nothing, encryption) | maybe True (== "shared") encryption ->
- Just $ SharedCipher (fromB64 t)
+ Just $ SharedCipher (fromB64bs t)
_ -> Nothing
where
readkeys = KeyIds . split ","
@@ -169,3 +170,14 @@ describeEncryption c = case extractCipher c of
PubKey -> Nothing
Hybrid -> Just "(hybrid mode)"
]
+
+{- Not using Utility.Base64 because these "Strings" are really
+ - bags of bytes and that would convert to unicode and not roung-trip
+ - cleanly. -}
+toB64bs :: String -> String
+toB64bs = B64.encode . s2w8
+
+fromB64bs :: String -> String
+fromB64bs s = fromMaybe bad $ w82s <$> B64.decode s
+ where
+ bad = error "bad base64 encoded data"
diff --git a/Test.hs b/Test.hs
index 290ff0b69..5286bc6f6 100644
--- a/Test.hs
+++ b/Test.hs
@@ -71,6 +71,7 @@ import qualified Utility.Hash
import qualified Utility.Scheduled
import qualified Utility.HumanTime
import qualified Utility.ThreadScheduler
+import qualified Utility.Base64
import qualified Command.Uninit
import qualified CmdLine.GitAnnex as GitAnnex
#ifndef mingw32_HOST_OS
@@ -163,6 +164,7 @@ properties = localOption (QuickCheckTests 1000) $ testGroup "QuickCheck"
, testProperty "prop_branchView_legal" Logs.View.prop_branchView_legal
, testProperty "prop_view_roundtrips" Annex.View.prop_view_roundtrips
, testProperty "prop_viewedFile_rountrips" Annex.View.ViewedFile.prop_viewedFile_roundtrips
+ , testProperty "prop_b64_roundtrips" Utility.Base64.prop_b64_roundtrips
]
{- These tests set up the test environment, but also test some basic parts
diff --git a/Types/MetaData.hs b/Types/MetaData.hs
index 2a6b3b864..1e8fb4aa2 100644
--- a/Types/MetaData.hs
+++ b/Types/MetaData.hs
@@ -224,6 +224,7 @@ data ModMeta
| DelMeta MetaField MetaValue
| SetMeta MetaField MetaValue -- removes any existing values
| MaybeSetMeta MetaField MetaValue -- when field has no existing value
+ deriving (Show)
{- Applies a ModMeta, generating the new MetaData.
- Note that the new MetaData does not include all the
diff --git a/Utility/Base64.hs b/Utility/Base64.hs
index 56637a117..80cc122a1 100644
--- a/Utility/Base64.hs
+++ b/Utility/Base64.hs
@@ -1,24 +1,28 @@
-{- Simple Base64 access
+{- Simple Base64 encoding of Strings
-
- Copyright 2011 Joey Hess <id@joeyh.name>
-
- License: BSD-2-clause
-}
-module Utility.Base64 (toB64, fromB64Maybe, fromB64) where
+module Utility.Base64 (toB64, fromB64Maybe, fromB64, prop_b64_roundtrips) where
-import "dataenc" Codec.Binary.Base64
-import Data.Bits.Utils
+import qualified "dataenc" Codec.Binary.Base64 as B64
import Control.Applicative
import Data.Maybe
+import qualified Data.ByteString.Lazy as L
+import Data.ByteString.Lazy.UTF8 (fromString, toString)
-toB64 :: String -> String
-toB64 = encode . s2w8
+toB64 :: String -> String
+toB64 = B64.encode . L.unpack . fromString
fromB64Maybe :: String -> Maybe String
-fromB64Maybe s = w82s <$> decode s
+fromB64Maybe s = toString . L.pack <$> B64.decode s
fromB64 :: String -> String
fromB64 = fromMaybe bad . fromB64Maybe
where
bad = error "bad base64 encoded data"
+
+prop_b64_roundtrips :: String -> Bool
+prop_b64_roundtrips s = s == fromB64 (toB64 s)
diff --git a/Utility/FileSystemEncoding.hs b/Utility/FileSystemEncoding.hs
index 844e81e59..139b74fe4 100644
--- a/Utility/FileSystemEncoding.hs
+++ b/Utility/FileSystemEncoding.hs
@@ -14,6 +14,8 @@ module Utility.FileSystemEncoding (
decodeBS,
decodeW8,
encodeW8,
+ encodeW8NUL,
+ decodeW8NUL,
truncateFilePath,
) where
@@ -25,6 +27,7 @@ import System.IO.Unsafe
import qualified Data.Hash.MD5 as MD5
import Data.Word
import Data.Bits.Utils
+import Data.List.Utils
import qualified Data.ByteString.Lazy as L
#ifdef mingw32_HOST_OS
import qualified Data.ByteString.Lazy.UTF8 as L8
@@ -89,6 +92,9 @@ decodeBS = L8.toString
- w82c produces a String, which may contain Chars that are invalid
- unicode. From there, this is really a simple matter of applying the
- file system encoding, only complicated by GHC's interface to doing so.
+ -
+ - Note that the encoding stops at any NUL in the input. FilePaths
+ - do not normally contain embedded NUL, but Haskell Strings may.
-}
{-# NOINLINE encodeW8 #-}
encodeW8 :: [Word8] -> FilePath
@@ -101,6 +107,17 @@ encodeW8 w8 = unsafePerformIO $ do
decodeW8 :: FilePath -> [Word8]
decodeW8 = s2w8 . _encodeFilePath
+{- Like encodeW8 and decodeW8, but NULs are passed through unchanged. -}
+encodeW8NUL :: [Word8] -> FilePath
+encodeW8NUL = join nul . map encodeW8 . split (s2w8 nul)
+ where
+ nul = ['\NUL']
+
+decodeW8NUL :: FilePath -> [Word8]
+decodeW8NUL = join (s2w8 nul) . map decodeW8 . split nul
+ where
+ nul = ['\NUL']
+
{- Truncates a FilePath to the given number of bytes (or less),
- as represented on disk.
-
diff --git a/debian/changelog b/debian/changelog
index 9fd6440f6..921c0ff4a 100644
--- a/debian/changelog
+++ b/debian/changelog
@@ -22,6 +22,11 @@ git-annex (5.2015022) UNRELEASED; urgency=medium
* When re-execing git-annex, use current program location, rather than
~/.config/git-annex/program, when possible.
* Submodules are now supported by git-annex!
+ * metadata: Fix encoding problem that led to mojibake when storing
+ metadata strings that contained both unicode characters and a space
+ (or '!') character.
+ * Also potentially fixes encoding problem when embedding credentials
+ that contain unicode characters.
-- Joey Hess <id@joeyh.name> Thu, 19 Feb 2015 14:16:03 -0400
diff --git a/doc/bugs/Unicode_characters_lost__47__converted_in_metadata.mdwn b/doc/bugs/Unicode_characters_lost__47__converted_in_metadata.mdwn
index 8d7475163..ebfef3b77 100644
--- a/doc/bugs/Unicode_characters_lost__47__converted_in_metadata.mdwn
+++ b/doc/bugs/Unicode_characters_lost__47__converted_in_metadata.mdwn
@@ -13,3 +13,5 @@ Unicode characters in metadata are pruned/converted/lost:
### What version of git-annex are you using? On what operating system?
5.20141125 Debian
+
+> [[fixed|done]]; test pass. --[[Joey]]
diff --git a/doc/bugs/Unicode_characters_lost__47__converted_in_metadata/comment_1_799fd2dc62dc51a6a8fa8a6d15d7a1f9._comment b/doc/bugs/Unicode_characters_lost__47__converted_in_metadata/comment_1_799fd2dc62dc51a6a8fa8a6d15d7a1f9._comment
new file mode 100644
index 000000000..1ae278289
--- /dev/null
+++ b/doc/bugs/Unicode_characters_lost__47__converted_in_metadata/comment_1_799fd2dc62dc51a6a8fa8a6d15d7a1f9._comment
@@ -0,0 +1,32 @@
+[[!comment format=mdwn
+ username="joey"
+ subject="""comment 1"""
+ date="2015-03-04T14:31:21Z"
+ content="""
+What I'm seeing is the unicode arrow is replaced with 0092 and the elipsis
+with &. It's losing the other byte.
+
+The problem seems to be in the base64 encoding that's done, when the metadata
+value contains spaces or a few other problem characters. These same
+unicode characters roundtrip through without a problem when not embedded
+in a string with spaces.
+
+<pre>
+*Utility.Base64> let s = "…"
+*Utility.Base64> (s, fromB64 $ toB64 s)
+("\8230","&")
+</pre>
+
+git-annex also uses base64 for encoding some creds
+(and for tagged pushes over XMPP, but only the JID is encoded).
+
+The real culprit is the use of `w82s`, which doesn't handle multi-byte
+characters. I can easily fix this by using `encodeW8` instead.
+Audited git-annex for other problem w82s uses and don't see any, so will
+only need to fix this once.
+
+Added a quickcheck test for fromB64 . toB64 roundtripping.
+
+Unfortunately, the entered unicode characters didn't get saved right,
+so git-annex can do nothing to fix data that was already entered.
+"""]]