summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorGravatar Joey Hess <joeyh@joeyh.name>2015-03-04 11:16:03 -0400
committerGravatar Joey Hess <joeyh@joeyh.name>2015-03-04 12:54:30 -0400
commit05697fe62116181511084a2eba28c5220e8a0363 (patch)
tree6965f56f5648d6dfa6c5e7d6e31e32eb3975b073
parent0c3570844cf60428808d01a73c808e4f7232f082 (diff)
metadata: Fix encoding problem that led to mojibake when storing metadata strings that contained both unicode characters and a space (or '!') character.
The fix is to stop using w82s, which does not properly reconstitute unicode strings. Instrad, use utf8 bytestring to get the [Word8] to base64. This passes unicode through perfectly, including any invalid filesystem encoded characters. Note that toB64 / fromB64 are also used for creds and cipher embedding. It would be unfortunate if this change broke those uses. For cipher embedding, note that ciphers can contain arbitrary bytes (should really be using ByteString.Char8 there). Testing indicated it's not safe to use the new fromB64 there; I think that characters were incorrectly combined. For credpair embedding, the username or password could contain unicode. Before, that unicode would fail to round-trip through the b64. So, I guess this is not going to break any embedded creds that worked before. This bug may have affected some creds before, and if so, this change will not fix old ones, but should fix new ones at least.
-rw-r--r--Remote/Helper/Encryptable.hs24
-rw-r--r--Test.hs2
-rw-r--r--Types/MetaData.hs1
-rw-r--r--Utility/Base64.hs18
-rw-r--r--Utility/FileSystemEncoding.hs17
-rw-r--r--debian/changelog5
-rw-r--r--doc/bugs/Unicode_characters_lost__47__converted_in_metadata.mdwn2
-rw-r--r--doc/bugs/Unicode_characters_lost__47__converted_in_metadata/comment_1_799fd2dc62dc51a6a8fa8a6d15d7a1f9._comment32
8 files changed, 88 insertions, 13 deletions
diff --git a/Remote/Helper/Encryptable.hs b/Remote/Helper/Encryptable.hs
index c1243a518..2c1935ba9 100644
--- a/Remote/Helper/Encryptable.hs
+++ b/Remote/Helper/Encryptable.hs
@@ -20,13 +20,14 @@ module Remote.Helper.Encryptable (
) where
import qualified Data.Map as M
+import qualified "dataenc" Codec.Binary.Base64 as B64
+import Data.Bits.Utils
import Common.Annex
import Types.Remote
import Crypto
import Types.Crypto
import qualified Annex
-import Utility.Base64
-- Used to ensure that encryption has been set up before trying to
-- eg, store creds in the remote config that would need to use the
@@ -137,9 +138,9 @@ cipherKey c = fmap make <$> remoteCipher c
{- Stores an StorableCipher in a remote's configuration. -}
storeCipher :: RemoteConfig -> StorableCipher -> RemoteConfig
-storeCipher c (SharedCipher t) = M.insert "cipher" (toB64 t) c
+storeCipher c (SharedCipher t) = M.insert "cipher" (toB64bs t) c
storeCipher c (EncryptedCipher t _ ks) =
- M.insert "cipher" (toB64 t) $ M.insert "cipherkeys" (showkeys ks) c
+ M.insert "cipher" (toB64bs t) $ M.insert "cipherkeys" (showkeys ks) c
where
showkeys (KeyIds l) = intercalate "," l
@@ -149,11 +150,11 @@ extractCipher c = case (M.lookup "cipher" c,
M.lookup "cipherkeys" c,
M.lookup "encryption" c) of
(Just t, Just ks, encryption) | maybe True (== "hybrid") encryption ->
- Just $ EncryptedCipher (fromB64 t) Hybrid (readkeys ks)
+ Just $ EncryptedCipher (fromB64bs t) Hybrid (readkeys ks)
(Just t, Just ks, Just "pubkey") ->
- Just $ EncryptedCipher (fromB64 t) PubKey (readkeys ks)
+ Just $ EncryptedCipher (fromB64bs t) PubKey (readkeys ks)
(Just t, Nothing, encryption) | maybe True (== "shared") encryption ->
- Just $ SharedCipher (fromB64 t)
+ Just $ SharedCipher (fromB64bs t)
_ -> Nothing
where
readkeys = KeyIds . split ","
@@ -169,3 +170,14 @@ describeEncryption c = case extractCipher c of
PubKey -> Nothing
Hybrid -> Just "(hybrid mode)"
]
+
+{- Not using Utility.Base64 because these "Strings" are really
+ - bags of bytes and that would convert to unicode and not roung-trip
+ - cleanly. -}
+toB64bs :: String -> String
+toB64bs = B64.encode . s2w8
+
+fromB64bs :: String -> String
+fromB64bs s = fromMaybe bad $ w82s <$> B64.decode s
+ where
+ bad = error "bad base64 encoded data"
diff --git a/Test.hs b/Test.hs
index 290ff0b69..5286bc6f6 100644
--- a/Test.hs
+++ b/Test.hs
@@ -71,6 +71,7 @@ import qualified Utility.Hash
import qualified Utility.Scheduled
import qualified Utility.HumanTime
import qualified Utility.ThreadScheduler
+import qualified Utility.Base64
import qualified Command.Uninit
import qualified CmdLine.GitAnnex as GitAnnex
#ifndef mingw32_HOST_OS
@@ -163,6 +164,7 @@ properties = localOption (QuickCheckTests 1000) $ testGroup "QuickCheck"
, testProperty "prop_branchView_legal" Logs.View.prop_branchView_legal
, testProperty "prop_view_roundtrips" Annex.View.prop_view_roundtrips
, testProperty "prop_viewedFile_rountrips" Annex.View.ViewedFile.prop_viewedFile_roundtrips
+ , testProperty "prop_b64_roundtrips" Utility.Base64.prop_b64_roundtrips
]
{- These tests set up the test environment, but also test some basic parts
diff --git a/Types/MetaData.hs b/Types/MetaData.hs
index 2a6b3b864..1e8fb4aa2 100644
--- a/Types/MetaData.hs
+++ b/Types/MetaData.hs
@@ -224,6 +224,7 @@ data ModMeta
| DelMeta MetaField MetaValue
| SetMeta MetaField MetaValue -- removes any existing values
| MaybeSetMeta MetaField MetaValue -- when field has no existing value
+ deriving (Show)
{- Applies a ModMeta, generating the new MetaData.
- Note that the new MetaData does not include all the
diff --git a/Utility/Base64.hs b/Utility/Base64.hs
index 56637a117..80cc122a1 100644
--- a/Utility/Base64.hs
+++ b/Utility/Base64.hs
@@ -1,24 +1,28 @@
-{- Simple Base64 access
+{- Simple Base64 encoding of Strings
-
- Copyright 2011 Joey Hess <id@joeyh.name>
-
- License: BSD-2-clause
-}
-module Utility.Base64 (toB64, fromB64Maybe, fromB64) where
+module Utility.Base64 (toB64, fromB64Maybe, fromB64, prop_b64_roundtrips) where
-import "dataenc" Codec.Binary.Base64
-import Data.Bits.Utils
+import qualified "dataenc" Codec.Binary.Base64 as B64
import Control.Applicative
import Data.Maybe
+import qualified Data.ByteString.Lazy as L
+import Data.ByteString.Lazy.UTF8 (fromString, toString)
-toB64 :: String -> String
-toB64 = encode . s2w8
+toB64 :: String -> String
+toB64 = B64.encode . L.unpack . fromString
fromB64Maybe :: String -> Maybe String
-fromB64Maybe s = w82s <$> decode s
+fromB64Maybe s = toString . L.pack <$> B64.decode s
fromB64 :: String -> String
fromB64 = fromMaybe bad . fromB64Maybe
where
bad = error "bad base64 encoded data"
+
+prop_b64_roundtrips :: String -> Bool
+prop_b64_roundtrips s = s == fromB64 (toB64 s)
diff --git a/Utility/FileSystemEncoding.hs b/Utility/FileSystemEncoding.hs
index 844e81e59..139b74fe4 100644
--- a/Utility/FileSystemEncoding.hs
+++ b/Utility/FileSystemEncoding.hs
@@ -14,6 +14,8 @@ module Utility.FileSystemEncoding (
decodeBS,
decodeW8,
encodeW8,
+ encodeW8NUL,
+ decodeW8NUL,
truncateFilePath,
) where
@@ -25,6 +27,7 @@ import System.IO.Unsafe
import qualified Data.Hash.MD5 as MD5
import Data.Word
import Data.Bits.Utils
+import Data.List.Utils
import qualified Data.ByteString.Lazy as L
#ifdef mingw32_HOST_OS
import qualified Data.ByteString.Lazy.UTF8 as L8
@@ -89,6 +92,9 @@ decodeBS = L8.toString
- w82c produces a String, which may contain Chars that are invalid
- unicode. From there, this is really a simple matter of applying the
- file system encoding, only complicated by GHC's interface to doing so.
+ -
+ - Note that the encoding stops at any NUL in the input. FilePaths
+ - do not normally contain embedded NUL, but Haskell Strings may.
-}
{-# NOINLINE encodeW8 #-}
encodeW8 :: [Word8] -> FilePath
@@ -101,6 +107,17 @@ encodeW8 w8 = unsafePerformIO $ do
decodeW8 :: FilePath -> [Word8]
decodeW8 = s2w8 . _encodeFilePath
+{- Like encodeW8 and decodeW8, but NULs are passed through unchanged. -}
+encodeW8NUL :: [Word8] -> FilePath
+encodeW8NUL = join nul . map encodeW8 . split (s2w8 nul)
+ where
+ nul = ['\NUL']
+
+decodeW8NUL :: FilePath -> [Word8]
+decodeW8NUL = join (s2w8 nul) . map decodeW8 . split nul
+ where
+ nul = ['\NUL']
+
{- Truncates a FilePath to the given number of bytes (or less),
- as represented on disk.
-
diff --git a/debian/changelog b/debian/changelog
index 9fd6440f6..921c0ff4a 100644
--- a/debian/changelog
+++ b/debian/changelog
@@ -22,6 +22,11 @@ git-annex (5.2015022) UNRELEASED; urgency=medium
* When re-execing git-annex, use current program location, rather than
~/.config/git-annex/program, when possible.
* Submodules are now supported by git-annex!
+ * metadata: Fix encoding problem that led to mojibake when storing
+ metadata strings that contained both unicode characters and a space
+ (or '!') character.
+ * Also potentially fixes encoding problem when embedding credentials
+ that contain unicode characters.
-- Joey Hess <id@joeyh.name> Thu, 19 Feb 2015 14:16:03 -0400
diff --git a/doc/bugs/Unicode_characters_lost__47__converted_in_metadata.mdwn b/doc/bugs/Unicode_characters_lost__47__converted_in_metadata.mdwn
index 8d7475163..ebfef3b77 100644
--- a/doc/bugs/Unicode_characters_lost__47__converted_in_metadata.mdwn
+++ b/doc/bugs/Unicode_characters_lost__47__converted_in_metadata.mdwn
@@ -13,3 +13,5 @@ Unicode characters in metadata are pruned/converted/lost:
### What version of git-annex are you using? On what operating system?
5.20141125 Debian
+
+> [[fixed|done]]; test pass. --[[Joey]]
diff --git a/doc/bugs/Unicode_characters_lost__47__converted_in_metadata/comment_1_799fd2dc62dc51a6a8fa8a6d15d7a1f9._comment b/doc/bugs/Unicode_characters_lost__47__converted_in_metadata/comment_1_799fd2dc62dc51a6a8fa8a6d15d7a1f9._comment
new file mode 100644
index 000000000..1ae278289
--- /dev/null
+++ b/doc/bugs/Unicode_characters_lost__47__converted_in_metadata/comment_1_799fd2dc62dc51a6a8fa8a6d15d7a1f9._comment
@@ -0,0 +1,32 @@
+[[!comment format=mdwn
+ username="joey"
+ subject="""comment 1"""
+ date="2015-03-04T14:31:21Z"
+ content="""
+What I'm seeing is the unicode arrow is replaced with 0092 and the elipsis
+with &. It's losing the other byte.
+
+The problem seems to be in the base64 encoding that's done, when the metadata
+value contains spaces or a few other problem characters. These same
+unicode characters roundtrip through without a problem when not embedded
+in a string with spaces.
+
+<pre>
+*Utility.Base64> let s = "…"
+*Utility.Base64> (s, fromB64 $ toB64 s)
+("\8230","&")
+</pre>
+
+git-annex also uses base64 for encoding some creds
+(and for tagged pushes over XMPP, but only the JID is encoded).
+
+The real culprit is the use of `w82s`, which doesn't handle multi-byte
+characters. I can easily fix this by using `encodeW8` instead.
+Audited git-annex for other problem w82s uses and don't see any, so will
+only need to fix this once.
+
+Added a quickcheck test for fromB64 . toB64 roundtripping.
+
+Unfortunately, the entered unicode characters didn't get saved right,
+so git-annex can do nothing to fix data that was already entered.
+"""]]