From 85feb21fb08874b00e96fa172d6e6c7f4cb02660 Mon Sep 17 00:00:00 2001 From: Joey Hess Date: Tue, 27 May 2014 14:16:33 -0400 Subject: Fix encoding of data written to git-annex branch. Avoid truncating unicode characters to 8 bits. Allow any encoding to be used, as with filenames (but utf8 is the sane choice). Affects metadata and repository descriptions, and preferred content expressions. The question of what's the right encoding for the git-annex branch is a vexing one. utf-8 would be a nice choice, but this leaves the possibility of bad data getting into a git-annex branch somehow, and this resulting in git-annex crashing with encoding errors, which is a failure mode I want to avoid. (Also, preferred content expressions can refer to filenames, and filenames can have any encoding, so limiting to utf-8 would not be ideal.) The union merge code already took care to not assume any encoding for a file. Except it assumes that any \n is a literal newline, and not part of some encoding of a character that happens to contain a newline. (At least utf-8 avoids using newline for anything except literal newlines.) Adapted the git-annex branch code to use this same approach. Note that there is a potential interop problem with Windows, since FileSystemEncoding doesn't work there, and instead things are always decoded as utf-8. If someone uses non-utf8 encoding for data on the git-annex branch, this can lead to an encoding error on windows. However, this commit doesn't actually make that any worse, because the union merge code would similarly fail with an encoding error on windows in that situation. This commit was sponsored by Kyle Meyer. 
--- Annex/Branch.hs | 10 ++++++---- Annex/Journal.hs | 6 ++---- debian/changelog | 4 ++++ doc/bugs/forget_corrupts_non-ascii_chars.mdwn | 4 ++++ doc/bugs/unicode_tags.mdwn | 3 +++ 5 files changed, 19 insertions(+), 8 deletions(-) diff --git a/Annex/Branch.hs b/Annex/Branch.hs index 94c4c029c..7a75d8acf 100644 --- a/Annex/Branch.hs +++ b/Annex/Branch.hs @@ -25,9 +25,10 @@ module Annex.Branch ( performTransitions, ) where -import qualified Data.ByteString.Lazy.Char8 as L +import qualified Data.ByteString.Lazy as L import qualified Data.Set as S import qualified Data.Map as M +import Data.Bits.Utils import Common.Annex import Annex.BranchState @@ -199,7 +200,7 @@ getHistorical :: RefDate -> FilePath -> Annex String getHistorical date = getRef (Git.Ref.dateRef fullname date) getRef :: Ref -> FilePath -> Annex String -getRef ref file = withIndex $ L.unpack <$> catFile ref file +getRef ref file = withIndex $ decodeBS <$> catFile ref file {- Applies a function to modifiy the content of a file. - @@ -259,7 +260,8 @@ commitIndex' jl branchref message parents = do where -- look for "parent ref" lines and return the refs commitparents = map (Git.Ref . snd) . filter isparent . - map (toassoc . L.unpack) . L.lines + map (toassoc . decodeBS) . L.split newline + newline = c2w8 '\n' toassoc = separate (== ' ') isparent (k,_) = k == "parent" @@ -432,7 +434,7 @@ handleTransitions jl localts refs = do return True where getreftransition ref = do - ts <- parseTransitionsStrictly "remote" . L.unpack + ts <- parseTransitionsStrictly "remote" . 
decodeBS <$> catFile ref transitionsLog return (ref, ts) diff --git a/Annex/Journal.hs b/Annex/Journal.hs index 395e81d29..dcd3779de 100644 --- a/Annex/Journal.hs +++ b/Annex/Journal.hs @@ -13,8 +13,6 @@ module Annex.Journal where -import System.IO.Binary - import Common.Annex import Annex.Exception import qualified Git @@ -42,7 +40,7 @@ setJournalFile _jl file content = do jfile <- fromRepo $ journalFile file let tmpfile = tmp takeFileName jfile liftIO $ do - writeBinaryFile tmpfile content + writeFileAnyEncoding tmpfile content moveFile tmpfile jfile {- Gets any journalled content for a file in the branch. -} @@ -54,7 +52,7 @@ getJournalFile _jl = getJournalFileStale - changes. -} getJournalFileStale :: FilePath -> Annex (Maybe String) getJournalFileStale file = inRepo $ \g -> catchMaybeIO $ - readFileStrict $ journalFile file g + readFileStrictAnyEncoding $ journalFile file g {- List of files that have updated content in the journal. -} getJournalledFiles :: JournalLocked -> Annex [FilePath] diff --git a/debian/changelog b/debian/changelog index 8603adf17..68678ed21 100644 --- a/debian/changelog +++ b/debian/changelog @@ -13,6 +13,10 @@ git-annex (5.20140518) UNRELEASED; urgency=medium so that it can be easily enabled elsewhere. * android: Run busybox install with -s, since some versions of Android prohibit making hard links. + * Fix encoding of data written to git-annex branch. Avoid truncating + unicode characters to 8 bits. Allow any encoding to be used, as with + filenames (but utf8 is the sane choice). Affects metadata and repository + descriptions, and preferred content expressions. 
-- Joey Hess Mon, 19 May 2014 15:59:25 -0400 diff --git a/doc/bugs/forget_corrupts_non-ascii_chars.mdwn b/doc/bugs/forget_corrupts_non-ascii_chars.mdwn index daee6f63a..f4506c228 100644 --- a/doc/bugs/forget_corrupts_non-ascii_chars.mdwn +++ b/doc/bugs/forget_corrupts_non-ascii_chars.mdwn @@ -72,3 +72,7 @@ backend usage: """]] well that's interesting - the above paste is broken by ikiwiki as well... in the text area where i paste it, "rachel@topcrapn:~/Vidéos/anarcat" shows up as "rachel@topcrapn:~/Vidéos/anarcat" but when i preview, the character gets corrupted. and obviously, the second instance then gets *double* corrupted - wheepee. the original paste has "rachel@topcrapn:~/Vidéos/anarcat". --[[anarcat]] + +> [[fixed|done]]; writes to git-annex branch now preserve the original +> encoding, and I've tested that lots of interesting unicode is preserved +> across a forget run. --[[Joey]] diff --git a/doc/bugs/unicode_tags.mdwn b/doc/bugs/unicode_tags.mdwn index 6ecfad95a..ff3c78666 100644 --- a/doc/bugs/unicode_tags.mdwn +++ b/doc/bugs/unicode_tags.mdwn @@ -44,3 +44,6 @@ OS X 10.9 LC_TIME="en_US.UTF-8" LC_ALL= +> All strings written to the git-annex branch were truncated to 8 bits. +> I've fixed this, and this example works now (of course data written with +> an old git-annex remains truncated). [[done]] --[[Joey]] -- cgit v1.2.3