diff options
author | Joey Hess <joey@kitenet.net> | 2012-06-20 13:15:59 -0400 |
---|---|---|
committer | Joey Hess <joey@kitenet.net> | 2012-06-20 13:15:59 -0400 |
commit | 483b1b08c6fad82d5e17aaaded398bdad94124f1 (patch) | |
tree | 1dc6d610a7947c2169d51e9ef7b575e315f8d34c | |
parent | 5580af5789427fc5fd7cd74fd4a2529668621a68 (diff) | |
parent | 75b6ee81f9d9b921106c829380e30445415ec2f7 (diff) |
Merge branch 'master' into watch
24 files changed, 225 insertions, 48 deletions
diff --git a/Annex/CatFile.hs b/Annex/CatFile.hs index afb14c67f..88c498d31 100644 --- a/Annex/CatFile.hs +++ b/Annex/CatFile.hs @@ -12,7 +12,7 @@ module Annex.CatFile ( catFileHandle ) where -import qualified Data.ByteString.Lazy.Char8 as L +import qualified Data.ByteString.Lazy as L import Common.Annex import qualified Git diff --git a/Command/Unused.hs b/Command/Unused.hs index 03a709534..b115eee83 100644 --- a/Command/Unused.hs +++ b/Command/Unused.hs @@ -10,8 +10,7 @@ module Command.Unused where import qualified Data.Set as S -import qualified Data.Text.Lazy as L -import qualified Data.Text.Lazy.Encoding as L +import qualified Data.ByteString.Lazy as L import Data.BloomFilter import Data.BloomFilter.Easy import Data.BloomFilter.Hash @@ -265,8 +264,9 @@ withKeysReferencedInGitRef a ref = do go [] = noop go (l:ls) | isSymLink (LsTree.mode l) = do - content <- L.decodeUtf8 <$> catFile ref (LsTree.file l) - case fileKey (takeFileName $ L.unpack content) of + content <- encodeW8 . L.unpack + <$> catFile ref (LsTree.file l) + case fileKey (takeFileName content) of Nothing -> go ls Just k -> do a k @@ -26,7 +26,7 @@ module Crypto ( prop_hmacWithCipher_sane ) where -import qualified Data.ByteString.Lazy.Char8 as L +import qualified Data.ByteString.Lazy as L import Data.ByteString.Lazy.UTF8 (fromString) import Data.Digest.Pure.SHA import Control.Applicative diff --git a/Git/CatFile.hs b/Git/CatFile.hs index 8a320a712..e667b2087 100644 --- a/Git/CatFile.hs +++ b/Git/CatFile.hs @@ -15,8 +15,8 @@ module Git.CatFile ( ) where import System.IO -import qualified Data.ByteString.Char8 as S -import qualified Data.ByteString.Lazy.Char8 as L +import qualified Data.ByteString as S +import qualified Data.ByteString.Lazy as L import Common import Git diff --git a/Remote/Bup.hs b/Remote/Bup.hs index 3e7e9211f..f1a36e468 100644 --- a/Remote/Bup.hs +++ b/Remote/Bup.hs @@ -7,7 +7,7 @@ module Remote.Bup (remote) where -import qualified Data.ByteString.Lazy.Char8 as L +import qualified 
Data.ByteString.Lazy as L import qualified Data.Map as M import System.Process diff --git a/Remote/Directory.hs b/Remote/Directory.hs index 7521e7013..a5b0ff2a2 100644 --- a/Remote/Directory.hs +++ b/Remote/Directory.hs @@ -7,8 +7,8 @@ module Remote.Directory (remote) where -import qualified Data.ByteString.Lazy.Char8 as L -import qualified Data.ByteString.Char8 as S +import qualified Data.ByteString.Lazy as L +import qualified Data.ByteString as S import qualified Data.Map as M import qualified Control.Exception as E diff --git a/Remote/Hook.hs b/Remote/Hook.hs index 1202c6087..5fb793e65 100644 --- a/Remote/Hook.hs +++ b/Remote/Hook.hs @@ -7,7 +7,7 @@ module Remote.Hook (remote) where -import qualified Data.ByteString.Lazy.Char8 as L +import qualified Data.ByteString.Lazy as L import qualified Data.Map as M import System.Exit import System.Environment diff --git a/Remote/Rsync.hs b/Remote/Rsync.hs index 3c449b5de..6207e1425 100644 --- a/Remote/Rsync.hs +++ b/Remote/Rsync.hs @@ -7,7 +7,7 @@ module Remote.Rsync (remote) where -import qualified Data.ByteString.Lazy.Char8 as L +import qualified Data.ByteString.Lazy as L import qualified Data.Map as M import Common.Annex diff --git a/Utility/FileSystemEncoding.hs b/Utility/FileSystemEncoding.hs index cf1a6a731..d027ede48 100644 --- a/Utility/FileSystemEncoding.hs +++ b/Utility/FileSystemEncoding.hs @@ -13,6 +13,8 @@ import Foreign.C import System.IO import System.IO.Unsafe import qualified Data.Hash.MD5 as MD5 +import Data.Word +import Data.Bits.Utils {- Sets a Handle to use the filesystem encoding. This causes data - written or read from it to be encoded/decoded the same @@ -29,7 +31,7 @@ withFilePath :: FilePath -> (CString -> IO a) -> IO a withFilePath fp f = Encoding.getFileSystemEncoding >>= \enc -> GHC.withCString enc fp f -{- Encodes a FilePath into a Str, applying the filesystem encoding. +{- Encodes a FilePath into a Md5.Str, applying the filesystem encoding. 
- - This use of unsafePerformIO is believed to be safe; GHC's interface - only allows doing this conversion with CStrings, and the CString buffer @@ -41,3 +43,15 @@ encodeFilePath :: FilePath -> MD5.Str encodeFilePath fp = MD5.Str $ unsafePerformIO $ do enc <- Encoding.getFileSystemEncoding GHC.withCString enc fp $ GHC.peekCString Encoding.char8 + +{- Converts a [Word8] to a FilePath, encoding using the filesystem encoding. + - + - w82s produces a String, which may contain Chars that are invalid + - unicode. From there, this is really a simple matter of applying the + - file system encoding, only complicated by GHC's interface to doing so. + -} +{-# NOINLINE encodeW8 #-} +encodeW8 :: [Word8] -> FilePath +encodeW8 w8 = unsafePerformIO $ do + enc <- Encoding.getFileSystemEncoding + GHC.withCString Encoding.char8 (w82s w8) $ GHC.peekCString enc diff --git a/Utility/Gpg.hs b/Utility/Gpg.hs index ff6735ba5..e13afe5d4 100644 --- a/Utility/Gpg.hs +++ b/Utility/Gpg.hs @@ -7,7 +7,7 @@ module Utility.Gpg where -import qualified Data.ByteString.Lazy.Char8 as L +import qualified Data.ByteString.Lazy as L import System.Posix.Types import Control.Applicative import Control.Concurrent diff --git a/debian/changelog b/debian/changelog index f756a8538..a0e15946f 100644 --- a/debian/changelog +++ b/debian/changelog @@ -5,6 +5,7 @@ git-annex (3.20120616) UNRELEASED; urgency=low need to manually run git commands when manipulating files. Available on Linux, BSDs, and OSX! * Enable diskfree on kfreebsd, using statvfs. + * unused: Fix crash when key names contain invalid utf8. -- Joey Hess <joeyh@debian.org> Tue, 12 Jun 2012 11:35:59 -0400 diff --git a/doc/bugs/git_annex_unused_aborts_due_to_filename_encoding_problems.mdwn b/doc/bugs/git_annex_unused_aborts_due_to_filename_encoding_problems.mdwn new file mode 100644 index 000000000..fb0bdb093 --- /dev/null +++ b/doc/bugs/git_annex_unused_aborts_due_to_filename_encoding_problems.mdwn @@ -0,0 +1,15 @@ +What steps will reproduce the problem? 
+I don't know exactly when it started + +What is the expected output? What do you see instead? +When I run git annex unused I get + + unused . (checking for unused data...) (checking master...) git-annex: Cannot decode byte '\xb4': Data.Text.Encoding.decodeUtf8: Invalid UTF-8 stream + +Most likely I have added some file with a strange encoding that git-annex can't decode. The problem is that the unused process aborts because of this. + +What version of git-annex are you using? On what operating system? + 3.20120522, Debian testing + +> I've just fixed this bug in git, will be in the next release. --[[Joey]] +> [[done]] diff --git a/doc/bugs/git_annex_unused_aborts_due_to_filename_encoding_problems/comment_1_8ba4fdb9f2d3bd44db5e910526cb9124._comment b/doc/bugs/git_annex_unused_aborts_due_to_filename_encoding_problems/comment_1_8ba4fdb9f2d3bd44db5e910526cb9124._comment new file mode 100644 index 000000000..ddea8225e --- /dev/null +++ b/doc/bugs/git_annex_unused_aborts_due_to_filename_encoding_problems/comment_1_8ba4fdb9f2d3bd44db5e910526cb9124._comment @@ -0,0 +1,8 @@ +[[!comment format=mdwn + username="http://joeyh.name/" + ip="4.154.2.6" + subject="comment 1" + date="2012-06-20T14:30:27Z" + content=""" +Try running `git annex unused --debug`; this will tell us the git command that's outputting the data it cannot process. Then you can try running that git command and see what the problem filename is. 
+"""]] diff --git a/doc/bugs/git_annex_unused_aborts_due_to_filename_encoding_problems/comment_2_2a4a2b3e287a0444a1c8e8d98768a206._comment b/doc/bugs/git_annex_unused_aborts_due_to_filename_encoding_problems/comment_2_2a4a2b3e287a0444a1c8e8d98768a206._comment new file mode 100644 index 000000000..8afe3143c --- /dev/null +++ b/doc/bugs/git_annex_unused_aborts_due_to_filename_encoding_problems/comment_2_2a4a2b3e287a0444a1c8e8d98768a206._comment @@ -0,0 +1,8 @@ +[[!comment format=mdwn + username="http://joeyh.name/" + ip="4.154.2.6" + subject="comment 2" + date="2012-06-20T14:34:23Z" + content=""" +Your `locale` setting may also be relevant. FWIW, I've tried to create a file with `\xb4` in its name and have not gotten git-annex unused to crash on it. +"""]] diff --git a/doc/bugs/git_annex_unused_aborts_due_to_filename_encoding_problems/comment_3_dacfdb8322045fc4ceefc9128bf7c505._comment b/doc/bugs/git_annex_unused_aborts_due_to_filename_encoding_problems/comment_3_dacfdb8322045fc4ceefc9128bf7c505._comment new file mode 100644 index 000000000..8e2aa285a --- /dev/null +++ b/doc/bugs/git_annex_unused_aborts_due_to_filename_encoding_problems/comment_3_dacfdb8322045fc4ceefc9128bf7c505._comment @@ -0,0 +1,17 @@ +[[!comment format=mdwn + username="https://www.google.com/accounts/o8/id?id=AItOawnXgp-iIaBK5pnk22xqMVERQb97VyXaejs" + nickname="Kristian" + subject="comment 3" + date="2012-06-20T14:37:09Z" + content=""" +This is what happens when I add the debug parameter + +git annex unused --debug + +unused . (checking for unused data...) git [\"--git-dir=/home/kristian/AnnexMedia/.git\",\"--work-tree=/home/kristian/AnnexMedia\",\"ls-files\",\"--cached\",\"-z\",\"--\",\"/home/kristian/AnnexMedia\"] +git [\"--git-dir=/home/kristian/AnnexMedia/.git\",\"--work-tree=/home/kristian/AnnexMedia\",\"show-ref\"] +(checking master...) 
git [\"--git-dir=/home/kristian/AnnexMedia/.git\",\"--work-tree=/home/kristian/AnnexMedia\",\"ls-tree\",\"--full-tree\",\"-z\",\"-r\",\"--\",\"refs/heads/master\"] +git [\"--git-dir=/home/kristian/AnnexMedia/.git\",\"--work-tree=/home/kristian/AnnexMedia\",\"cat-file\",\"--batch\"] +git-annex: Cannot decode byte '\xb4': Data.Text.Encoding.decodeUtf8: Invalid UTF-8 stream + +"""]] diff --git a/doc/bugs/git_annex_unused_aborts_due_to_filename_encoding_problems/comment_4_7889a3ff5ce80c6322448aa674df8525._comment b/doc/bugs/git_annex_unused_aborts_due_to_filename_encoding_problems/comment_4_7889a3ff5ce80c6322448aa674df8525._comment new file mode 100644 index 000000000..da97b12f7 --- /dev/null +++ b/doc/bugs/git_annex_unused_aborts_due_to_filename_encoding_problems/comment_4_7889a3ff5ce80c6322448aa674df8525._comment @@ -0,0 +1,10 @@ +[[!comment format=mdwn + username="http://joeyh.name/" + ip="4.154.2.6" + subject="comment 4" + date="2012-06-20T14:49:09Z" + content=""" +Ah, reproduced it; need to use the WORM backend and have the file present in another branch.. 
+ + +"""]] diff --git a/doc/bugs/git_annex_unused_aborts_due_to_filename_encoding_problems/comment_5_6d28c2537ce24eeb3496ca349823defd._comment b/doc/bugs/git_annex_unused_aborts_due_to_filename_encoding_problems/comment_5_6d28c2537ce24eeb3496ca349823defd._comment new file mode 100644 index 000000000..fafd1d248 --- /dev/null +++ b/doc/bugs/git_annex_unused_aborts_due_to_filename_encoding_problems/comment_5_6d28c2537ce24eeb3496ca349823defd._comment @@ -0,0 +1,19 @@ +[[!comment format=mdwn + username="https://www.google.com/accounts/o8/id?id=AItOawnXgp-iIaBK5pnk22xqMVERQb97VyXaejs" + nickname="Kristian" + subject="comment 5" + date="2012-06-20T14:55:33Z" + content=""" +I checked out the git annex branch and using + + find * | grep -P \"[\xb4]\" + +I found a file + + 43e/b16/WORM-s4118528-m1245167306--Jerry Lee Lewis - Whole Lotta Shakin\302\264 Going\302\264 On.mp3.log + +The corresponding file also existed in the master branch (as a link). + +I moved both these files to a folder outside my repository and synched my git-annex branch with my master server. I still get the same error. Is there any other place where information about this file is stored? + +"""]] diff --git a/doc/bugs/git_annex_unused_aborts_due_to_filename_encoding_problems/comment_6_4bf14ecef622988e80976c0fb55c24b9._comment b/doc/bugs/git_annex_unused_aborts_due_to_filename_encoding_problems/comment_6_4bf14ecef622988e80976c0fb55c24b9._comment new file mode 100644 index 000000000..b35e31da6 --- /dev/null +++ b/doc/bugs/git_annex_unused_aborts_due_to_filename_encoding_problems/comment_6_4bf14ecef622988e80976c0fb55c24b9._comment @@ -0,0 +1,10 @@ +[[!comment format=mdwn + username="http://joeyh.name/" + ip="4.154.2.6" + subject="comment 6" + date="2012-06-20T16:59:53Z" + content=""" +git-annex was not crashing due to content in the git-annex branch, but due to a symlink in one of your regular git branches, probably master and origin/master. 
+ +This bug is fixed in git master, if you need the fix before the next release. +"""]] diff --git a/doc/bugs/watch_command_on_OSX_10.7.mdwn b/doc/bugs/watch_command_on_OSX_10.7.mdwn new file mode 100644 index 000000000..1348c1155 --- /dev/null +++ b/doc/bugs/watch_command_on_OSX_10.7.mdwn @@ -0,0 +1,37 @@ +Running the tip of the watch branch on OSX in an annex'ed directory. + +The watch command detects the changes, does _something_, see the output below. + +Output from watch command + +<pre> +(Recording state in git...) +Added "./KeePass2.18.dmg" +Added "./KeePassX-0.4.3.dmg" +add ./KeePass2.18.dmg (checksum...) ok +add ./KeePassX-0.4.3.dmg (checksum...) ok +</pre> + +State of the annex + +<pre> +laplace:annex jtang$ git status +# On branch master +# Untracked files: +# (use "git add <file>..." to include in what will be committed) +# +# KeePass2.18.dmg +# KeePassX-0.4.3.dmg +nothing added to commit but untracked files present (use "git add" to track) +</pre> + +It seems to not do a git add and commit after the creation of the symlinks, manually doing this makes it all happy again till more files are added. + +note: i had posted a comment in the blog post, but posting the issue here is probably more appropriate. + +> Yeah, this is the issue I was struggling with last night. +> I think it's fixed in 57cf65eb6d811ba7fd19eb62a54e3b83a0c2dfa7, +> but the kqueue watch still needs a lot of work. --[[Joey]] + +>> Confirmed this is fixed, but do note the known kqueue bugs in +>> [[design/assistant/inotify]]! [[done]] --[[Joey]] diff --git a/doc/design/assistant/blog/day_13__kqueue_continued.mdwn b/doc/design/assistant/blog/day_13__kqueue_continued.mdwn new file mode 100644 index 000000000..fd0cbb372 --- /dev/null +++ b/doc/design/assistant/blog/day_13__kqueue_continued.mdwn @@ -0,0 +1,34 @@ +Good news! My beta testers report that the new kqueue code works on OSX. +At least "works" as well as it does on Debian kFreeBSD. 
My crazy +development strategy of developing on Debian kFreeBSD while targeting Mac +OSX is vindicated. ;-) + +So, I've been beating the kqueue code into shape for the last 12 hours, +minus a few hours sleep. + +First, I noticed it was seeming to starve the other threads. I'm using +Haskell's non-threaded runtime, which does cooperative multitasking between +threads, and my C code was never returning to let the other threads run. +Changed that around, so the C code runs until SIGALARMed, and then that +thread calls `yield` before looping back into the C code. Wow, cooperative +multitasking.. I last dealt with that when programming for Windows 3.1! +(Should try to use Haskell's -threaded runtime sometime, but git-annex +doesn't work under it, and I have not tried to figure out why not.) + +Then I made a [single commit](http://source.git-annex.branchable.com/?p=source.git;a=commitdiff;h=2bfcc0b09c5dd37c5e0ab65cb089232bfcc31934), +with no testing, in which I made the kqueue code maintain a cache of what +it expects in the directory tree, and use that to determine what files +changed how when a change is detected. Serious code. It worked on the +first go. If you were wondering why I'm writing in Haskell ... yeah, +that's why. + +And I've continued to hammer on the kqueue code, making lots of little +fixes, and at this point it seems *almost* able to handle the changes I +throw at it. It does have one big remaining problem; kqueue doesn't tell me +when a writer closes a file, so it will sometimes miss adding files. To fix +this, I'm going to need to make it maintain a queue of new files, and +periodically check them, with `lsof`, to see when they're done being +written to, and add them to the annex. So while a file is being written +to, `git annex watch` will have to wake up every second or so, and run +`lsof` ... and it'll take it at least 1 second to notice a file's complete. +Not ideal, but the best that can be managed with kqueue. 
diff --git a/doc/design/assistant/inotify.mdwn b/doc/design/assistant/inotify.mdwn index 9d3db9192..0b55298a3 100644 --- a/doc/design/assistant/inotify.mdwn +++ b/doc/design/assistant/inotify.mdwn @@ -13,6 +13,14 @@ There is a `watch` branch in git that adds the command. * When you `git annex unlock` a file, it will immediately be re-locked. +* With kqueue, if a file is created and still has a writer, it'll + give up adding it, and it will never get added. This is because kqueue + cannot track file closes. Need to go back and check these files every + second or something. + +* Kqueue has to open every directory it watches, so too many directories + will run it out of the max number of open files (typically 1024), and fail. + ## beyond Linux I'd also like to support OSX and if possible the BSDs. @@ -58,40 +66,8 @@ I'd also like to support OSX and if possible the BSDs. * Windows has a Win32 ReadDirectoryChangesW, and perhaps other things. -## beyond Linux - -I'd also like to support OSX and if possible the BSDs. - -* kqueue ([haskell bindings](http://hackage.haskell.org/package/kqueue)) - is supported by FreeBSD, OSX, and other BSDs. - - From what I can find, kqueue does not provide full directory watching - capabilities. To watch a file, you have to have an open file descriptor - to the file. This wouldn't scale. - - Gamin does the best it can with just kqueue, supplimented by polling. - The source file `server/gam_kqueue.c` makes for interesting reading. - Using gamin to do the heavy lifting is one option. - ([haskell bindings](http://hackage.haskell.org/package/hlibfam) for FAM; - gamin shares the API) - -* hfsevents ([haskell bindings](http://hackage.haskell.org/package/hfsevents)) - is OSX specific. - - Originally it was only directory level, and you were only told a - directory had changed and not which file. Based on the haskell - binding's code, from OSX 10.7.0, file level events were added. 
- - This will be harder for me to develop for, since I don't have access to - OSX machines.. - -* Windows has a Win32 ReadDirectoryChangesW, and perhaps other things. - ## todo -- Support OSes other than Linux; it only uses inotify currently. - OSX and FreeBSD use the same mechanism, and there is a Haskell interface - for it, - Run niced and ioniced? Seems to make sense, this is a background job. - configurable option to only annex files meeting certain size or filename criteria diff --git a/doc/forum/Wishlist:_automatic_reinject.mdwn b/doc/forum/Wishlist:_automatic_reinject.mdwn new file mode 100644 index 000000000..f975c7521 --- /dev/null +++ b/doc/forum/Wishlist:_automatic_reinject.mdwn @@ -0,0 +1,14 @@ +I think it would be useful to supplement the `reinject` command with an automatic +mode which calculates the checksum of the source file and injects the file if it +is known to the repository (without the need to provide a destination filename). +In addition, this could be done recursively if the user provides a directory to +inject. All this can probably be done already with some plumbing, but a simple +`reinject --auto` (or `scour`, or `scavenge`, if you like) would be a nice addition. +Of course this would only work for the checksum backends. + +Example use cases would be: + +* Recovering data from lost+found easily +* Making use of old (pre-git-annex) archival volumes with useful files + scattered among non-useful files +* Sneaker-netting files between disconnected git-annex repositories diff --git a/doc/forum/autobuilders_for_git-annex_to_aid_development.mdwn b/doc/forum/autobuilders_for_git-annex_to_aid_development.mdwn index 8cd370937..2c1280e51 100644 --- a/doc/forum/autobuilders_for_git-annex_to_aid_development.mdwn +++ b/doc/forum/autobuilders_for_git-annex_to_aid_development.mdwn @@ -31,4 +31,4 @@ if [ "$?" 
= 1 ]; then fi </pre> -It's also using the branches-local script for sorting and prioritising the branches to build, this branches-local script can be found at the [autobuild-ceph](https://github.com/ceph/autobuild-ceph/blob/master/branches-local) repository. If there are other people interested in setting up their own instances of gitbuilder for git-annex, please let me know and I will setup an aggregator page to collect status of the builds. The builder runs and updates the webpage every 30mins. +It's also using the branches-local script for sorting and prioritising the branches to build, this branches-local script can be found at the [autobuild-ceph](https://github.com/ceph/autobuild-ceph/blob/master/branches-local) repository. If there are other people interested in setting up their own instances of gitbuilder for git-annex, please let me know and I will setup an aggregator page to collect status of the builds. The builder runs and updates on a very regular basis. diff --git a/doc/install/OSX/comment_7_2ce7acab15403d3f993cec94ec7f3bc6._comment b/doc/install/OSX/comment_7_2ce7acab15403d3f993cec94ec7f3bc6._comment new file mode 100644 index 000000000..32093ee51 --- /dev/null +++ b/doc/install/OSX/comment_7_2ce7acab15403d3f993cec94ec7f3bc6._comment @@ -0,0 +1,14 @@ +[[!comment format=mdwn + username="http://www.davidhaslem.com/" + nickname="David" + subject="comment 7" + date="2012-06-19T04:41:27Z" + content=""" +$(brew --prefix) should, in most cases, be /usr/local. That's the recommended install location for homebrew. + +I already had git installed and homebrew as my package manager - my install steps were as follows: + +1. brew install haskell-platform ossp-uuid md5sha1sum coreutils pcre +2. PATH=\"$(brew --prefix coreutils)/libexec/gnubin:$PATH\" cabal install git-annex + +"""]] |