aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorGravatar Joey Hess <joey@kitenet.net>2013-10-05 15:01:49 -0400
committerGravatar Joey Hess <joey@kitenet.net>2013-10-05 15:01:49 -0400
commit9f1e2760c60294311c08b5fb8e3a49c773ad89aa (patch)
treec9640e4dece4f84cfec669bfc8035a3c63b7e9bc
parentf9c3d7562f1ddda3adb8aa892fd40584e9b48ac0 (diff)
Better sanitization of problem characters when generating URL and WORM keys.
FAT has a lot of characters it does not allow in filenames, like ? and * It's probably the worst offender, but other filesystems also have limitiations. In 2011, I made keyFile escape : to handle FAT, but missed the other characters. It also turns out that when I did that, I was also living dangerously; any existing keys that contained a : had their object location change. Oops. So, adding new characters to escape to keyFile is out. Well, it would be possible to make keyFile behave differently on a per-filesystem basis, but this would be a real nightmare to get right. Consider that a rsync special remote uses keyFile to determine the filenames to use, and we don't know the underlying filesystem on the rsync server.. Instead, I have gone for a solution that is backwards compatable and simple. Its only downside is that already generated URL and WORM keys might not be able to be stored on FAT or some other filesystem that dislikes a character used in the key. (In this case, the user can just migrate the problem keys to a checksumming backend. If this became a big problem, fsck could be made to detect these and suggest a migration.) Going forward, new keys that are created will escape all characters that are likely to cause problems. And if some filesystem comes along that's even worse than FAT (seems unlikely, but here it is 2013, and people are still using FAT!), additional characters can be added to the set that are escaped without difficulty. (Also, made WORM limit the part of the filename that is embedded in the key, to deal with filesystem filename length limits. This could have already been a problem, but is more likely now, since the escaping of the filename can make it longer.) This commit was sponsored by Ian Downes
-rw-r--r--Backend/URL.hs17
-rw-r--r--Backend/Utilities.hs25
-rw-r--r--Backend/WORM.hs14
-rw-r--r--Locations.hs34
-rw-r--r--debian/changelog2
-rw-r--r--doc/bugs/direct_repository_on_FAT32_fails_to_addurl_containing___63__.mdwn2
6 files changed, 74 insertions, 20 deletions
diff --git a/Backend/URL.hs b/Backend/URL.hs
index ace578a24..a8161c98d 100644
--- a/Backend/URL.hs
+++ b/Backend/URL.hs
@@ -10,11 +10,10 @@ module Backend.URL (
fromUrl
) where
-import Data.Hash.MD5
-
import Common.Annex
import Types.Backend
import Types.Key
+import Backend.Utilities
backends :: [Backend]
backends = [backend]
@@ -27,18 +26,12 @@ backend = Backend
, canUpgradeKey = Nothing
}
-{- When it's not too long, use the full url as the key name.
- - If the url is too long, it's truncated at half the filename length
- - limit, and the md5 of the url is prepended to ensure a unique key. -}
+{- Every unique url has a corresponding key. -}
fromUrl :: String -> Maybe Integer -> Annex Key
fromUrl url size = do
- limit <- liftIO . fileNameLengthLimit =<< fromRepo gitAnnexDir
- let truncurl = truncateFilePath (limit `div` 2) url
- let key = if url == truncurl
- then url
- else truncurl ++ "-" ++ md5s (Str url)
+ n <- genKeyName url
return $ stubKey
- { keyName = key
+ { keyName = n
, keyBackendName = "URL"
, keySize = size
- }
+ }
diff --git a/Backend/Utilities.hs b/Backend/Utilities.hs
new file mode 100644
index 000000000..24dbfd6d9
--- /dev/null
+++ b/Backend/Utilities.hs
@@ -0,0 +1,25 @@
+{- git-annex backend utilities
+ -
+ - Copyright 2012 Joey Hess <joey@kitenet.net>
+ -
+ - Licensed under the GNU GPL version 3 or higher.
+ -}
+
+module Backend.Utilities where
+
+import Data.Hash.MD5
+
+import Common.Annex
+
+{- Generates a keyName from an input string. Takes care of sanitizing it.
+ - If it's not too long, the full string is used as the keyName.
+ - Otherwise, it's truncated at half the filename length limit, and its
+ - md5 is prepended to ensure a unique key. -}
+genKeyName :: String -> Annex String
+genKeyName s = do
+ limit <- liftIO . fileNameLengthLimit =<< fromRepo gitAnnexDir
+ let s' = preSanitizeKeyName s
+ let truncs = truncateFilePath (limit `div` 2) s'
+ return $ if s' == truncs
+ then s'
+ else truncs ++ "-" ++ md5s (Str s)
diff --git a/Backend/WORM.hs b/Backend/WORM.hs
index 3471eedc1..60db42f56 100644
--- a/Backend/WORM.hs
+++ b/Backend/WORM.hs
@@ -11,6 +11,7 @@ import Common.Annex
import Types.Backend
import Types.Key
import Types.KeySource
+import Backend.Utilities
backends :: [Backend]
backends = [backend]
@@ -33,9 +34,10 @@ backend = Backend
keyValue :: KeySource -> Annex (Maybe Key)
keyValue source = do
stat <- liftIO $ getFileStatus $ contentLocation source
- return $ Just Key {
- keyName = takeFileName $ keyFilename source,
- keyBackendName = name backend,
- keySize = Just $ fromIntegral $ fileSize stat,
- keyMtime = Just $ modificationTime stat
- }
+ n <- genKeyName $ keyFilename source
+ return $ Just Key
+ { keyName = n
+ , keyBackendName = name backend
+ , keySize = Just $ fromIntegral $ fileSize stat
+ , keyMtime = Just $ modificationTime stat
+ }
diff --git a/Locations.hs b/Locations.hs
index 36104458e..9b7bb575b 100644
--- a/Locations.hs
+++ b/Locations.hs
@@ -51,6 +51,7 @@ module Locations (
annexHashes,
hashDirMixed,
hashDirLower,
+ preSanitizeKeyName,
prop_idempotent_fileKey
) where
@@ -58,6 +59,7 @@ module Locations (
import Data.Bits
import Data.Word
import Data.Hash.MD5
+import Data.Char
import Common
import Types
@@ -284,6 +286,32 @@ gitAnnexAssistantDefaultDir = "annex"
isLinkToAnnex :: FilePath -> Bool
isLinkToAnnex s = (pathSeparator:objectDir) `isInfixOf` s
+{- Sanitizes a String that will be used as part of a Key's keyName,
+ - dealing with characters that cause problems on substandard filesystems.
+ -
+ - This is used when a new Key is initially being generated, eg by getKey.
+ - Unlike keyFile and fileKey, it does not need to be a reversable
+ - escaping. Also, it's ok to change this to add more problimatic
+ - characters later. Unlike changing keyFile, which could result in the
+ - filenames used for existing keys changing and contents getting lost.
+ -
+ - It is, however, important that the input and output of this function
+ - have a 1:1 mapping, to avoid two different inputs from mapping to the
+ - same key.
+ -}
+preSanitizeKeyName :: String -> String
+preSanitizeKeyName = concatMap escape
+ where
+ escape c
+ | isAsciiUpper c || isAsciiLower c || isDigit c = [c]
+ | c `elem` ".-_ " = [c] -- common, assumed safe
+ | c `elem` "/%:" = [c] -- handled by keyFile
+ -- , is safe and uncommon, so will be used to escape
+ -- other characters. By itself, it is escaped to
+ -- doubled form.
+ | c == ',' = ",,"
+ | otherwise = ',' : show(ord(c))
+
{- Converts a key into a filename fragment without any directory.
-
- Escape "/" in the key name, to keep a flat tree of files and avoid
@@ -293,8 +321,10 @@ isLinkToAnnex s = (pathSeparator:objectDir) `isInfixOf` s
- a slash
- "%" is escaped to "&s", and "&" to "&a"; this ensures that the mapping
- is one to one.
- - ":" is escaped to "&c", because despite it being 20XX people still care
- - about FAT.
+ - ":" is escaped to "&c", because it seemed like a good idea at the time.
+ -
+ - Changing what this function escapes and how is not a good idea, as it
+ - can cause existing objects to get lost.
-}
keyFile :: Key -> FilePath
keyFile key = replace "/" "%" $ replace ":" "&c" $
diff --git a/debian/changelog b/debian/changelog
index 7582e3e23..4bbdffb3f 100644
--- a/debian/changelog
+++ b/debian/changelog
@@ -7,6 +7,8 @@ git-annex (4.20131003) UNRELEASED; urgency=low
and remove it so it does not interfere with the automatic
commits of changed files.
* addurl: Better sanitization of generated filenames.
+ * Better sanitization of problem characters when generating URL and WORM
+ keys.
-- Joey Hess <joeyh@debian.org> Thu, 03 Oct 2013 15:41:24 -0400
diff --git a/doc/bugs/direct_repository_on_FAT32_fails_to_addurl_containing___63__.mdwn b/doc/bugs/direct_repository_on_FAT32_fails_to_addurl_containing___63__.mdwn
index 19215bf0e..1afb9ac14 100644
--- a/doc/bugs/direct_repository_on_FAT32_fails_to_addurl_containing___63__.mdwn
+++ b/doc/bugs/direct_repository_on_FAT32_fails_to_addurl_containing___63__.mdwn
@@ -40,3 +40,5 @@ git-annex: addurl: 1 failed
# End of transcript or log.
"""]]
+
+> [[fixed|done]] --[[Joey]]