summaryrefslogtreecommitdiff
path: root/Command/Unused.hs
blob: 28a26f82c68cbe75ded1124ca945d9e1219af246 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
{- git-annex command
 -
 - Copyright 2010 Joey Hess <joey@kitenet.net>
 -
 - Licensed under the GNU GPL version 3 or higher.
 -}

module Command.Unused where

import Control.Monad (filterM, unless)
import Control.Monad.State (liftIO)
import qualified Data.Set as S
import Data.Maybe
import System.FilePath
import System.Directory

import Command
import Types
import Content
import Messages
import Locations
import Utility
import qualified Annex
import qualified GitRepo as Git
import qualified Backend

command :: [Command]
command = [Command "unused" paramNothing seek "look for unused file content"]

seek :: [CommandSeek]
seek = [withNothing start]

{- Finds unused content in the annex. -} 
start :: CommandStartNothing
start = do
	showStart "unused" ""
	return $ Just perform

perform :: CommandPerform
perform = do
	_ <- checkUnused
	return $ Just $ return True

checkUnused :: Annex Bool
checkUnused = do
	showNote "checking for unused data..."
	(unused, staletmp) <- unusedKeys
	let unusedlist = number 0 unused
	let staletmplist = number (length unused) staletmp
	let list = unusedlist ++ staletmplist
	g <- Annex.gitRepo
	liftIO $ safeWriteFile (gitAnnexUnusedLog g) $ unlines $ 
		map (\(n, k) -> show n ++ " " ++ show k) list
	unless (null unused) $ showLongNote $ unusedmsg unusedlist
	unless (null staletmp) $ showLongNote $ staletmpmsg staletmplist
	unless (null list) $ showLongNote $ "\n"
	return $ null list

	where
		unusedmsg u = unlines $
			["Some annexed data is no longer pointed to by any files in the repository:"]
			++ table u ++
			["(To see where data was previously used, try: git log --stat -S'KEY')"] ++
			dropmsg
		staletmpmsg t = unlines $ 
			["Some partially transferred data exists in temporary files:"]
			++ table t ++ dropmsg
		dropmsg = ["(To remove unwanted data: git-annex dropunused NUMBER)"]
			
		table l = ["  NUMBER  KEY"] ++ map cols l
		cols (n,k) = "  " ++ pad 6 (show n) ++ "  " ++ show k
		pad n s = s ++ replicate (n - length s) ' '

number :: Int -> [a] -> [(Int, a)]
number _ [] = []
number n (x:xs) = (n+1, x):(number (n+1) xs)

{- Finds keys whose content is present, but that do not seem to be used
 - by any files in the git repo, or that are only present as tmp files. -}
unusedKeys :: Annex ([Key], [Key])
unusedKeys = do
	g <- Annex.gitRepo
	present <- getKeysPresent
	referenced <- getKeysReferenced
	tmps <- tmpKeys
	
	let (unused, staletmp, duptmp) = calcUnusedKeys present referenced tmps

	-- Tmp files that are dups of content already present can simply
	-- be removed.
	_ <- liftIO $ mapM (\t -> removeFile $ gitAnnexTmpLocation g t) duptmp

	return (unused, staletmp)

calcUnusedKeys :: [Key] -> [Key] -> [Key] -> ([Key], [Key], [Key])
calcUnusedKeys present referenced tmps = (unused, staletmp, duptmp)
	where
		unused = present `exclude` referenced
		staletmp = tmps `exclude` present
		duptmp = tmps `exclude` staletmp

		-- Constructing a single set, of the list that tends to be
		-- smaller, appears more efficient in both memory and CPU
		-- than constructing and taking the S.difference of two sets.
		exclude [] _ = [] -- optimisation
		exclude smaller larger = S.toList $ remove larger $ S.fromList smaller
		remove a b = foldl (flip S.delete) b a

{- List of keys referenced by symlinks in the git repo. -}
getKeysReferenced :: Annex [Key]
getKeysReferenced = do
	g <- Annex.gitRepo
	files <- liftIO $ Git.inRepo g [Git.workTree g]
	keypairs <- mapM Backend.lookupFile files
	return $ map fst $ catMaybes keypairs

{- List of keys that have temp files in the git repo. -}
tmpKeys :: Annex [Key]
tmpKeys = do
	g <- Annex.gitRepo
	let tmp = gitAnnexTmpDir g
	exists <- liftIO $ doesDirectoryExist tmp
	if (not exists)
		then return []
		else do
			contents <- liftIO $ getDirectoryContents tmp
			files <- liftIO $ filterM doesFileExist $
				map (tmp </>) contents
			return $ map (fileKey . takeFileName) files