handle C-style escapes in Format

I was happily able to repurpose some code from Git.Filename to handle this. I remember writing that code... a whole afternoon at a coffee shop, after which I felt I'd struggled with Haskell and git, and sorta lost, in needing to write this nasty peice of code. But was also pleased at the use of a pair of functions and quickcheck that allowed me to get it 100% right. So, turns out I not only got it right, but the code wasn't as special-purpose as I'd feared. Yay!
author: Joey Hess <joey@kitenet.net> 2011-12-22 20:14:35 -0400
committer: Joey Hess <joey@kitenet.net> 2011-12-23 01:05:16 -0400
commit: cba3ce08dfaa3318aa80b414e4d6d4f40d843d15 (patch)
tree: cdb54dd767cb1530c88b92277b1150f961d06012 /Utility
parent: a0872a8ec34894689489dd0cf47887d8609b9f47 (diff)
1 files changed, 84 insertions, 9 deletions
diff --git a/Utility/Format.hs b/Utility/Format.hs
index 5a74da96b..804dbff4c 100644
--- a/Utility/Format.hs
+++ b/Utility/Format.hs
@@ -1,17 +1,25 @@
 {- Formatted string handling.
  -
- - Copyright 2011 Joey Hess <joey@kitenet.net>
+ - Copyright 2010, 2011 Joey Hess <joey@kitenet.net>
  -
  - Licensed under the GNU GPL version 3 or higher.
  -}
 
-module Utility.Format (Format, gen, format) where
+module Utility.Format (
+	Format,
+	gen,
+	format,
+	decode_c,
+	encode_c,
+	prop_idempotent_deencode
+) where
 
 import Text.Printf (printf)
-import Data.String.Utils (replace)
-import Data.Char (isAlphaNum)
+import Data.Char (isAlphaNum, isOctDigit, chr, ord)
+import Data.Maybe (fromMaybe)
+import Data.Word (Word8)
+import qualified Codec.Binary.UTF8.String
 import qualified Data.Map as M
-import Data.Maybe
 
 import Utility.PartialPrelude
 
@@ -52,12 +60,11 @@ format f vars = concatMap expand f
  - (This is the same type of format string used by dpkg-query.)
  -}
 gen :: FormatString -> Format
-gen = finalize . scan []
+gen = filter (not . empty) . fuse [] . scan [] . decode_c
 	where
 		-- The Format is built up in reverse, for efficiency,
-		-- To finalize it, fix the reversing and do some
-		-- optimisations, including fusing adjacent Consts.
-		finalize = filter (not . empty) . fuse []
+		-- and can have adjacent Consts. Fusing it fixes both
+		-- problems.
 		fuse f [] = f
 		fuse f (Const c1:Const c2:vs) = fuse f $ Const (c2++c1) : vs
 		fuse f (v:vs) = fuse (v:f) vs
@@ -82,3 +89,71 @@ gen = finalize . scan []
 
 		novar v = "${" ++ reverse v
 		foundvar f v p cs = scan (Var (reverse v) p : f) cs
+
+
+{- Decodes a C-style encoding, where \n is a newline, \NNN is an octal
+ - encoded character, etc.
+ -}
+decode_c :: FormatString -> FormatString
+decode_c [] = []
+decode_c s = unescape ("", s)
+	where
+		e = '\\'
+		unescape (b, []) = b
+		-- look for escapes starting with '\'
+		unescape (b, v) = b ++ fst pair ++ unescape (handle $ snd pair)
+			where
+				pair = span (/= e) v
+		isescape x = x == e
+		-- \NNN is an octal encoded character
+		handle (x:n1:n2:n3:rest)
+			| isescape x && alloctal = (fromoctal, rest)
+				where
+					alloctal = isOctDigit n1 &&
+						isOctDigit n2 &&
+						isOctDigit n3
+					fromoctal = [chr $ readoctal [n1, n2, n3]]
+					readoctal o = Prelude.read $ "0o" ++ o :: Int
+		-- \C is used for a few special characters
+		handle (x:nc:rest)
+			| isescape x = ([echar nc], rest)
+			where
+				echar 'a' = '\a'
+				echar 'b' = '\b'
+				echar 'f' = '\f'
+				echar 'n' = '\n'
+				echar 'r' = '\r'
+				echar 't' = '\t'
+				echar 'v' = '\v'
+				echar a = a
+		handle n = ("", n)
+
+{- Should not need to use this, except for testing decode_c. -}
+encode_c :: FormatString -> FormatString
+encode_c s = concatMap echar s
+	where
+		e c = '\\' : [c]
+		echar '\a' = e 'a'
+		echar '\b' = e 'b'
+		echar '\f' = e 'f'
+		echar '\n' = e 'n'
+		echar '\r' = e 'r'
+		echar '\t' = e 't'
+		echar '\v' = e 'v'
+		echar '\\' = e '\\'
+		echar '"'  = e '"'
+		echar x
+			| ord x < 0x20 = e_num x -- low ascii
+			| ord x >= 256 = e_utf x
+			| ord x > 0x7E = e_num x -- high ascii
+			| otherwise = [x]        -- printable ascii
+			where 
+				showoctal i = '\\' : printf "%03o" i
+				e_num c = showoctal $ ord c
+				-- unicode character is decomposed to
+				-- Word8s and each is shown in octal
+				e_utf c = showoctal =<< (Codec.Binary.UTF8.String.encode [c] :: [Word8])
+
+{- for quickcheck -}
+prop_idempotent_deencode :: String -> Bool
+prop_idempotent_deencode s = s == decode_c (encode_c s)
author	Joey Hess <joey@kitenet.net>	2011-12-22 20:14:35 -0400
committer	Joey Hess <joey@kitenet.net>	2011-12-23 01:05:16 -0400
commit	cba3ce08dfaa3318aa80b414e4d6d4f40d843d15 (patch)
tree	cdb54dd767cb1530c88b92277b1150f961d06012 /Utility
parent	a0872a8ec34894689489dd0cf47887d8609b9f47 (diff)