{-# LANGUAGE BangPatterns #-} -------------------------------------------------------------------- -- | -- Module : Codec.Mbox -- Copyright : (c) Nicolas Pouillard 2008, 2009 -- License : BSD3 -- -- Maintainer: Nicolas Pouillard -- Stability : provisional -- Portability: -- -------------------------------------------------------------------- module Codec.Mbox ( -- * Mailbox, message, and direction data types Mbox(..) , MboxMessage(..) , Direction(..) -- * Mailbox parsing functions , parseMboxFile , parseMboxFiles , parseMbox , safeParseMbox , parseOneMboxMessage -- * Mailbox printing functions , showMbox , showMboxMessage , showMboxFromLine -- * First-class labels , mboxMsgSender , mboxMsgTime , mboxMsgBody , mboxMsgFile , mboxMsgOffset -- * Misc , Month(..) , fromQuoting , msgYear , msgMonthYear , opposite ) where import Control.Arrow (first,second) import Control.Applicative ((<$>)) import qualified Data.ByteString.Lazy.Char8 as C -- Char8 interface over Lazy ByteString's import Data.ByteString.Lazy (ByteString) import Data.Int (Int64) import Data.Maybe (listToMaybe) import System.IO import System.IO.Unsafe (unsafeInterleaveIO) data P a b = !a :*: !b first' :: (a -> b) -> P a c -> P b c first' f !(a :*: c) = f a :*: c {-# INLINE first' #-} uncurry' :: (a -> b -> c) -> P a b -> c uncurry' f (x :*: y) = f x y {-# INLINE uncurry' #-} --import Test.QuickCheck -- | An 'Mbox' is a list of 'MboxMessage' newtype Mbox s = Mbox { mboxMessages :: [MboxMessage s] } deriving (Eq, Ord, Show) -- | An 'MboxMessage' represent an mbox message, featuring -- the sender, the date-time, and the message body. data MboxMessage s = MboxMessage { _mboxMsgSender :: s , _mboxMsgTime :: s , _mboxMsgBody :: s , _mboxMsgFile :: FilePath , _mboxMsgOffset :: Int64 } deriving (Eq, Ord, Show) -- | Message's sender lens mboxMsgSender :: Functor f => (a -> f a) -> MboxMessage a -> f (MboxMessage a) mboxMsgSender f (MboxMessage s t b p o) = (\x -> MboxMessage x t b p o) <$> f s -- | Message's time lens mboxMsgTime :: Functor f => (a -> f a) -> MboxMessage a -> f (MboxMessage a) mboxMsgTime f (MboxMessage s t b p o) = (\x -> MboxMessage s x b p o) <$> f t -- | Message's body lens mboxMsgBody :: Functor f => (a -> f a) -> MboxMessage a -> f (MboxMessage a) mboxMsgBody f (MboxMessage s t b p o) = (\x -> MboxMessage s t x p o) <$> f b -- | First-class label to the file path of mbox's message mboxMsgFile :: Functor f => (FilePath -> f FilePath) -> MboxMessage a -> f (MboxMessage a) mboxMsgFile f (MboxMessage s t b p o) = (\x -> MboxMessage s t b x o) <$> f p -- | First-class label to the offset of the given message into the mbox mboxMsgOffset :: Functor f => (Int64-> f Int64) -> MboxMessage a -> f (MboxMessage a) mboxMsgOffset f (MboxMessage s t b p o) = (\x -> MboxMessage s t b p x) <$> f o readYear :: MboxMessage C.ByteString -> C.ByteString -> Int readYear m s = case reads $ C.unpack s of [(i, "")] -> i _ -> error ("readYear: badly formatted date (year) in " ++ show (C.unpack $ _mboxMsgTime m)) data Month = Jan | Feb | Mar | Apr | May | Jun | Jul | Aug | Sep | Oct | Nov | Dec deriving (Read, Show, Eq) readMonth :: MboxMessage C.ByteString -> C.ByteString -> Month readMonth m s = case reads $ C.unpack s of [(month, "")] -> month _ -> error ("readMonth: badly formatted date (month) in " ++ show (C.unpack $ _mboxMsgTime m)) msgYear :: MboxMessage C.ByteString -> Int msgYear m = readYear m . last . C.split ' ' . _mboxMsgTime $ m msgMonthYear :: MboxMessage C.ByteString -> (Month,Int) msgMonthYear m = case filter (not . C.null) . C.split ' ' . _mboxMsgTime $ m of [_wday, month, _mday, _hour, year] -> (readMonth m month, readYear m year) _ -> error ("msgMonthYear: badly formatted date in " ++ show (C.unpack $ _mboxMsgTime m)) nextFrom :: ByteString -> Maybe (Int64, ByteString, ByteString) nextFrom !orig = goNextFrom 0 orig where goNextFrom !count !input = do off <- (+1) <$> C.elemIndex '\n' input let (nls, i') = first C.length $ C.span (=='\n') $ C.drop off input if C.take 5 i' == bFrom then let off' = off + count + (nls - 1) in Just (off', C.take off' orig, C.drop 5 i') else goNextFrom (off + count + nls) i' -- TODO rules: -- - fromQuoting id == id -- - f . g == id ==> fromQuoting f . fromQuoting g == id -- but fromQuoting can fail on negative levels -- special case that works: -- n > 0 ==> fromQuoting ((-)n) . fromQuoting (+n) == id -- TODO performances: -- This ByteString fromQuoting is already quite fast, -- It still implies a x2 factor to the mbox-average-size tool, wether one -- enables it or not. -- Perhaps fusing nextFrom and fromQuoting could give more performances -- (but will decrease performances of mbox-counting). -- | @fromQuoting f s@ returns @s@ where the quoting level -- of From_ lines has been updated using the @f@ function. -- -- The From_ spefication, quoted from : -- -- @ -- \>From quoting ensures that the resulting -- lines are not From_ lines: the program prepends a \> to any -- From_ line, \>\From_ line, \>\>From_ line, \>\>\>From_ line, etc. -- @ fromQuoting :: (Int64 -> Int64) -> C.ByteString -> C.ByteString fromQuoting onLevel = C.tail . nextQuotedFrom . C.cons '\n' where nextQuotedFrom !orig = goNextQuotedFrom 0 orig where goNextQuotedFrom !count !input = case C.elemIndex '\n' input of Nothing -> -- TODO,NOTE: here I don't know what to do between the -- following code and just returning `orig' if C.null input then orig else orig `C.snoc` '\n' Just off -> let (!level, i') = first C.length $ C.span (=='>') $ C.drop (off + 1) input in if C.take 5 i' == bFrom then C.take (off + count) orig `C.append` mkQuotedFrom (onLevel level) `C.append` nextQuotedFrom (C.drop 5 i') else goNextQuotedFrom (off + level + count + 1) i' {- prop_fromQuotingInv (NonNegative n) s = s == fromQuoting (+(-n)) (fromQuoting (+n) s) prop_unparse_parse m = either (const False) (==m) $ safeParseMbox (showMbox m) prop_parse_unparse m = let s = showMbox m in Right s == (showMbox <$> safeParseMbox s) -- | Prefered xs: have the xs elements as favorites. newtype Prefered a = Prefered { unPrefered :: a } deriving ( Eq, Ord, Show, Read ) class Favorites a where favorites :: [a] instance Favorites Char where favorites = ">From \n" instance Arbitrary C.ByteString where arbitrary = arbitrary >>= return . C.pack . map unPrefered instance (Favorites a, Arbitrary a) => Arbitrary (Prefered a) where arbitrary = frequency [ (2, Prefered `fmap` oneof (map return favorites)) , (1, Prefered `fmap` arbitrary) ] shrink (Prefered x) = Prefered `fmap` shrink x --instance Arbitrary s => Arbitrary (MboxMessage s) where instance Arbitrary (MboxMessage ByteString) where arbitrary = MboxMessage (C.pack "S") (C.pack "D") <$> arbitrary -- TODO better sender,time shrink (MboxMessage x y z) = [ MboxMessage x' y' z' | (x', y', z') <- shrink (x, y, z) ] instance Arbitrary (Mbox ByteString) where --instance Arbitrary s => Arbitrary (Mbox s) where arbitrary = Mbox <$> arbitrary shrink (Mbox x) = Mbox <$> shrink x -} mkQuotedFrom :: Int64 -> C.ByteString mkQuotedFrom n | n < 0 = error "mkQuotedFrom: negative quoting" | otherwise = '\n' `C.cons` C.replicate n '>' `C.append` bFrom bFrom :: ByteString bFrom = C.pack "From " skipFirstFrom :: ByteString -> Either String ByteString skipFirstFrom xs | bFrom == C.take 5 xs = Right $ C.drop 5 xs | otherwise = Left "skipFirstFrom: badly formatted mbox: 'From ' expected at the beginning" -- | Same as 'parseMbox' but cat returns an error message. -- However only the line can cause an error message currently, so it's fine -- to dispatch on the either result. safeParseMbox :: FilePath -> Int64 -> ByteString -> Either String (Mbox ByteString) safeParseMbox fp offset s | C.null s = Right $ Mbox [] | otherwise = Mbox . map (uncurry' $ finishMboxMessageParsing fp) . splitMboxMessages offset <$> skipFirstFrom s -- | Turns a 'ByteString' into an 'Mbox' by splitting on From_ lines and -- unquoting the \'\>\*From\'s of the message. parseMbox :: ByteString -> Mbox ByteString parseMbox = either error id . safeParseMbox "" 0 splitMboxMessages :: Int64 -> ByteString -> [P Int64 ByteString] splitMboxMessages !offset !input = case nextFrom input of Nothing | C.null input -> [] | otherwise -> [(offset :*: input)] Just (!offset', !msg, rest) -> (offset :*: msg) : splitMboxMessages (6 + offset + offset') rest finishMboxMessageParsing :: FilePath -> Int64 -> ByteString -> MboxMessage ByteString finishMboxMessageParsing fp !offset !inp = MboxMessage sender time (fromQuoting pred body) fp offset where ((sender,time),body) = first (breakAt ' ') $ breakAt '\n' inp breakAt c = second (C.drop 1 {- a safe tail -}) . C.break (==c) -- | Turns an mbox into a 'ByteString' showMbox :: Mbox ByteString -> ByteString showMbox = C.intercalate (C.singleton '\n') . map showMboxMessage . mboxMessages -- | Returns an header line in mbox format given an mbox message. showMboxFromLine :: MboxMessage ByteString -> ByteString showMboxFromLine (MboxMessage sender time _ _ _) = C.append bFrom . C.append sender . C.cons ' ' . C.append time . C.cons '\n' $ C.empty -- | Returns a 'ByteString' given an mbox message. showMboxMessage :: MboxMessage ByteString -> ByteString showMboxMessage msg = showMboxFromLine msg `C.append` fromQuoting (+1) (_mboxMsgBody msg) -- lazyness at work! -- | Given a file handle and an offset, 'parseOneMboxMessage' returns -- the message a this offset. parseOneMboxMessage :: FilePath -> Handle -> Integer -> IO (MboxMessage C.ByteString) parseOneMboxMessage fp fh offset = do hSeek fh AbsoluteSeek offset s <- C.hGetContents fh a <- either fail return $ safeParseMbox fp (fromInteger offset) s (maybe (fail "parseOneMboxMessage: end of file") return . listToMaybe . mboxMessages) a readRevMboxFile :: FilePath -> IO (Mbox ByteString) readRevMboxFile fp = readRevMboxHandle fp =<< openFile fp ReadMode -- | @readRevMboxHandle fp h@ returns a reversed mbox for a file handle. -- The file handle is supposed to be in text mode, readable. readRevMboxHandle :: FilePath -> Handle -> IO (Mbox ByteString) readRevMboxHandle fp fh = do siz <- hFileSize fh readRevMbox fp siz <$> readHandleBackward mboxChunkSize siz fh -- buffering issues? readRevMbox :: FilePath -> Integer -> [ByteString] -> Mbox ByteString readRevMbox fp filesize chunks = Mbox $ go (fromInteger filesize+1) (filter (not . C.null) chunks) where go _ [] = [] go !siz (chunk1:cs) = case nextFrom chunk1 of Nothing -> kont siz cs chunk1 Just (!_backoffset, !msg, rest) -> let siz' = siz - C.length rest - 6 in (map finishmmp . reverse . map (first' (+siz')) . splitMboxMessages 0 $ rest) ++ kont siz' cs msg kont !_ [] = (:[]) . finishLast kont !siz (chunk2:cs2) = \k -> go siz (chunk2 `C.append` k : cs2) finishLast = finishMboxMessageParsing fp 0 . either (error . ("readRevMbox: impossible: " ++)) id . skipFirstFrom finishmmp = uncurry' $ finishMboxMessageParsing fp -- Not exported. -- -- | @readHandleBackward maxChunkSize size h@ lazily reads the @h@ file handle -- from the end. The file contents is returned as a reversed list of chunks. -- The result is such that if one apply @C.concat . reverse@ one get -- the in-order contents. {- propIO_read_anydir (Positive maxChunkSize) fh = do xs <- hGetContent fh siz <- hFileSize fh ys <- readHandleBackward maxChunkSize siz fh xs == C.concat (reverse ys) -} readHandleBackward :: Integer -> Integer -> Handle -> IO [ByteString] readHandleBackward maxChunkSize siz0 fh = go siz0 where go 0 = return [] go siz = unsafeInterleaveIO $ do let delta = min maxChunkSize siz siz' = siz - delta hSeek fh AbsoluteSeek siz' s <- C.hGet fh $ fromInteger delta (s :) <$> go siz' data Direction = Backward | Forward opposite :: Direction -> Direction opposite Forward = Backward opposite Backward = Forward -- | Returns a mbox given a direction (forward/backward) and a file path. parseMboxFile :: Direction -> FilePath -> IO (Mbox ByteString) parseMboxFile Forward fp = (either fail return =<<) . (safeParseMbox fp 0 <$>) . C.readFile $ fp parseMboxFile Backward fp = readRevMboxFile fp -- | Returns a mbox list given a direction (forward/backward) and a list of file path. -- Note that files are opened lazily. parseMboxFiles :: Direction -> [FilePath] -> IO [Mbox ByteString] parseMboxFiles Forward [] = (:[]) . parseMbox <$> C.getContents parseMboxFiles Backward [] = fail "reading backward on standard input does not make sense" parseMboxFiles dir xs = mapM (unsafeInterleaveIO . parseMboxFile dir) xs mboxChunkSize :: Integer mboxChunkSize = 10*oneMegabyte -- one megabyte in bytes oneMegabyte :: Integer oneMegabyte = 2 ^ (20 :: Int)