| 1 | {-# OPTIONS_GHC -XOverloadedStrings #-} |
|---|
| 2 | module Main where |
|---|
| 3 | |
|---|
| 4 | import System |
|---|
| 5 | import qualified System.IO as IO |
|---|
| 6 | import qualified System.Directory as IO (removeFile) |
|---|
| 7 | |
|---|
| 8 | import qualified Data.ByteString as BS |
|---|
| 9 | import qualified Data.ByteString.Internal as BS (w2c,c2w) |
|---|
| 10 | import qualified Data.String as S |
|---|
| 11 | |
|---|
| 12 | instance S.IsString BS.ByteString where |
|---|
| 13 | fromString = BS.pack . map BS.c2w |
|---|
| 14 | |
|---|
| 15 | -------------------------------------------------------------------------------- |
|---|
| 16 | -- Pipeable Streams (based on Get/Put continuations) |
|---|
| 17 | -- |
|---|
| 18 | -- lift f -- turn f into a stream by passing through f |
|---|
| 19 | -- |
|---|
| 20 | -- liftEither f -- wrap f with Either, passing Left through f and and Right |
|---|
| 21 | -- arround f (typically used for Left being normal operation |
|---|
| 22 | -- and Right being out-of-stream operation -- e.g., errors) |
|---|
| 23 | -- |
|---|
| 24 | -- liftMaybe f -- wrap f with Maybe, passing Just through f and Nothing around |
|---|
| 25 | -- f (typically used for end-of-stream operation) |
|---|
| 26 | -- |
|---|
| 27 | data Stream a b = Get (a -> Stream a b) |
|---|
| 28 | | Put b (Stream a b) |
|---|
| 29 | |
|---|
| 30 | pipe :: Stream a b -> Stream b c -> Stream a c |
|---|
| 31 | pipe f (Put y g) = Put y $ pipe f g |
|---|
| 32 | pipe (Put x f) (Get g) = pipe f $ g x |
|---|
| 33 | pipe (Get f) g = Get (\x -> pipe (f x) g) |
|---|
| 34 | |
|---|
| 35 | stack :: Int -> Stream a a -> Stream a a |
|---|
| 36 | stack n f | n>1 = f `pipe` stack (n-1) f |
|---|
| 37 | stack n f = f |
|---|
| 38 | |
|---|
| 39 | lift :: (a -> b) -> Stream a b |
|---|
| 40 | lift f = Get get |
|---|
| 41 | where --get :: a -> Stream a b |
|---|
| 42 | get x = Put (f x) $ Get get |
|---|
| 43 | |
|---|
| 44 | liftEither :: Stream a b -> Stream (Either a e) (Either b e) |
|---|
| 45 | liftEither (Get f) = Get get |
|---|
| 46 | where -- get :: Either a e -> Stream (Either a e) (Either b e) |
|---|
| 47 | get (Left x) = liftEither $ f x |
|---|
| 48 | get (Right e) = Put (Right e) $ Get get |
|---|
| 49 | liftEither (Put x f) = Put (Left x) $ liftEither f |
|---|
| 50 | |
|---|
| 51 | liftMaybe :: Stream a b -> Stream (Maybe a) (Maybe b) |
|---|
| 52 | liftMaybe (Get f) = Get get |
|---|
| 53 | where -- get :: Maybe a -> Stream (Maybe a) (Maybe b) |
|---|
| 54 | get (Just x) = liftMaybe $ f x |
|---|
| 55 | get Nothing = Put Nothing $ Get get |
|---|
| 56 | liftMaybe (Put x f) = Put (Just x) $ liftMaybe f |
|---|
| 57 | |
|---|
| 58 | |
|---|
| 59 | -------------------------------------------------------------------------------- |
|---|
| 60 | -- record |
|---|
| 61 | -- |
|---|
| 62 | -- Input ">Header 1\nABCDDFF\nAHDHFDSHE\n>Header 2\nAHSDHFF\nSDHJFD"... |
|---|
| 63 | -- Output Record "Header 1" "ABCDDFFAHDHFDSHE", |
|---|
| 64 | -- Record "Header 2" "AHSDHFFSDHJFD"... |
|---|
| 65 | -- |
|---|
| 66 | type Header = BS.ByteString |
|---|
| 67 | type Sequence = BS.ByteString |
|---|
| 68 | data Record = Record !Header !Sequence deriving Show |
|---|
| 69 | |
|---|
| 70 | record :: Stream (Maybe BS.ByteString) (Either (Maybe Record) BS.ByteString) |
|---|
| 71 | record = Get recordHeader |
|---|
| 72 | where recordHeader :: Maybe BS.ByteString -> Stream (Maybe BS.ByteString) (Either (Maybe Record) BS.ByteString) |
|---|
| 73 | recordHeader (Just l) | BS.head l == BS.c2w '>' = Get $ recordBody (BS.tail l) [] |
|---|
| 74 | recordHeader (Just l) = Put (Right $ BS.append "bad header line: " l) $ Get recordHeader |
|---|
| 75 | recordHeader Nothing = Put (Left Nothing) $ Get recordHeader |
|---|
| 76 | recordBody :: Header -> [Sequence] -> Maybe BS.ByteString -> Stream (Maybe BS.ByteString) (Either (Maybe Record) BS.ByteString) |
|---|
| 77 | recordBody h ls (Just l) | BS.head l /= BS.c2w '>' = Get $ recordBody h (l:ls) |
|---|
| 78 | recordBody h [] (Just l) = Put (Right $ BS.append "no body for: " h) $ recordHeader $ Just l |
|---|
| 79 | recordBody h ls (Just l) = Put (Left $ Just $ Record h $ BS.concat $ reverse ls) $ recordHeader $ Just l |
|---|
| 80 | recordBody h [] Nothing = Put (Right $ BS.append "no body for: " h) $ Put (Left Nothing) $ Get recordHeader |
|---|
| 81 | recordBody h ls Nothing = Put (Left $ Just $ Record h $ BS.concat $ reverse ls) $ Put (Left Nothing) $ Get recordHeader |
|---|
| 82 | |
|---|
| 83 | -------------------------------------------------------------------------------- |
|---|
| 84 | -- divide n |
|---|
| 85 | -- |
|---|
| 86 | -- Input Record "Header 1" "ABCDEF", ... |
|---|
| 87 | -- Output "ABC","BCD, "CDE", "DEF", ... |
|---|
| 88 | -- |
|---|
| 89 | -- Where n is the length of the bits (n=3 above) |
|---|
| 90 | -- |
|---|
| 91 | divide :: Int -> Stream Record BS.ByteString |
|---|
| 92 | divide n = Get extract |
|---|
| 93 | where extract :: Record -> Stream Record BS.ByteString |
|---|
| 94 | extract (Record h l) | BS.length l >= n = Put (BS.take n l) $ extract $ Record h $ BS.tail l |
|---|
| 95 | extract (Record _ l) = Get extract |
|---|
| 96 | |
|---|
| 97 | -------------------------------------------------------------------------------- |
|---|
| 98 | -- group n |
|---|
| 99 | -- |
|---|
| 100 | -- Input "ABC", "DEF", "BCD", "ABC", .... |
|---|
| 101 | -- Output ["ABC","DEF"], ["BCD","ABC"], ... |
|---|
| 102 | -- |
|---|
| 103 | -- Where n is the group size (n=2 above) |
|---|
| 104 | -- |
|---|
| 105 | group :: Int -> Stream (Maybe a) (Maybe [a]) |
|---|
| 106 | group n = Get $ group [] n |
|---|
| 107 | where group :: [a] -> Int -> Maybe a -> Stream (Maybe a) (Maybe [a]) |
|---|
| 108 | group xs n (Just x) | n > 1 = Get $ group (x:xs) (n-1) |
|---|
| 109 | group xs _ (Just x) = Put (Just $ reverse $ x:xs) $ Get $ group [] n |
|---|
| 110 | group [] _ Nothing = Put Nothing $ Get $ group [] n |
|---|
| 111 | group xs _ Nothing = Put (Just $ reverse xs) $ Put Nothing $ Get $ group [] n |
|---|
| 112 | |
|---|
| 113 | -------------------------------------------------------------------------------- |
|---|
| 114 | -- process s d |
|---|
| 115 | -- |
|---|
| 116 | -- Input ">Header 1\nABCDDFF\nAHDHFDSHE\n>Header 2\nAHSDHFF\nSDHJFD"... |
|---|
| 117 | -- Output [("ABC,1),("BCD",1),("CDD",1),("DDF",1)], |
|---|
| 118 | -- [("AHD,1),("DFF",1),("DHF",1),("HDH",1)],... |
|---|
| 119 | -- |
|---|
| 120 | -- Where s is length of the bits (s=3 above) and 2^d is the blocking size |
|---|
| 121 | -- (n=2 above) |
|---|
| 122 | -- |
|---|
| 123 | process :: Int -> Int -> Stream (Maybe BS.ByteString) (Either (Maybe [BS.ByteString]) BS.ByteString) |
|---|
| 124 | process s d = record `pipe` (liftEither $ (liftMaybe $ divide s) `pipe` group d) |
|---|
| 125 | -- where wrap :: Stream BS.ByteString [BS.ByteString] |
|---|
| 126 | -- wrap = lift (\x -> [x]) |
|---|
| 127 | |
|---|
| 128 | |
|---|
| 129 | -------------------------------------------------------------------------------- |
|---|
| 130 | -- main loop |
|---|
| 131 | -- |
|---|
| 132 | -- Setup the above process stream according to command-line paramaters, pass |
|---|
| 133 | -- the contents of the file through it (a line at a time), and write each |
|---|
| 134 | -- result (a collection of processed elements) out to seperate temporary file. |
|---|
| 135 | --- |
|---|
| 136 | main = do IO.hSetBuffering IO.stdout IO.LineBuffering |
|---|
| 137 | args <- getArgs :: IO [String] |
|---|
| 138 | if length args /= 3 then do prog <- getProgName |
|---|
| 139 | fail $ "command line: " ++ prog ++ " input size depth" |
|---|
| 140 | else return () |
|---|
| 141 | input <- return $ args !! 0 :: IO String |
|---|
| 142 | size <- catch (read $ args !! 1) $ \_ -> fail $ "size should be an integer not " ++ (show $ args !! 1) :: IO Int |
|---|
| 143 | depth <- catch (read $ args !! 2) $ \_ -> fail $ "depth should be an integer not " ++ (show $ args !! 2) :: IO Int |
|---|
| 144 | file <- IO.openFile input IO.ReadMode :: IO IO.Handle |
|---|
| 145 | hdls <- chew file [] $ process size depth :: IO [IO.Handle] |
|---|
| 146 | mapM_ IO.hClose hdls |
|---|
| 147 | IO.hClose file |
|---|
| 148 | where read :: (Read a) => String -> IO a |
|---|
| 149 | read xs = case [ x | (x,xs') <- reads xs, ("","") <- lex xs' ] of |
|---|
| 150 | [x] -> return x |
|---|
| 151 | _ -> fail "read: ambiguous parse" |
|---|
| 152 | chew :: IO.Handle -> [IO.Handle] -> Stream (Maybe BS.ByteString) (Either (Maybe [BS.ByteString]) BS.ByteString) -> IO [IO.Handle] |
|---|
| 153 | chew h hs (Put (Left (Just xs)) f) = do IO.hTell h >>= \p -> putStr $ "Starting new sorted block at " ++ show p ++ " (" |
|---|
| 154 | h' <- IO.openTempFile "/tmp" "PChunk.tmp" >>= \(nm,h) -> putStrLn (nm ++ ")") >> return h |
|---|
| 155 | mapM_ (BS.hPutStrLn h') xs |
|---|
| 156 | chew h (h':hs) f |
|---|
| 157 | chew h hs (Put (Left Nothing) f) = mapM (\h -> IO.hSeek h IO.AbsoluteSeek 0 >> return h) hs |
|---|
| 158 | chew h hs (Put (Right m) f) = do BS.hPutStrLn IO.stderr $ BS.append "**WARNING** " m |
|---|
| 159 | chew h hs f |
|---|
| 160 | chew h hs (Get f) = do eof <- IO.hIsEOF h |
|---|
| 161 | if eof |
|---|
| 162 | then chew h hs $ f Nothing |
|---|
| 163 | else do l <- BS.hGetLine h |
|---|
| 164 | chew h hs $ f $ Just l |
|---|