{-# LANGUAGE OverloadedStrings #-}
module SequenceFormats.RareAlleleHistogram (RareAlleleHistogram(..), readHistogramFromHandle,
SitePattern, readHistogram, writeHistogramStdOut, writeHistogramFile, showSitePattern) where
import SequenceFormats.Utils (SeqFormatException(..))
import Control.Applicative (optional)
import Control.Error (assertErr)
import Control.Exception (throw)
import Control.Monad.IO.Class (MonadIO, liftIO)
import Control.Monad.Trans.State.Strict (evalStateT)
import qualified Data.Attoparsec.ByteString.Char8 as A
import Data.Char (isAlphaNum)
import Data.Int (Int64)
import Data.List (intercalate, sortBy)
import qualified Data.Map.Strict as Map
import qualified Data.ByteString.Char8 as B
import Pipes.Attoparsec (parse)
import qualified Pipes.ByteString as PB
import System.IO (Handle, IOMode(..), withFile)
data RareAlleleHistogram = RareAlleleHistogram {
raNames :: [String],
raNVec :: [Int],
raMinAf :: Int,
raMaxAf :: Int,
raConditionOn :: [Int],
raExcludePatterns :: [SitePattern],
raTotalNrSites :: Int64,
raCounts :: Map.Map SitePattern Int64,
raJackknifeEstimates :: Maybe (Map.Map SitePattern (Double, Double))
} deriving (Eq, Show)
type SitePattern = [Int]
showSitePattern :: SitePattern -> String
showSitePattern = intercalate "," . map show
showHistogram :: RareAlleleHistogram -> Either String B.ByteString
showHistogram hist = do
assertErr "can only print histogram with minAf=1 due to format-legacy" $ raMinAf hist == 1
assertErr "can only print histogram with no conditioning due to format-legacy" $
null (raConditionOn hist)
assertErr "can only print histogram with no exclude pattern due to format-legacy" $
null (raExcludePatterns hist)
let head0 = "NAMES=" <> (B.intercalate "," . map B.pack . raNames $ hist)
head1 = "N=" <> (B.pack . intercalate "," . map show . raNVec $ hist)
head2 = "MAX_M=" <> (B.pack . show . raMaxAf $ hist)
head3 = "TOTAL_SITES=" <> (B.pack . show . raTotalNrSites $ hist)
body = do
(k, v) <- sorted
case raJackknifeEstimates hist of
Nothing -> [B.intercalate " " [B.pack . showSitePattern $ k, B.pack . show $ v]]
Just jkHist -> do
let Just (jkMean, jkSE) = k `Map.lookup` jkHist
return $ B.intercalate " " [B.pack . showSitePattern $ k, B.pack . show $ v,
B.pack . show $ jkMean, B.pack . show $ jkSE]
return $ B.unlines (head0:head1:head2:head3:body)
where
sorted = sortBy (\(_, v1) (_, v2) -> compare v2 v1) $ Map.toList (raCounts hist)
writeHistogramStdOut :: (MonadIO m) => RareAlleleHistogram -> m ()
writeHistogramStdOut hist =
case showHistogram hist of
Left err -> throw (SeqFormatException err)
Right outStr -> liftIO $ B.putStrLn outStr
writeHistogramFile :: (MonadIO m) => FilePath -> RareAlleleHistogram -> m ()
writeHistogramFile outF hist =
case showHistogram hist of
Left err -> throw (SeqFormatException err)
Right outStr -> liftIO $ B.writeFile outF outStr
readHistogram :: (MonadIO m) => FilePath -> m RareAlleleHistogram
readHistogram path = liftIO $ withFile path ReadMode readHistogramFromHandle
readHistogramFromHandle :: (MonadIO m) => Handle -> m RareAlleleHistogram
readHistogramFromHandle handle = do
res <- evalStateT (parse parseHistogram) . PB.fromHandle $ handle
case res of
Nothing -> throw (SeqFormatException "histogram file exhausted too early")
Just (Left err) -> throw (SeqFormatException ("Histogram parsing error: " <> show err))
Just (Right hist) -> return hist
parseHistogram :: A.Parser RareAlleleHistogram
parseHistogram = do
names <- parseNames
nVec <- parseNVec
maxM <- parseMaxM
totalNrSites <- parseTotalNrSites
body <- parseBody
let countHist = Map.fromList $ [(k, c) | (k, c, _) <- body]
jkHist = case head body of
(_, _, Just _) -> Just . Map.fromList $ [(k, (jkMean, jkSE)) |
(k, _, Just (jkMean, jkSE)) <- body]
_ -> Nothing
return $ RareAlleleHistogram (map B.unpack names) nVec 1 maxM [][] totalNrSites countHist jkHist
where
parseNames = A.string "NAMES=" *> name `A.sepBy1` A.char ',' <* A.endOfLine
name = A.takeWhile1 (\c -> isAlphaNum c || c == '_' || c == '-')
parseNVec = A.string "N=" *> A.decimal `A.sepBy1` A.char ',' <* A.endOfLine
parseMaxM = A.string "MAX_M=" *> A.decimal <* A.endOfLine
parseTotalNrSites = A.string "TOTAL_SITES=" *> A.decimal <* A.endOfLine
parseBody :: A.Parser [(SitePattern, Int64, Maybe (Double, Double))]
parseBody = A.many1 patternLine
where
patternLine = (,,) <$> parsePattern <* A.space <*> parseLargeInt <*> optional parseJackknife <*
A.endOfLine
parsePattern = A.decimal `A.sepBy1` A.char ','
parseLargeInt = read <$> A.many1 A.digit
parseJackknife = (,) <$> (A.space *> A.double) <* A.space <*> A.double