{-# LANGUAGE OverloadedStrings #-}

{-| A module to read and write allele sharing histograms, as defined here:
<https://rarecoal-docs.readthedocs.io/en/latest/rarecoal.html#histogram-files>
-}

module SequenceFormats.RareAlleleHistogram (RareAlleleHistogram(..), readHistogramFromHandle,
                            SitePattern, readHistogram, writeHistogramStdOut, writeHistogramFile, showSitePattern) where

import SequenceFormats.Utils (SeqFormatException(..))

import Control.Applicative (optional)
import Control.Error (assertErr)
import Control.Exception (throw)
import Control.Monad.IO.Class (MonadIO, liftIO)
import Control.Monad.Trans.State.Strict (evalStateT)
import qualified Data.Attoparsec.ByteString.Char8 as A
import Data.Char (isAlphaNum)
import Data.Int (Int64)
import Data.List (intercalate, sortBy)
import qualified Data.Map.Strict as Map
import qualified Data.ByteString.Char8 as B
import Pipes.Attoparsec (parse)
import qualified Pipes.ByteString as PB
import System.IO (Handle, IOMode(..), withFile)

-- |A datatype to represent an Allele Sharing Histogram:
data RareAlleleHistogram = RareAlleleHistogram {
    RareAlleleHistogram -> [[Char]]
raNames :: [String], -- ^A list of branch names
    RareAlleleHistogram -> [Int]
raNVec :: [Int], -- ^A list of haploid sample sizes.
    RareAlleleHistogram -> Int
raMinAf :: Int, -- ^The minimum allele count
    RareAlleleHistogram -> Int
raMaxAf :: Int, -- ^The maximum allele count
    RareAlleleHistogram -> [Int]
raConditionOn :: [Int], -- ^A list of branch indices that were used to condition the allele 
                            --sharing pattern
    RareAlleleHistogram -> [[Int]]
raExcludePatterns :: [SitePattern], -- ^A list of patterns that are excluded.
    RareAlleleHistogram -> Int64
raTotalNrSites :: Int64, -- ^The total number of non-missing sites in the genome.
    RareAlleleHistogram -> Map [Int] Int64
raCounts :: Map.Map SitePattern Int64, -- ^The actual data, a dictionary from allele sharing patterns to observed numbers.
    RareAlleleHistogram -> Maybe (Map [Int] (Double, Double))
raJackknifeEstimates :: Maybe (Map.Map SitePattern (Double, Double)) -- ^An optional dictionary that contains Jackknife estimates and standard deviations for each pattern frequency.
} deriving (RareAlleleHistogram -> RareAlleleHistogram -> Bool
forall a. (a -> a -> Bool) -> (a -> a -> Bool) -> Eq a
/= :: RareAlleleHistogram -> RareAlleleHistogram -> Bool
$c/= :: RareAlleleHistogram -> RareAlleleHistogram -> Bool
== :: RareAlleleHistogram -> RareAlleleHistogram -> Bool
$c== :: RareAlleleHistogram -> RareAlleleHistogram -> Bool
Eq, Int -> RareAlleleHistogram -> ShowS
[RareAlleleHistogram] -> ShowS
RareAlleleHistogram -> [Char]
forall a.
(Int -> a -> ShowS) -> (a -> [Char]) -> ([a] -> ShowS) -> Show a
showList :: [RareAlleleHistogram] -> ShowS
$cshowList :: [RareAlleleHistogram] -> ShowS
show :: RareAlleleHistogram -> [Char]
$cshow :: RareAlleleHistogram -> [Char]
showsPrec :: Int -> RareAlleleHistogram -> ShowS
$cshowsPrec :: Int -> RareAlleleHistogram -> ShowS
Show)

-- |A simple type synonym for the SitePattern, represented as a list of Integers that represents 
-- each pattern across the branches.
type SitePattern = [Int]

-- |A simple function to convert a pattern into a String.
showSitePattern :: SitePattern -> String
showSitePattern :: [Int] -> [Char]
showSitePattern = forall a. [a] -> [[a]] -> [a]
intercalate [Char]
"," forall b c a. (b -> c) -> (a -> b) -> a -> c
. forall a b. (a -> b) -> [a] -> [b]
map forall a. Show a => a -> [Char]
show

-- |Function to convert a Rare Allele Histogram to text. Returns an error if attempting to print a 
-- histogram with non-standard settings. Many settings, such as minAf>1, are only meant for 
-- in-memory representations, but are not compatible with the file format itself.
showHistogram :: RareAlleleHistogram -> Either String B.ByteString
showHistogram :: RareAlleleHistogram -> Either [Char] ByteString
showHistogram RareAlleleHistogram
hist = do
    forall e. e -> Bool -> Either e ()
assertErr [Char]
"can only print histogram with minAf=1 due to format-legacy" forall a b. (a -> b) -> a -> b
$ RareAlleleHistogram -> Int
raMinAf RareAlleleHistogram
hist forall a. Eq a => a -> a -> Bool
== Int
1
    forall e. e -> Bool -> Either e ()
assertErr [Char]
"can only print histogram with no conditioning due to format-legacy" forall a b. (a -> b) -> a -> b
$
        forall (t :: * -> *) a. Foldable t => t a -> Bool
null (RareAlleleHistogram -> [Int]
raConditionOn RareAlleleHistogram
hist)
    forall e. e -> Bool -> Either e ()
assertErr [Char]
"can only print histogram with no exclude pattern due to format-legacy" forall a b. (a -> b) -> a -> b
$
        forall (t :: * -> *) a. Foldable t => t a -> Bool
null (RareAlleleHistogram -> [[Int]]
raExcludePatterns RareAlleleHistogram
hist)
    let head0 :: ByteString
head0 = ByteString
"NAMES=" forall a. Semigroup a => a -> a -> a
<> (ByteString -> [ByteString] -> ByteString
B.intercalate ByteString
"," forall b c a. (b -> c) -> (a -> b) -> a -> c
. forall a b. (a -> b) -> [a] -> [b]
map [Char] -> ByteString
B.pack forall b c a. (b -> c) -> (a -> b) -> a -> c
. RareAlleleHistogram -> [[Char]]
raNames forall a b. (a -> b) -> a -> b
$ RareAlleleHistogram
hist)
        head1 :: ByteString
head1 = ByteString
"N=" forall a. Semigroup a => a -> a -> a
<> ([Char] -> ByteString
B.pack forall b c a. (b -> c) -> (a -> b) -> a -> c
. forall a. [a] -> [[a]] -> [a]
intercalate [Char]
"," forall b c a. (b -> c) -> (a -> b) -> a -> c
. forall a b. (a -> b) -> [a] -> [b]
map forall a. Show a => a -> [Char]
show forall b c a. (b -> c) -> (a -> b) -> a -> c
. RareAlleleHistogram -> [Int]
raNVec forall a b. (a -> b) -> a -> b
$ RareAlleleHistogram
hist)
        head2 :: ByteString
head2 = ByteString
"MAX_M=" forall a. Semigroup a => a -> a -> a
<> ([Char] -> ByteString
B.pack forall b c a. (b -> c) -> (a -> b) -> a -> c
. forall a. Show a => a -> [Char]
show forall b c a. (b -> c) -> (a -> b) -> a -> c
. RareAlleleHistogram -> Int
raMaxAf forall a b. (a -> b) -> a -> b
$ RareAlleleHistogram
hist)
        head3 :: ByteString
head3 = ByteString
"TOTAL_SITES=" forall a. Semigroup a => a -> a -> a
<> ([Char] -> ByteString
B.pack forall b c a. (b -> c) -> (a -> b) -> a -> c
. forall a. Show a => a -> [Char]
show forall b c a. (b -> c) -> (a -> b) -> a -> c
. RareAlleleHistogram -> Int64
raTotalNrSites forall a b. (a -> b) -> a -> b
$ RareAlleleHistogram
hist)
        body :: [ByteString]
body = do
            ([Int]
k, Int64
v) <- [([Int], Int64)]
sorted
            case RareAlleleHistogram -> Maybe (Map [Int] (Double, Double))
raJackknifeEstimates RareAlleleHistogram
hist of
                Maybe (Map [Int] (Double, Double))
Nothing -> [ByteString -> [ByteString] -> ByteString
B.intercalate ByteString
" " [[Char] -> ByteString
B.pack forall b c a. (b -> c) -> (a -> b) -> a -> c
. [Int] -> [Char]
showSitePattern forall a b. (a -> b) -> a -> b
$ [Int]
k, [Char] -> ByteString
B.pack forall b c a. (b -> c) -> (a -> b) -> a -> c
. forall a. Show a => a -> [Char]
show forall a b. (a -> b) -> a -> b
$ Int64
v]]
                Just Map [Int] (Double, Double)
jkHist -> do
                    let Just (Double
jkMean, Double
jkSE) = [Int]
k forall k a. Ord k => k -> Map k a -> Maybe a
`Map.lookup` Map [Int] (Double, Double)
jkHist
                    forall (m :: * -> *) a. Monad m => a -> m a
return forall a b. (a -> b) -> a -> b
$ ByteString -> [ByteString] -> ByteString
B.intercalate ByteString
" " [[Char] -> ByteString
B.pack forall b c a. (b -> c) -> (a -> b) -> a -> c
. [Int] -> [Char]
showSitePattern forall a b. (a -> b) -> a -> b
$ [Int]
k, [Char] -> ByteString
B.pack forall b c a. (b -> c) -> (a -> b) -> a -> c
. forall a. Show a => a -> [Char]
show forall a b. (a -> b) -> a -> b
$ Int64
v,
                                                [Char] -> ByteString
B.pack forall b c a. (b -> c) -> (a -> b) -> a -> c
. forall a. Show a => a -> [Char]
show forall a b. (a -> b) -> a -> b
$ Double
jkMean, [Char] -> ByteString
B.pack forall b c a. (b -> c) -> (a -> b) -> a -> c
. forall a. Show a => a -> [Char]
show forall a b. (a -> b) -> a -> b
$ Double
jkSE]
    forall (m :: * -> *) a. Monad m => a -> m a
return forall a b. (a -> b) -> a -> b
$ [ByteString] -> ByteString
B.unlines (ByteString
head0forall a. a -> [a] -> [a]
:ByteString
head1forall a. a -> [a] -> [a]
:ByteString
head2forall a. a -> [a] -> [a]
:ByteString
head3forall a. a -> [a] -> [a]
:[ByteString]
body)
  where
    sorted :: [([Int], Int64)]
sorted = forall a. (a -> a -> Ordering) -> [a] -> [a]
sortBy (\([Int]
_, Int64
v1) ([Int]
_, Int64
v2)  -> forall a. Ord a => a -> a -> Ordering
compare Int64
v2 Int64
v1) forall a b. (a -> b) -> a -> b
$ forall k a. Map k a -> [(k, a)]
Map.toList (RareAlleleHistogram -> Map [Int] Int64
raCounts RareAlleleHistogram
hist)

-- |Write a histogram to the stdout
writeHistogramStdOut :: (MonadIO m) => RareAlleleHistogram -> m ()
writeHistogramStdOut :: forall (m :: * -> *). MonadIO m => RareAlleleHistogram -> m ()
writeHistogramStdOut RareAlleleHistogram
hist =
    case RareAlleleHistogram -> Either [Char] ByteString
showHistogram RareAlleleHistogram
hist of
        Left [Char]
err -> forall a e. Exception e => e -> a
throw ([Char] -> SeqFormatException
SeqFormatException [Char]
err)
        Right ByteString
outStr -> forall (m :: * -> *) a. MonadIO m => IO a -> m a
liftIO forall a b. (a -> b) -> a -> b
$ ByteString -> IO ()
B.putStrLn ByteString
outStr

-- |Write a histogram to a file
writeHistogramFile :: (MonadIO m) => FilePath -> RareAlleleHistogram -> m ()
writeHistogramFile :: forall (m :: * -> *).
MonadIO m =>
[Char] -> RareAlleleHistogram -> m ()
writeHistogramFile [Char]
outF RareAlleleHistogram
hist =
    case RareAlleleHistogram -> Either [Char] ByteString
showHistogram RareAlleleHistogram
hist of
        Left [Char]
err -> forall a e. Exception e => e -> a
throw ([Char] -> SeqFormatException
SeqFormatException [Char]
err)
        Right ByteString
outStr -> forall (m :: * -> *) a. MonadIO m => IO a -> m a
liftIO forall a b. (a -> b) -> a -> b
$ [Char] -> ByteString -> IO ()
B.writeFile [Char]
outF ByteString
outStr
    
-- |Read a histogram from a FilePath
readHistogram :: (MonadIO m) => FilePath -> m RareAlleleHistogram
readHistogram :: forall (m :: * -> *). MonadIO m => [Char] -> m RareAlleleHistogram
readHistogram [Char]
path = forall (m :: * -> *) a. MonadIO m => IO a -> m a
liftIO forall a b. (a -> b) -> a -> b
$ forall r. [Char] -> IOMode -> (Handle -> IO r) -> IO r
withFile [Char]
path IOMode
ReadMode forall (m :: * -> *). MonadIO m => Handle -> m RareAlleleHistogram
readHistogramFromHandle

-- |Read a histogram from a File Handle.
readHistogramFromHandle :: (MonadIO m) => Handle -> m RareAlleleHistogram
readHistogramFromHandle :: forall (m :: * -> *). MonadIO m => Handle -> m RareAlleleHistogram
readHistogramFromHandle Handle
handle = do
    Maybe (Either ParsingError RareAlleleHistogram)
res <- forall (m :: * -> *) s a. Monad m => StateT s m a -> s -> m a
evalStateT (forall (m :: * -> *) a b.
(Monad m, ParserInput a) =>
Parser a b -> Parser a m (Maybe (Either ParsingError b))
parse Parser RareAlleleHistogram
parseHistogram) (forall (m :: * -> *).
MonadIO m =>
Handle -> Producer' ByteString m ()
PB.fromHandle Handle
handle)
    case Maybe (Either ParsingError RareAlleleHistogram)
res of
        Maybe (Either ParsingError RareAlleleHistogram)
Nothing -> forall a e. Exception e => e -> a
throw ([Char] -> SeqFormatException
SeqFormatException [Char]
"histogram file exhausted too early")
        Just (Left ParsingError
err) -> forall a e. Exception e => e -> a
throw ([Char] -> SeqFormatException
SeqFormatException ([Char]
"Histogram parsing error: " forall a. Semigroup a => a -> a -> a
<> forall a. Show a => a -> [Char]
show ParsingError
err))
        Just (Right RareAlleleHistogram
hist) -> forall (m :: * -> *) a. Monad m => a -> m a
return RareAlleleHistogram
hist

parseHistogram :: A.Parser RareAlleleHistogram
parseHistogram :: Parser RareAlleleHistogram
parseHistogram = do
    [ByteString]
names <- Parser ByteString [ByteString]
parseNames
    [Int]
nVec <- Parser ByteString [Int]
parseNVec
    Int
maxM <- Parser ByteString Int
parseMaxM
    Int64
totalNrSites <- Parser ByteString Int64
parseTotalNrSites
    [([Int], Int64, Maybe (Double, Double))]
body <- Parser [([Int], Int64, Maybe (Double, Double))]
parseBody
    let countHist :: Map [Int] Int64
countHist = forall k a. Ord k => [(k, a)] -> Map k a
Map.fromList forall a b. (a -> b) -> a -> b
$ [([Int]
k, Int64
c) | ([Int]
k, Int64
c, Maybe (Double, Double)
_) <- [([Int], Int64, Maybe (Double, Double))]
body]
        jkHist :: Maybe (Map [Int] (Double, Double))
jkHist = case forall a. [a] -> a
head [([Int], Int64, Maybe (Double, Double))]
body of
            ([Int]
_, Int64
_, Just (Double, Double)
_) -> forall a. a -> Maybe a
Just forall b c a. (b -> c) -> (a -> b) -> a -> c
. forall k a. Ord k => [(k, a)] -> Map k a
Map.fromList forall a b. (a -> b) -> a -> b
$ [([Int]
k, (Double
jkMean, Double
jkSE)) |
                                                     ([Int]
k, Int64
_, Just (Double
jkMean, Double
jkSE)) <- [([Int], Int64, Maybe (Double, Double))]
body]
            ([Int], Int64, Maybe (Double, Double))
_ -> forall a. Maybe a
Nothing
    forall (m :: * -> *) a. Monad m => a -> m a
return forall a b. (a -> b) -> a -> b
$ [[Char]]
-> [Int]
-> Int
-> Int
-> [Int]
-> [[Int]]
-> Int64
-> Map [Int] Int64
-> Maybe (Map [Int] (Double, Double))
-> RareAlleleHistogram
RareAlleleHistogram (forall a b. (a -> b) -> [a] -> [b]
map ByteString -> [Char]
B.unpack [ByteString]
names) [Int]
nVec Int
1 Int
maxM [][] Int64
totalNrSites Map [Int] Int64
countHist Maybe (Map [Int] (Double, Double))
jkHist
  where
    parseNames :: Parser ByteString [ByteString]
parseNames = ByteString -> Parser ByteString
A.string ByteString
"NAMES=" forall (f :: * -> *) a b. Applicative f => f a -> f b -> f b
*> Parser ByteString
name forall (f :: * -> *) a s. Alternative f => f a -> f s -> f [a]
`A.sepBy1` Char -> Parser Char
A.char Char
',' forall (f :: * -> *) a b. Applicative f => f a -> f b -> f a
<* Parser ()
A.endOfLine
    name :: Parser ByteString
name = (Char -> Bool) -> Parser ByteString
A.takeWhile1 (\Char
c -> Char -> Bool
isAlphaNum Char
c Bool -> Bool -> Bool
|| Char
c forall a. Eq a => a -> a -> Bool
== Char
'_' Bool -> Bool -> Bool
|| Char
c forall a. Eq a => a -> a -> Bool
== Char
'-')
    parseNVec :: Parser ByteString [Int]
parseNVec = ByteString -> Parser ByteString
A.string ByteString
"N=" forall (f :: * -> *) a b. Applicative f => f a -> f b -> f b
*> forall a. Integral a => Parser a
A.decimal forall (f :: * -> *) a s. Alternative f => f a -> f s -> f [a]
`A.sepBy1` Char -> Parser Char
A.char Char
',' forall (f :: * -> *) a b. Applicative f => f a -> f b -> f a
<* Parser ()
A.endOfLine
    parseMaxM :: Parser ByteString Int
parseMaxM = ByteString -> Parser ByteString
A.string ByteString
"MAX_M=" forall (f :: * -> *) a b. Applicative f => f a -> f b -> f b
*> forall a. Integral a => Parser a
A.decimal forall (f :: * -> *) a b. Applicative f => f a -> f b -> f a
<* Parser ()
A.endOfLine
    parseTotalNrSites :: Parser ByteString Int64
parseTotalNrSites = ByteString -> Parser ByteString
A.string ByteString
"TOTAL_SITES=" forall (f :: * -> *) a b. Applicative f => f a -> f b -> f b
*> forall a. Integral a => Parser a
A.decimal forall (f :: * -> *) a b. Applicative f => f a -> f b -> f a
<* Parser ()
A.endOfLine

parseBody :: A.Parser [(SitePattern, Int64, Maybe (Double, Double))]
parseBody :: Parser [([Int], Int64, Maybe (Double, Double))]
parseBody = forall (f :: * -> *) a. Alternative f => f a -> f [a]
A.many1 Parser ByteString ([Int], Int64, Maybe (Double, Double))
patternLine
  where
    patternLine :: Parser ByteString ([Int], Int64, Maybe (Double, Double))
patternLine = (,,) forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> Parser ByteString [Int]
parsePattern forall (f :: * -> *) a b. Applicative f => f a -> f b -> f a
<* Parser Char
A.space forall (f :: * -> *) a b. Applicative f => f (a -> b) -> f a -> f b
<*> Parser ByteString Int64
parseLargeInt forall (f :: * -> *) a b. Applicative f => f (a -> b) -> f a -> f b
<*> forall (f :: * -> *) a. Alternative f => f a -> f (Maybe a)
optional Parser ByteString (Double, Double)
parseJackknife forall (f :: * -> *) a b. Applicative f => f a -> f b -> f a
<*
        Parser ()
A.endOfLine
    parsePattern :: Parser ByteString [Int]
parsePattern = forall a. Integral a => Parser a
A.decimal forall (f :: * -> *) a s. Alternative f => f a -> f s -> f [a]
`A.sepBy1` Char -> Parser Char
A.char Char
','
    parseLargeInt :: Parser ByteString Int64
parseLargeInt = forall a. Read a => [Char] -> a
read forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> forall (f :: * -> *) a. Alternative f => f a -> f [a]
A.many1 Parser Char
A.digit
    parseJackknife :: Parser ByteString (Double, Double)
parseJackknife = (,) forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> (Parser Char
A.space forall (f :: * -> *) a b. Applicative f => f a -> f b -> f b
*> Parser ByteString Double
A.double) forall (f :: * -> *) a b. Applicative f => f a -> f b -> f a
<* Parser Char
A.space forall (f :: * -> *) a b. Applicative f => f (a -> b) -> f a -> f b
<*> Parser ByteString Double
A.double