{-# LANGUAGE OverloadedStrings #-}
module SequenceFormats.Eigenstrat (EigenstratSnpEntry(..), EigenstratIndEntry(..),
readEigenstratInd, GenoEntry(..), GenoLine, Sex(..),
readEigenstratSnpStdIn, readEigenstratSnpFile,
readEigenstrat, writeEigenstrat, writeEigenstratIndFile, writeEigenstratSnp,
writeEigenstratGeno) where
import SequenceFormats.Utils (Chrom (..),
SeqFormatException (..),
consumeProducer,
readFileProd, word)
import Control.Applicative ((<|>))
import Control.Exception (throw)
import Control.Monad (forM_, void)
import Control.Monad.Catch (MonadThrow)
import Control.Monad.IO.Class (MonadIO, liftIO)
import qualified Data.Attoparsec.ByteString.Char8 as A
import qualified Data.ByteString.Char8 as B
import Data.Vector (Vector, fromList, toList)
import Pipes (Consumer, Pipe, Producer,
cat, for, yield, (>->))
import qualified Pipes.ByteString as PB
import qualified Pipes.Prelude as P
import Pipes.Safe (MonadSafe)
import qualified Pipes.Safe.Prelude as PS
import System.IO (Handle, IOMode (..),
hPutStrLn, withFile)
data EigenstratSnpEntry = EigenstratSnpEntry
{ snpChrom :: Chrom
, snpPos :: Int
, snpGeneticPos :: Double
, snpId :: B.ByteString
, snpRef :: Char
, snpAlt :: Char
}
deriving (Eq, Show)
data EigenstratIndEntry = EigenstratIndEntry String Sex String
deriving (Eq, Show)
data Sex = Male
| Female
| Unknown
deriving (Eq, Show)
data GenoEntry = HomRef
| Het
| HomAlt
| Missing
deriving (Eq, Show)
type GenoLine = Vector GenoEntry
eigenstratSnpParser :: A.Parser EigenstratSnpEntry
eigenstratSnpParser = do
snpId_ <- A.skipMany A.space >> word
chrom <- A.skipMany1 A.space >> word
geneticPos <- A.skipMany1 A.space >> A.double
pos <- A.skipMany1 A.space >> A.decimal
ref <- A.skipMany1 A.space >> A.satisfy (A.inClass "ACTGN")
alt <- A.skipMany1 A.space >> A.satisfy (A.inClass "ACTGX")
void A.endOfLine
return $ EigenstratSnpEntry (Chrom chrom) pos geneticPos snpId_ ref alt
eigenstratIndParser :: A.Parser EigenstratIndEntry
eigenstratIndParser = do
A.skipMany A.space
name <- word
A.skipMany1 A.space
sex <- parseSex
A.skipMany1 A.space
popName <- word
void A.endOfLine
return $ EigenstratIndEntry (B.unpack name) sex (B.unpack popName)
parseSex :: A.Parser Sex
parseSex = parseMale <|> parseFemale <|> parseUnknown
where
parseMale = A.char 'M' >> return Male
parseFemale = A.char 'F' >> return Female
parseUnknown = A.char 'U' >> return Unknown
readEigenstratInd :: (MonadIO m) => FilePath -> m [EigenstratIndEntry]
readEigenstratInd fn =
liftIO . withFile fn ReadMode $ \handle ->
P.toListM $ consumeProducer eigenstratIndParser (PB.fromHandle handle)
eigenstratGenoParser :: A.Parser GenoLine
eigenstratGenoParser = do
line <- A.takeWhile1 isValidNum
void A.endOfLine
return . fromList $ do
l <- B.unpack line
case l of
'0' -> return HomAlt
'1' -> return Het
'2' -> return HomRef
'9' -> return Missing
_ -> error "this should never happen"
where
isValidNum c = c == '0' || c == '1' || c == '2' || c == '9'
readEigenstratSnpStdIn :: (MonadThrow m, MonadIO m) => Producer EigenstratSnpEntry m ()
readEigenstratSnpStdIn = consumeProducer eigenstratSnpParser PB.stdin
readEigenstratSnpFile :: (MonadSafe m) => FilePath -> Producer EigenstratSnpEntry m ()
readEigenstratSnpFile = consumeProducer eigenstratSnpParser . readFileProd
readEigenstrat :: (MonadSafe m) => FilePath
-> FilePath
-> FilePath
-> m ([EigenstratIndEntry], Producer (EigenstratSnpEntry, GenoLine) m ())
readEigenstrat genoFile snpFile indFile = do
indEntries <- readEigenstratInd indFile
let snpProd = readEigenstratSnpFile snpFile
genoProd = consumeProducer eigenstratGenoParser (readFileProd genoFile) >->
validateEigenstratEntries (length indEntries)
return (indEntries, P.zip snpProd genoProd)
validateEigenstratEntries :: (MonadThrow m) => Int -> Pipe GenoLine GenoLine m ()
validateEigenstratEntries nr = for cat $ \line ->
if length line /= nr
then do
let msg = "inconsistent nr of genotypes (" <> show (length line) <> ", but should be " <> show nr <> ") in \
\genotype line " <> show line
throw (SeqFormatException msg)
else
yield line
writeEigenstratIndFile :: (MonadIO m) => FilePath -> [EigenstratIndEntry] -> m ()
writeEigenstratIndFile f indEntries =
liftIO . withFile f WriteMode $ \h ->
forM_ indEntries $ \(EigenstratIndEntry name sex popName) ->
hPutStrLn h $ name <> "\t" <> sexToStr sex <> "\t" <> popName
where
sexToStr sex = case sex of
Male -> "M"
Female -> "F"
Unknown -> "U"
writeEigenstratSnp :: (MonadIO m) => Handle
-> Consumer EigenstratSnpEntry m ()
writeEigenstratSnp snpFileH =
let snpOutTextConsumer = PB.toHandle snpFileH
toTextPipe = P.map (\(EigenstratSnpEntry chrom pos gpos gid ref alt) ->
let snpLine = B.intercalate "\t" [gid, unChrom chrom, B.pack (show gpos),
B.pack (show pos), B.singleton ref, B.singleton alt]
in snpLine <> "\n")
in toTextPipe >-> snpOutTextConsumer
writeEigenstratGeno :: (MonadIO m) => Handle
-> Consumer GenoLine m ()
writeEigenstratGeno genoFileH =
let genoOutTextConsumer = PB.toHandle genoFileH
toTextPipe = P.map (\genoLine ->
let genoLineStr = B.concat . map (B.pack . show . toEigenStratNum) . toList $ genoLine
in genoLineStr <> "\n")
in toTextPipe >-> genoOutTextConsumer
where
toEigenStratNum c = case c of
HomRef -> 2 :: Int
Het -> 1
HomAlt -> 0
Missing -> 9
writeEigenstrat :: (MonadSafe m) => FilePath
-> FilePath
-> FilePath
-> [EigenstratIndEntry]
-> Consumer (EigenstratSnpEntry, GenoLine) m ()
writeEigenstrat genoFile snpFile indFile indEntries = do
liftIO $ writeEigenstratIndFile indFile indEntries
let snpOutConsumer = PS.withFile snpFile WriteMode writeEigenstratSnp
genoOutConsumer = PS.withFile genoFile WriteMode writeEigenstratGeno
P.tee (P.map fst >-> snpOutConsumer) >-> P.map snd >-> genoOutConsumer