{-# LANGUAGE OverloadedStrings #-} {-|Module to read and parse Eigenstrat-formatted genotype data. The Eigenstrat format is defined at . -} module SequenceFormats.Eigenstrat (EigenstratSnpEntry(..), EigenstratIndEntry(..), readEigenstratInd, GenoEntry(..), GenoLine, Sex(..), readEigenstratSnpStdIn, readEigenstratSnpFile, readBimStdIn, readBimFile, readEigenstrat, writeEigenstrat, writeEigenstratIndFile, writeEigenstratSnp, writeBim, writeEigenstratGeno) where import SequenceFormats.Utils (consumeProducer, SeqFormatException(..), Chrom(..), readFileProd, word) import Control.Applicative ((<|>)) import Control.Exception (throw) import Control.Monad (void, forM_) import Control.Monad.Catch (MonadThrow) import Control.Monad.IO.Class (MonadIO, liftIO) import qualified Data.Attoparsec.ByteString.Char8 as A import Data.Vector (Vector, fromList, toList) import qualified Data.ByteString.Char8 as B import Pipes (Producer, Pipe, (>->), for, cat, yield, Consumer) import Pipes.Safe (MonadSafe) import qualified Pipes.Safe.Prelude as PS import qualified Pipes.Prelude as P import qualified Pipes.ByteString as PB import System.IO (withFile, IOMode(..), Handle, hPutStrLn) -- |A datatype to represent a single genomic SNP. The constructor arguments are: -- Chromosome, Position, Reference Allele, Alternative Allele. data EigenstratSnpEntry = EigenstratSnpEntry { snpChrom :: Chrom, snpPos :: Int, snpGeneticPos :: Double, snpId :: B.ByteString, snpRef :: Char, snpAlt :: Char } deriving (Eq, Show) -- |A datatype to represent a single individual. The constructor arguments are: -- Name, Sex and Population Name data EigenstratIndEntry = EigenstratIndEntry String Sex String deriving (Eq, Show) -- |A datatype to represent Sex in an Eigenstrat Individual file data Sex = Male | Female | Unknown deriving (Eq, Show) -- |A datatype to represent the genotype of an individual at a SNP. data GenoEntry = HomRef | Het | HomAlt | Missing deriving (Eq, Show) -- |Vector of the genotypes of all individuals at a single SNP. type GenoLine = Vector GenoEntry eigenstratSnpParser :: A.Parser EigenstratSnpEntry eigenstratSnpParser = do snpId_ <- A.skipMany A.space >> word chrom <- A.skipMany1 A.space >> word geneticPos <- A.skipMany1 A.space >> A.double pos <- A.skipMany1 A.space >> A.decimal ref <- A.skipMany1 A.space >> A.satisfy (A.inClass "ACTGN") alt <- A.skipMany1 A.space >> A.satisfy (A.inClass "ACTGX") void A.endOfLine return $ EigenstratSnpEntry (Chrom chrom) pos geneticPos snpId_ ref alt bimParser :: A.Parser EigenstratSnpEntry bimParser = do chrom <- word snpId_ <- A.skipMany1 A.space >> word geneticPos <- A.skipMany1 A.space >> A.double pos <- A.skipMany1 A.space >> A.decimal ref <- A.skipMany1 A.space >> A.satisfy (A.inClass "ACTGN") alt <- A.skipMany1 A.space >> A.satisfy (A.inClass "ACTGX") void A.endOfLine return $ EigenstratSnpEntry (Chrom chrom) pos geneticPos snpId_ ref alt eigenstratIndParser :: A.Parser EigenstratIndEntry eigenstratIndParser = do A.skipMany A.space name <- word A.skipMany1 A.space sex <- parseSex A.skipMany1 A.space popName <- word void A.endOfLine return $ EigenstratIndEntry (B.unpack name) sex (B.unpack popName) parseSex :: A.Parser Sex parseSex = parseMale <|> parseFemale <|> parseUnknown where parseMale = A.char 'M' >> return Male parseFemale = A.char 'F' >> return Female parseUnknown = A.char 'U' >> return Unknown -- |Function to read an Eigenstrat individual file. Returns the Eigenstrat Individual Entries as list. readEigenstratInd :: (MonadIO m) => FilePath -> m [EigenstratIndEntry] readEigenstratInd fn = liftIO . withFile fn ReadMode $ \handle -> P.toListM $ consumeProducer eigenstratIndParser (PB.fromHandle handle) eigenstratGenoParser :: A.Parser GenoLine eigenstratGenoParser = do line <- A.takeWhile1 isValidNum void A.endOfLine return . fromList $ do l <- B.unpack line case l of '0' -> return HomAlt '1' -> return Het '2' -> return HomRef '9' -> return Missing _ -> error "this should never happen" where isValidNum c = c == '0' || c == '1' || c == '2' || c == '9' -- |Function to read a Snp File from StdIn. Returns a Pipes-Producer over the EigenstratSnpEntries. readEigenstratSnpStdIn :: (MonadThrow m, MonadIO m) => Producer EigenstratSnpEntry m () readEigenstratSnpStdIn = consumeProducer eigenstratSnpParser PB.stdin -- |Function to read a Snp File from a file. Returns a Pipes-Producer over the EigenstratSnpEntries. readEigenstratSnpFile :: (MonadSafe m) => FilePath -> Producer EigenstratSnpEntry m () readEigenstratSnpFile = consumeProducer eigenstratSnpParser . readFileProd -- |Function to read a Bim File from StdIn. Returns a Pipes-Producer over the EigenstratSnpEntries. readBimStdIn :: (MonadThrow m, MonadIO m) => Producer EigenstratSnpEntry m () readBimStdIn = consumeProducer bimParser PB.stdin -- |Function to read a Bim File from a file. Returns a Pipes-Producer over the EigenstratSnpEntries. readBimFile :: (MonadSafe m) => FilePath -> Producer EigenstratSnpEntry m () readBimFile = consumeProducer bimParser . readFileProd -- |Function to read a full Eigenstrat database from files. Returns a pair of the Eigenstrat Individual Entries, and a joint Producer over the snp entries and the genotypes. readEigenstrat :: (MonadSafe m) => FilePath -- ^The Genotype file -> FilePath -- ^The Snp File -> FilePath -- ^The Ind file -> m ([EigenstratIndEntry], Producer (EigenstratSnpEntry, GenoLine) m ()) -- The return pair of individual entries and a joint Snp/Geno Producer. readEigenstrat genoFile snpFile indFile = do indEntries <- readEigenstratInd indFile let snpProd = readEigenstratSnpFile snpFile genoProd = consumeProducer eigenstratGenoParser (readFileProd genoFile) >-> validateEigenstratEntries (length indEntries) return (indEntries, P.zip snpProd genoProd) validateEigenstratEntries :: (MonadThrow m) => Int -> Pipe GenoLine GenoLine m () validateEigenstratEntries nr = for cat $ \line -> if length line /= nr then do let msg = "inconsistent nr of genotypes (" <> show (length line) <> ", but should be " <> show nr <> ") in \ \genotype line " <> show line throw (SeqFormatException msg) else yield line -- |Function to write an Eigenstrat Ind file. writeEigenstratIndFile :: (MonadIO m) => FilePath -> [EigenstratIndEntry] -> m () writeEigenstratIndFile f indEntries = liftIO . withFile f WriteMode $ \h -> forM_ indEntries $ \(EigenstratIndEntry name sex popName) -> hPutStrLn h $ name <> "\t" <> sexToStr sex <> "\t" <> popName where sexToStr sex = case sex of Male -> "M" Female -> "F" Unknown -> "U" -- |Function to write an Eigenstrat Snp File. Returns a consumer expecting EigenstratSnpEntries. writeEigenstratSnp :: (MonadIO m) => Handle -- ^The Eigenstrat Snp File Handle. -> Consumer EigenstratSnpEntry m () -- ^A consumer to read EigenstratSnpEntries writeEigenstratSnp snpFileH = let snpOutTextConsumer = PB.toHandle snpFileH toTextPipe = P.map (\(EigenstratSnpEntry chrom pos gpos gid ref alt) -> let snpLine = B.intercalate "\t" [gid, unChrom chrom, B.pack (show gpos), B.pack (show pos), B.singleton ref, B.singleton alt] in snpLine <> "\n") in toTextPipe >-> snpOutTextConsumer -- |Function to write a Bim file. Returns a consumer expecting EigenstratSnpEntries. writeBim :: (MonadIO m) => Handle -- ^The Eigenstrat Snp File handle. -> Consumer EigenstratSnpEntry m () -- ^A consumer to read EigenstratSnpEntries writeBim snpFileH = let snpOutTextConsumer = PB.toHandle snpFileH toTextPipe = P.map (\(EigenstratSnpEntry chrom pos gpos gid ref alt) -> B.intercalate "\t" [unChrom chrom, gid, B.pack (show gpos), B.pack (show pos), B.singleton ref, B.singleton alt]) in toTextPipe >-> snpOutTextConsumer -- |Function to write an Eigentrat Geno File. Returns a consumer expecting Eigenstrat Genolines. writeEigenstratGeno :: (MonadIO m) => Handle -- ^The Genotype file handle -> Consumer GenoLine m () -- ^A consumer to read Genotype entries. writeEigenstratGeno genoFileH = let genoOutTextConsumer = PB.toHandle genoFileH toTextPipe = P.map (\genoLine -> let genoLineStr = B.concat . map (B.pack . show . toEigenStratNum) . toList $ genoLine in genoLineStr <> "\n") in toTextPipe >-> genoOutTextConsumer where toEigenStratNum c = case c of HomRef -> 2 :: Int Het -> 1 HomAlt -> 0 Missing -> 9 -- |Function to write an Eigenstrat Database. Returns a consumer expecting joint Snp- and Genotype lines. writeEigenstrat :: (MonadSafe m) => FilePath -- ^The Genotype file -> FilePath -- ^The Snp File -> FilePath -- ^The Ind file -> [EigenstratIndEntry] -- ^The list of individual entries -> Consumer (EigenstratSnpEntry, GenoLine) m () -- ^A consumer to read joint Snp/Genotype entries. writeEigenstrat genoFile snpFile indFile indEntries = do liftIO $ writeEigenstratIndFile indFile indEntries let snpOutConsumer = PS.withFile snpFile WriteMode writeEigenstratSnp genoOutConsumer = PS.withFile genoFile WriteMode writeEigenstratGeno P.tee (P.map fst >-> snpOutConsumer) >-> P.map snd >-> genoOutConsumer