{- |
Description :  Write a counts file
Copyright   :  (c) Dominik Schrempf 2017
License     :  GPLv3

Maintainer  :  dominik.schrempf@gmail.com
Stability   :  unstable
Portability :  non-portable (not tested)

TODO: Import.

* The Counts Format

The input of PoMo is allele frequency data. Especially when populations have
many individuals, it is preferable to count the number of bases at each
position. This decreases file size and speeds up the parser.

Counts files contain:

- One header line that specifies the file as counts file and states the number
  of populations as well as the number of sites (separated by white space).

- A second header line with white space separated headers: CHROM (chromosome),
  POS (position) and sequence names.

- Many lines with counts of A, C, G and T bases and their respective positions.

Comments:

- Lines starting with # before the first header line are treated as comments.

A toy example:

@
COUNTSFILE  NPOP 5   NSITES N
CHROM  POS  Sheep    BlackSheep  RedSheep  Wolf     RedWolf
1      1    0,0,1,0  0,0,1,0     0,0,1,0   0,0,5,0  0,0,0,1
1      2    0,0,0,1  0,0,0,1     0,0,0,1   0,0,0,5  0,0,0,1
.
.
.
9      8373 0,0,0,1  1,0,0,0     0,1,0,0   0,1,4,0  0,0,1,0
.
.
.
Y      9999 0,0,0,1  0,1,0,0     0,1,0,0   0,5,0,0  0,0,1,0
@

-}

module ELynx.Export.Sequence.CountsFile
  ( Chrom
  , Pos
  , DataOneSite
  , PopulationNames
  , toCountsFile
  ) where

import qualified Data.ByteString.Lazy.Char8            as L
import           Data.Maybe                            (fromMaybe)

import           ELynx.Data.Character.BoundaryMutation
import           ELynx.Tools.ByteString                (alignLeft, alignRight)

-- | The number of sites that will be printed.
type NSites = Int

-- | The names of the populations.
type PopulationNames = [L.ByteString]

-- Desired column width of the counts file.
colW :: Int
colW = 11

-- | Compose the header using the number of sites and the population names.
header :: NSites -> PopulationNames -> L.ByteString
header nSites popNames = L.unlines [lineOne, lineTwo]
  where
    nPop = length popNames
    -- First header line: file tag, number of populations, number of sites.
    lineOne =
      L.pack $ "COUNTSFILE NPOP " ++ show nPop ++ " NSITES " ++ show nSites
    -- Second header line: the column titles, each passed through
    -- 'alignLeft' / 'alignRight' with the shared width 'colW'.
    -- NOTE(review): population names are left-aligned here, whereas the counts
    -- in 'dataLine' are right-aligned; kept as is to preserve the output.
    lineTwo =
      L.unwords $
        [alignLeft colW $ L.pack "CHROM", alignRight colW $ L.pack "POS"]
          ++ map (alignLeft colW) popNames

-- | The chromosome name.
type Chrom = L.ByteString

-- | The position on the chromosome.
type Pos = Int

-- | The set of boundary states for one site.
type DataOneSite = [State]

-- | Get a data line in the counts file.
--
-- A missing chromosome is written as @NA@ and a missing position as @NaN@.
-- Every state is rendered with 'showCounts'; all fields are joined by single
-- spaces after being aligned to width 'colW'.
dataLine :: Maybe Chrom -> Maybe Pos -> DataOneSite -> L.ByteString
dataLine chrom mPos bstates =
  L.unwords $
    [ alignLeft colW (fromMaybe (L.pack "NA") chrom)
    , alignRight colW (L.pack (maybe "NaN" show mPos))
    ]
      ++ map (alignRight colW . showCounts) bstates

-- | Convert data to a counts file.
--
-- The result consists of the two header lines followed by one line per element
-- of the given list, in list order. The number of sites reported in the header
-- is the length of that list.
toCountsFile
  :: PopulationNames
  -> [(Maybe Chrom, Maybe Pos, DataOneSite)]
  -> L.ByteString
toCountsFile ns d = header (length d) ns `L.append` L.unlines dataLines
  -- 'header' already ends with a newline (it is built with 'L.unlines'), so it
  -- is appended directly; feeding it to 'L.unlines' as a list element would
  -- emit an empty line between the column header and the first data line.
  where
    dataLines = [dataLine chrom pos states | (chrom, pos, states) <- d]