{- | Description : State space of the boundary mutation model Copyright : (c) Dominik Schrempf 2017 License : GPLv3 Maintainer : dominik.schrempf@gmail.com Stability : unstable Portability : non-portable (not tested) The boundary mutation model is a discrete-state, continuous-time Markov process that allows mutations only when the population is monomorphic. * Changelog TODO: BM states can not be read and written with a single letter, like characters. Check status of implementation. -} module ELynx.Data.Character.BoundaryMutation ( -- * Types Nucleotide , Allele , PopulationSize , AlleleCount , State(..) , showCounts , nFixed -- * Functions , setPopulationSize , fromIndexWith , toIndex , stateSpace , stateSpaceSize , neighbors ) where import qualified Data.ByteString.Lazy.Char8 as L import Numeric.SpecFunctions (choose) import ELynx.Data.Character.Nucleotide import ELynx.Tools.Misc -- | Alleles are just nucleotides at the moment. However, I want to keep the -- code such that it can be extended easily to codons or amino acids. type Allele = Nucleotide -- | The population size has to be larger than one otherwise there be dragons. type PopulationSize = Int -- | The absolute frequency of an allele. type AlleleCount = Int -- | The number of alleles. nAlleles :: Int nAlleles = 1 + fromEnum (maxBound :: Allele) -- | A boundary mutation model state is either a boundary state or a polymorphic -- state. The population size has to be larger than one; the allele count has to -- be larger than one and lower than the population size, otherwise there be -- dragons. -- -- Another possibility would be: -- @ -- data State = Bnd Allele | Ply AlleleCount Allele Allele -- data StateComplete = StateComplete PopulationSize State -- @ -- But then, I think it is more important that the information is kept in one, -- at the cost of some overhead. data State = Bnd { bndN :: PopulationSize -- | Population size. , bndA :: Allele } | Ply { plyN :: PopulationSize -- | Population size. , plyI :: AlleleCount -- | Allele count. , plyA :: Allele , plyB :: Allele } deriving (Read, Eq) -- | L.ByteString representation of 'State'; without surrounding brackets. showCounts :: State -> L.ByteString showCounts (Bnd n a) = L.intersperse ',' $ L.concat $ map (L.pack . toCounts) allValues where toCounts b | a == b = show n | otherwise = "0" showCounts (Ply n i a b) = L.intersperse ',' $ L.concat $ map (L.pack . toCounts) allValues where toCounts c | c == a = show i | c == b = show (n-i) | otherwise = "0" showState :: State -> L.ByteString showState s = L.singleton '(' <> showCounts s <> L.singleton ')' -- instance Show State where -- show s = "(" ++ showCounts s ++ ")" -- | A total order on the boundary mutation model states. In general, Bnd < Ply. -- Then, sorting happens according to the order population size, first allele, -- second allele, allele count. It may be beneficial to reverse the allele count -- order (i.e., make a polymorphic state with higher allele count show up before -- a polymorphic state with lower allele count, this would move some polymorphic -- states closer to their respective boundaries), instance Ord State where Bnd {} <= Ply {} = True Ply {} <= Bnd {} = False s@(Bnd n a) <= t@(Bnd m b) | s == t = True | n /= m = n <= m | otherwise = a <= b s@(Ply n i a b) <= t@(Ply m j c d) | s == t = True | n /= m = n <= m | a < c = True | a > c = False -- We can be sure that a = c now. | b < d = True | b > d = False -- Now we can be sure that both nucleotides are the same. | otherwise = i <= j -- | Fixed population size when converting a 'State' to or from a number. In -- this case, a fixed population size is necessary so that @toEnum . fromEnum == -- id@. When converting from a number to 'State', the population size has to be -- given or assumed (see 'fromIndexWith') anyways. Especially when performing IO, -- the same number should always correspond to the same 'State' (bijection). -- 'nFixed' has been set such that the size of the state space is 256. nFixed :: Int nFixed = 43 -- | Set the population size of a 'State'; validity of resulting 'State' is checked. setPopulationSize :: PopulationSize -> State -> Maybe State setPopulationSize n s = if valid s' then Just s' else Nothing where s' = unsafeSetPopulationSize n s -- | See 'setPopulationSize'. Does not check if resulting 'State' is valid. unsafeSetPopulationSize :: Int -> State -> State unsafeSetPopulationSize n (Bnd _ s) = Bnd n s unsafeSetPopulationSize n (Ply _ i a b) = Ply n i a b -- | For a given population size 'PopulationSize', convert a number 'Int' to 'State'. fromIndexWith :: PopulationSize -> Int -> State fromIndexWith n i | i >= stateSpaceSize n = error $ "Index " ++ show i ++ "out of bounds when population size is " ++ show n ++ "." | i < nAlleles = Bnd n (toEnum i) | otherwise = Ply n (i' - j + 1) x k where i' = i - nAlleles l = [ (enumCombination a b * (n-1), a, b) | a <- [minBound .. pred maxBound] , b <- [succ a ..]] (j, x, k) = last $ takeWhile (\(e, _, _) -> e <= i') l -- | Convert 'State' to a number 'Int' for the given population size 'PopulationSize'. -- Back conversion can be done with 'fromIndexWith', with the same population size. toIndex :: State -> Int toIndex (Bnd _ a) = fromEnum a -- We also have to shift the enumeration value by the number of boundary -- states, which is 'nAlleles'. toIndex (Ply n i a b) = nAlleles + enumCombination a b * (n-1) + i-1 -- | Enumeration only works when the population size is 'nFixed'. Only then, -- @toEnum . fromEnum == id@ can be guaranteed. This is because @toEnum :: -- State@ is only defined if the population size is known. See also -- 'fromIndexWith', and 'toIndex', as well as, 'setPopulationSize'. instance Enum State where fromEnum s = if getPopulationSize s /= nFixed then error $ "State is not enumerable: " ++ (L.unpack . showState) s ++ "." else toIndex s toEnum = fromIndexWith nFixed -- The formula is a little complicated. Sketch of derivation: Order the states -- in the following way: -- @ -- AC CG GT -- AG CT -- AT -- @ -- The edge length of the triangle is @'nAlleles' - 1@. Use Gauss's triangle -- equation @area=binom(length+1, 2)@ twice to count the number of combinations -- up to a certain allele. E.g., up to, but excluding G: -- @ -- AC CG -- AG CT -- AT -- @ countCombinationsUpToAllele :: Allele -> Int countCombinationsUpToAllele a = round $ nAlleles `choose` 2 - (nAlleles - fromEnum a) `choose` 2 -- See 'countCombinationsUpToAllele'. The @-1@ pops up because we start counting -- from 0. For example, the enumeration value of @GT@ (with @fromEnum G = k = 2@ -- and @fromEnum T = 3@) is then @enumCombinationsUpToK 2 + (3-2)@. enumCombination :: Allele -> Allele -> Int enumCombination a b = countCombinationsUpToAllele a - 1 + (fromEnum b - fromEnum a) -- | A fixed population size 'nFixed' is assumed. instance Bounded State where minBound = Bnd nFixed minBound maxBound = Ply nFixed (nFixed-1) (pred maxBound) maxBound -- -- I am not sure if I should remove the 'Character' instance because writing -- -- Fasta files with boundary mutation model states is not really promising -- -- anyways. However, the 'toIndex' and 'fromIndexWith' function provide a -- -- convenient way to map states to integers. This functionality is needed when -- -- working with matrices. -- -- | A fixed population size 'nFixed' is assumed. -- instance Character State where -- fromWord = toEnum . fromEnum -- toWord = toEnum . fromEnum -- -- FIXME: This requires more thought. Are polymorphic characters standard? -- isStandard _ = error "Requires more thought." -- -- FIXME: This requires more work. (0,0,0,0) should be a gap! -- isGapOrUnknown _ = error "Not implemented." valid :: State -> Bool valid (Bnd n _) | n <= 1 = False | otherwise = True valid (Ply n i a b) | n <= 1 = False | a >= b = False | i <= 0 = False | i >= n = False | otherwise = True filterValidStates :: [State] -> [State] filterValidStates = filter valid getPopulationSize :: State -> PopulationSize getPopulationSize (Bnd n _) = n getPopulationSize (Ply n _ _ _) = n -- CCC: This function is a not very efficient. A better would be something like: -- @ -- | Sorted list of all possible PoMo states for a specific population size. stateSpace :: PopulationSize -> [State] stateSpace n = map (fromIndexWith n) [0 .. stateSpaceSize n - 1] -- An easier, but slower implementation. -- stateSpace n -- | n <= 1 = error "The population size has to be larger than one." -- | otherwise = sort $ filterValidStates ( allBndStates ++ allPlyStates ) -- where allBndStates = [ Bnd n a | -- a <- [minBound .. maxBound] :: [Allele] ] -- allPlyStates = [ Ply n i a b | -- i <- [0..n], -- a <- [minBound .. maxBound] :: [Allele], -- b <- [minBound .. maxBound] :: [Allele] ] -- | The state space of the boundary mutation model for four alleles and a -- population size N is 4 + 6*(N-1). stateSpaceSize :: PopulationSize -> Int stateSpaceSize n = k + k*(k-1) `div` 2 * (n-1) where k = nAlleles -- -- This is a very convenient version of toIndex, but can we always guarantee -- -- that the state space is sorted the same way? -- -- | Convert a boundary state to its ID (integer). See also 'idToState'. -- stateToId :: State -> Maybe Int -- stateToId s = elemIndex s (stateSpace $ getPopulationSize s) -- -- Same here. -- -- | Convert an ID to a boundary state. See also 'stateID'. -- idToState :: PopulationSize -> Int -> State -- idToState n i = stateSpace n !! i -- | Check if two states are connected. By definition, states are NOT connected -- with themselves. neighbors :: State -> State -> Bool neighbors s t = s `elem` getNeighbors t getNeighbors :: State -> [State] getNeighbors (Bnd n a) = filterValidStates allNeighbors where allNeighbors = [ Ply n (n-1) a b | b <- [minBound .. maxBound] :: [Allele] ] ++ [ Ply n 1 b a | b <- [minBound .. maxBound] :: [Allele] ] getNeighbors (Ply n i a b) -- Careful when the population size is two, because then each polymorphic -- states has two boundary states as neighbors. | i == 1 && n == 2 = Bnd n a : [Bnd n b] | i == 1 = Bnd n b : [Ply n 2 a b] | i == (n-1) = Bnd n a : [Ply n (n-2) a b] | otherwise = Ply n (i+1) a b : [Ply n (i-1) a b]