{-# LANGUAGE GeneralizedNewtypeDeriving, TypeFamilies, CPP #-} {-# LANGUAGE ExistentialQuantification, TemplateHaskell #-} -- | Common data types used everywhere. This module is a collection of -- very basic "bioinformatics" data types that are simple, but don't -- make sense to define over and over. module Bio.Base( Nucleotide(..), Nucleotides(..), Qual(..), toQual, fromQual, fromQualRaised, probToQual, Prob'(..), Prob, toProb, fromProb, qualToProb, pow, Word8, nucA, nucC, nucG, nucT, nucsA, nucsC, nucsG, nucsT, nucsN, gap, toNucleotide, toNucleotides, nucToNucs, showNucleotide, showNucleotides, isGap, isBase, isProperBase, properBases, compl, compls, Seqid, Position(..), shiftPosition, p_is_reverse, Range(..), shiftRange, reverseRange, extendRange, insideRange, wrapRange, w2c, c2w, findAuxFile ) where import BasePrelude #if MIN_VERSION_base(4,9,0) hiding ( log1pexp, log1mexp ) #endif import Bio.Util.Numeric ( log1pexp, log1mexp ) import Data.ByteString.Internal ( c2w, w2c ) import Data.Vector.Unboxed.Deriving ( derivingUnbox ) import System.Directory ( doesFileExist ) import System.FilePath ( (</>), isAbsolute, splitSearchPath ) import qualified Data.ByteString.Char8 as S import qualified Data.Vector.Unboxed as U -- | A nucleotide base. We only represent A,C,G,T. The contained -- 'Word8' ist guaranteed to be 0..3. newtype Nucleotide = N { unN :: Word8 } deriving ( Eq, Ord, Enum, Ix, Storable ) derivingUnbox "Nucleotide" [t| Nucleotide -> Word8 |] [| unN |] [| N |] instance Bounded Nucleotide where minBound = N 0 maxBound = N 3 -- | A nucleotide base in an alignment. -- Experience says we're dealing with Ns and gaps all the type, so -- purity be damned, they are included as if they were real bases. -- -- To allow @Nucleotides@s to be unpacked and incorporated into -- containers, we choose to represent them the same way as the BAM file -- format: as a 4 bit wide field. Gaps are encoded as 0 where they -- make sense, N is 15. The contained 'Word8' is guaranteed to be -- 0..15. newtype Nucleotides = Ns { unNs :: Word8 } deriving ( Eq, Ord, Enum, Ix, Storable ) derivingUnbox "Nucleotides" [t| Nucleotides -> Word8 |] [| unNs |] [| Ns |] instance Bounded Nucleotides where minBound = Ns 0 maxBound = Ns 15 nucToNucs :: Nucleotide -> Nucleotides nucToNucs (N x) = Ns $ 1 `shiftL` fromIntegral (x .&. 3) -- | Qualities are stored in deciban, also known as the Phred scale. To -- represent a value @p@, we store @-10 * log_10 p@. Operations work -- directly on the \"Phred\" value, as the name suggests. The same goes -- for the 'Ord' instance: greater quality means higher \"Phred\" -- score, meand lower error probability. newtype Qual = Q { unQ :: Word8 } deriving ( Eq, Ord, Storable, Bounded ) derivingUnbox "Qual" [t| Qual -> Word8 |] [| unQ |] [| Q |] instance Show Qual where showsPrec p (Q q) = (:) 'q' . showsPrec p q toQual :: (Floating a, RealFrac a) => a -> Qual toQual a = Q $ round (-10 * log a / log 10) fromQual :: Qual -> Double fromQual (Q q) = 10 ** (- fromIntegral q / 10) fromQualRaised :: Double -> Qual -> Double fromQualRaised k (Q q) = 10 ** (- k * fromIntegral q / 10) -- | A positive floating point value stored in log domain. We store the -- natural logarithm (makes computation easier), but allow conversions -- to the familiar \"Phred\" scale used for 'Qual' values. newtype Prob' a = Pr { unPr :: a } deriving ( Eq, Ord, Storable ) -- | Common way of using 'Prob''. type Prob = Prob' Double derivingUnbox "Prob'" [t| forall a . U.Unbox a => Prob' a -> a |] [| unPr |] [| Pr |] instance RealFloat a => Show (Prob' a) where showsPrec _ (Pr p) = (:) 'q' . showFFloat (Just 1) q where q = - 10 * p / log 10 instance (Floating a, Ord a) => Num (Prob' a) where {-# INLINE fromInteger #-} fromInteger a = Pr (log (fromInteger a)) {-# INLINE (+) #-} Pr x + Pr y = Pr $ if x >= y then x + log1pexp (y-x) else y + log1pexp (x-y) {-# INLINE (-) #-} Pr x - Pr y = Pr $ if x >= y then x + log1mexp (y-x) else error "no negative error probabilities" {-# INLINE (*) #-} Pr a * Pr b = Pr $ a + b {-# INLINE negate #-} negate _ = Pr $ error "no negative error probabilities" {-# INLINE abs #-} abs x = x {-# INLINE signum #-} signum _ = Pr 0 instance (Floating a, Fractional a, Ord a) => Fractional (Prob' a) where fromRational a = Pr (log (fromRational a)) Pr a / Pr b = Pr (a - b) recip (Pr a) = Pr (negate a) infixr 8 `pow` pow :: Num a => Prob' a -> a -> Prob' a pow (Pr a) e = Pr $ a * e toProb :: Floating a => a -> Prob' a toProb p = Pr (log p) fromProb :: Floating a => Prob' a -> a fromProb (Pr q) = exp q qualToProb :: Floating a => Qual -> Prob' a qualToProb (Q q) = Pr (- log 10 * fromIntegral q / 10) probToQual :: (Floating a, RealFrac a) => Prob' a -> Qual probToQual (Pr p) = Q (round (- 10 * p / log 10)) nucA, nucC, nucG, nucT :: Nucleotide nucA = N 0 nucC = N 1 nucG = N 2 nucT = N 3 gap, nucsA, nucsC, nucsG, nucsT, nucsN :: Nucleotides gap = Ns 0 nucsA = Ns 1 nucsC = Ns 2 nucsG = Ns 4 nucsT = Ns 8 nucsN = Ns 15 -- | Sequence identifiers are ASCII strings -- Since we tend to store them for a while, we use strict byte strings. type Seqid = S.ByteString -- | Coordinates in a genome. -- The position is zero-based, no questions about it. Think of the -- position as pointing to the crack between two bases: looking forward -- you see the next base to the right, looking in the reverse direction -- you see the complement of the first base to the left. -- -- To encode the strand, we (virtually) reverse-complement any sequence -- and prepend it to the normal one. That way, reversed coordinates -- have a negative sign and automatically make sense. Position 0 could -- either be the beginning of the sequence or the end on the reverse -- strand... that ambiguity shouldn't really matter. data Position = Pos { p_seq :: {-# UNPACK #-} !Seqid, -- ^ sequence (e.g. some chromosome) p_start :: {-# UNPACK #-} !Int -- ^ offset, zero-based } deriving (Show, Eq, Ord) p_is_reverse :: Position -> Bool p_is_reverse = (< 0) . p_start -- | Ranges in genomes -- We combine a position with a length. In 'Range pos len', 'pos' is -- always the start of a stretch of length 'len'. Positions therefore -- move in the opposite direction on the reverse strand. To get the -- same stretch on the reverse strand, shift r_pos by r_length, then -- reverse direction (or call reverseRange). data Range = Range { r_pos :: {-# UNPACK #-} !Position, r_length :: {-# UNPACK #-} !Int } deriving (Show, Eq, Ord) -- | Converts a character into a 'Nucleotides'. -- The usual codes for A,C,G,T and U are understood, '-' and '.' become -- gaps and everything else is an N. toNucleotide :: Char -> Nucleotide toNucleotide c = if ord c < 128 then N (ar `U.unsafeIndex` ord c) else N 0 where ar = U.replicate 128 0 U.// ( [ (ord x, n) | (x, N n) <- pairs ] ++ [ (ord (toUpper x), n) | (x, N n) <- pairs ] ) pairs = [ ('a', nucA), ('c', nucC), ('g', nucG), ('t', nucT) ] -- | Converts a character into a 'Nucleotides'. -- The usual codes for A,C,G,T and U are understood, '-' and '.' become -- gaps and everything else is an N. toNucleotides :: Char -> Nucleotides toNucleotides c = if ord c < 128 then Ns (ar `U.unsafeIndex` ord c) else nucsN where ar = U.replicate 128 (unNs nucsN) U.// ( [ (ord x, n) | (x, Ns n) <- pairs ] ++ [ (ord (toUpper x), n) | (x, Ns n) <- pairs ] ) Ns a `plus` Ns b = Ns (a .|. b) pairs = [ ('a', nucsA), ('c', nucsC), ('g', nucsG), ('t', nucsT), ('u', nucsT), ('-', gap), ('.', gap), ('b', nucsC `plus` nucsG `plus` nucsT), ('d', nucsA `plus` nucsG `plus` nucsT), ('h', nucsA `plus` nucsC `plus` nucsT), ('v', nucsA `plus` nucsC `plus` nucsG), ('k', nucsG `plus` nucsT), ('m', nucsA `plus` nucsC), ('s', nucsC `plus` nucsG), ('w', nucsA `plus` nucsT), ('r', nucsA `plus` nucsG), ('y', nucsC `plus` nucsT) ] -- | Tests if a 'Nucleotides' is a base. -- Returns 'True' for everything but gaps. isBase :: Nucleotides -> Bool isBase (Ns n) = n /= 0 -- | Tests if a 'Nucleotides' is a proper base. -- Returns 'True' for A,C,G,T only. isProperBase :: Nucleotides -> Bool isProperBase x = x == nucsA || x == nucsC || x == nucsG || x == nucsT properBases :: [ Nucleotides ] properBases = [ nucsA, nucsC, nucsG, nucsT ] -- | Tests if a 'Nucleotides' is a gap. -- Returns true only for the gap. isGap :: Nucleotides -> Bool isGap x = x == gap {-# INLINE showNucleotide #-} showNucleotide :: Nucleotide -> Char showNucleotide (N x) = S.index str $ fromIntegral $ x .&. 3 where str = S.pack "ACGT" {-# INLINE showNucleotides #-} showNucleotides :: Nucleotides -> Char showNucleotides (Ns x) = S.index str $ fromIntegral $ x .&. 15 where str = S.pack "-ACMGRSVTWYHKDBN" instance Show Nucleotide where show x = [ showNucleotide x ] showList l = (map showNucleotide l ++) instance Read Nucleotide where readsPrec _ ('a':cs) = [(nucA, cs)] readsPrec _ ('A':cs) = [(nucA, cs)] readsPrec _ ('c':cs) = [(nucC, cs)] readsPrec _ ('C':cs) = [(nucC, cs)] readsPrec _ ('g':cs) = [(nucG, cs)] readsPrec _ ('G':cs) = [(nucG, cs)] readsPrec _ ('t':cs) = [(nucT, cs)] readsPrec _ ('T':cs) = [(nucT, cs)] readsPrec _ ('u':cs) = [(nucT, cs)] readsPrec _ ('U':cs) = [(nucT, cs)] readsPrec _ _ = [ ] readList ('-':cs) = readList cs readList (c:cs) | isSpace c = readList cs | otherwise = case reads (c:cs) of [] -> [ ([],c:cs) ] xs -> [ (n:ns,r2) | (n,r1) <- xs, (ns,r2) <- readList r1 ] readList [] = [([],[])] instance Show Nucleotides where show x = [ showNucleotides x ] showList l = (map showNucleotides l ++) instance Read Nucleotides where readsPrec _ (c:cs) = [(toNucleotides c, cs)] readsPrec _ [ ] = [] readList s = let (hd,tl) = span (\c -> isAlpha c || isSpace c || '-' == c) s in [(map toNucleotides $ filter (not . isSpace) hd, tl)] -- | Complements a Nucleotides. {-# INLINE compl #-} compl :: Nucleotide -> Nucleotide compl (N n) = N $ n `xor` 3 -- | Complements a Nucleotides. {-# INLINE compls #-} compls :: Nucleotides -> Nucleotides compls (Ns x) = Ns $ ar `U.unsafeIndex` fromIntegral (x .&. 15) where !ar = U.fromListN 16 [ 0, 8, 4, 12, 2, 10, 6, 14, 1, 9, 5, 13, 3, 11, 7, 15 ] -- | Moves a @Position@. The position is moved forward according to the -- strand, negative indexes move backward accordingly. shiftPosition :: Int -> Position -> Position shiftPosition a p = p { p_start = p_start p + a } -- | Moves a @Range@. This is just @shiftPosition@ lifted. shiftRange :: Int -> Range -> Range shiftRange a r = r { r_pos = shiftPosition a (r_pos r) } -- | Reverses a 'Range' to give the same @Range@ on the opposite strand. reverseRange :: Range -> Range reverseRange (Range (Pos sq pos) len) = Range (Pos sq (-pos-len)) len -- | Extends a range. The length of the range is simply increased. extendRange :: Int -> Range -> Range extendRange a r = r { r_length = r_length r + a } -- | Expands a subrange. -- @(range1 `insideRange` range2)@ interprets @range1@ as a subrange of -- @range2@ and computes its absolute coordinates. The sequence name of -- @range1@ is ignored. insideRange :: Range -> Range -> Range insideRange r1@(Range (Pos _ start1) length1) r2@(Range (Pos sq start2) length2) | start2 < 0 = reverseRange (insideRange r1 (reverseRange r2)) | start1 <= length2 = Range (Pos sq (start2 + start1)) (min length1 (length2 - start1)) | otherwise = Range (Pos sq (start2 + length2)) 0 -- | Wraps a range to a region. This simply normalizes the start -- position to be in the interval '[0,n)', which only makes sense if the -- @Range@ is to be mapped onto a circular genome. This works on both -- strands and the strand information is retained. wrapRange :: Int -> Range -> Range wrapRange n (Range (Pos sq s) l) = Range (Pos sq (s `mod` n)) l -- | Finds a file by searching the environment variable BIOHAZARD like a -- PATH. findAuxFile :: FilePath -> IO FilePath findAuxFile fn | isAbsolute fn = return fn | otherwise = go . maybe ["."] splitSearchPath . lookup "BIOHAZARD" =<< getEnvironment where go [ ] = return fn go (p:ps) = doesFileExist (p </> fn) >>= bool (return $ p </> fn) (go ps)