-- | Degenerate base symbol representation. We use the same conventions as in -- <> which ignores -- @U@racil, except if it stands alone for @Char@ and @XNA@ targets. If the -- 'Degenerate' target is @RNA@, then we create @U@s instead of @T@s. -- -- TODO Shall we handle 'Complement' for degenerates? module Biobase.Primary.IUPAC where import Control.Arrow ((***)) import Data.ByteString.Char8 (ByteString,unpack) import Data.Char (toUpper) import Data.FileEmbed (embedFile) import Data.List (nub,sort) import Data.String import Data.Tuple (swap) import qualified Data.Vector.Unboxed as VU import Control.Category ((>>>)) import Biobase.Primary.Letter import Biobase.Primary.Nuc import qualified Biobase.Primary.Nuc.RNA as R -- | Allow the full, including degenerates, alphabet. data DEG pattern A = Letter 0 :: Letter DEG pattern C = Letter 1 :: Letter DEG pattern G = Letter 2 :: Letter DEG pattern T = Letter 3 :: Letter DEG pattern U = Letter 4 :: Letter DEG pattern W = Letter 5 :: Letter DEG pattern S = Letter 6 :: Letter DEG pattern M = Letter 7 :: Letter DEG pattern K = Letter 8 :: Letter DEG pattern R = Letter 9 :: Letter DEG pattern Y = Letter 10 :: Letter DEG pattern B = Letter 11 :: Letter DEG pattern D = Letter 12 :: Letter DEG pattern H = Letter 13 :: Letter DEG pattern V = Letter 14 :: Letter DEG pattern N = Letter 15 :: Letter DEG instance Bounded (Letter DEG) where minBound = A maxBound = N instance Enum (Letter DEG) where succ N = error "succ/N:DEG" succ (Letter x) = Letter $ x+1 pred A = error "pred/A:DEG" pred (Letter x) = Letter $ x-1 toEnum k | k>=0 && k<=15 = Letter k toEnum k = error $ "toEnum/Letter DEG " ++ show k fromEnum (Letter k) = k charDEG = toUpper >>> \case 'A' -> A 'C' -> C 'G' -> G 'T' -> T 'U' -> U 'W' -> W 'S' -> S 'M' -> M 'K' -> K 'R' -> R 'Y' -> Y 'B' -> B 'D' -> D 'H' -> H 'V' -> V _ -> N {-# INLINE charDEG #-} degChar = \case A -> 'A' C -> 'C' G -> 'G' T -> 'T' U -> 'U' W -> 'W' S -> 'S' M -> 'M' K -> 'K' R -> 'R' Y -> 'Y' B -> 'B' D -> 'D' H -> 'H' V -> 'V' N -> 'N' {-# INLINE degChar #-} instance Show (Letter DEG) where show c = [degChar c] degSeq :: MkPrimary n DEG => n -> Primary DEG degSeq = primary instance MkPrimary (VU.Vector Char) DEG where primary = VU.map charDEG instance IsString [Letter DEG] where fromString = map charDEG -- * Conversions class Degenerate x where fromDegenerate :: Char -> [x] toDegenerate :: [x] -> Maybe Char instance Degenerate Char where fromDegenerate = maybe [] id . flip lookup iupacXDNAchars toDegenerate = flip lookup (map swap iupacXDNAchars) . nub . sort instance Degenerate (Letter RNA) where fromDegenerate 'T' = [] fromDegenerate x = map dnaTrna $ fromDegenerate x toDegenerate xs | xs == [R.U] = Just 'U' | otherwise = toDegenerate $ map rnaTdna xs instance Degenerate (Letter DNA) where fromDegenerate 'U' = [] fromDegenerate x = map charDNA $ fromDegenerate x toDegenerate = toDegenerate . map dnaChar instance Degenerate (Letter XNA) where fromDegenerate = map charXNA . fromDegenerate toDegenerate = toDegenerate . map xnaChar -- * Raw embeddings -- | list of characters, using the XNA alphabet, but degenerate chars -- assume DNA characters. iupacXDNAchars :: [(Char,String)] iupacXDNAchars = map (go . words) . lines . unpack $ iupacNucleotides where go [[c],cs] = (c,cs) {-# NOINLINE iupacXDNAchars #-} -- | Raw iupac data, embedded into the library. iupacNucleotides :: ByteString iupacNucleotides = $(embedFile "sources/iupac-nucleotides")