{-# LANGUAGE MultiParamTypeClasses #-}
{-# LANGUAGE TemplateHaskell #-}
{-# LANGUAGE TypeFamilies #-}

{- |
Module      :  ELynx.Data.Character.NucleotideI
Description :  Nucleotides with IUPAC characters
Copyright   :  (c) Dominik Schrempf 2018
License     :  GPL-3.0-or-later

Maintainer  :  dominik.schrempf@gmail.com
Stability   :  unstable
Portability :  portable

Creation date: Thu Oct 4 18:26:35 2018.

See header of 'ELynx.Data.Alphabet.Alphabet'.

Nucleotide IUPAC code. See also https://www.bioinformatics.org/sms/iupac.html or
https://en.wikipedia.org/wiki/International_Union_of_Pure_and_Applied_Chemistry.

@
Symbol  Description  Bases represented  Complement
------  -----------  -----------------  ----------
A       Adenine      A                  T
C       Cytosine        C               G
G       Guanine            G            C
T       Thymine               T         A
------  -----------  -----------------  ----------
U       Uracil                U         A
W       Weak         A        T         W
S       Strong          C  G            S
M       aMino        A  C               K
K       Keto               G  T         M
R       puRine       A     G            Y
Y       pYrimidine      C     T         R
B       not A           C  G  T         V
D       not C        A     G  T         H
H       not G        A  C     T         D
V       not T        A  C  G            B
------  -----------  -----------------  ----------
N       any          A  C  G  T         N
------  -----------  -----------------  ----------
- or .  Gap (Zero)                      -
@
-}

module ELynx.Data.Character.NucleotideI
  ( NucleotideI(..)
  ) where

import Data.Vector.Unboxed.Deriving
import Data.Word8
import qualified ELynx.Data.Character.Character as C
import ELynx.Tools

-- | Nucleotides including the IUPAC codes and a gap character; see the table in
-- the module header for the meaning of each symbol.
--
-- The derived 'Ord', 'Enum' and 'Bounded' instances follow the order in which
-- the constructors are listed here, so the order must not be changed lightly.
data NucleotideI
  = A | C | G | T                              -- the four standard nucleotides
  | U | W | S | M | K | R | Y | B | D | H | V  -- uracil and the ambiguity codes
  | N                                          -- any nucleotide
  | Gap                                        -- gap, @-@ or @.@
  deriving (Show, Read, Eq, Ord, Enum, Bounded)

-- See https://stackoverflow.com/a/31527024; apparently, pattern matching (and
-- case statements) are fast because they are compiled to lookup tables. Hence,
-- they are faster than guards (because equality has to be checked), and faster
-- than lookups with sets.
-- | Encode a 'NucleotideI' as the 'Word8' of its one-letter code. The gap is
-- always written as @-@.
toWord :: NucleotideI -> Word8
toWord = c2w . nucToChar
  where
    -- One equation per constructor, so the lookup stays a plain pattern match.
    nucToChar A   = 'A'
    nucToChar C   = 'C'
    nucToChar G   = 'G'
    nucToChar T   = 'T'
    nucToChar U   = 'U'
    nucToChar W   = 'W'
    nucToChar S   = 'S'
    nucToChar M   = 'M'
    nucToChar K   = 'K'
    nucToChar R   = 'R'
    nucToChar Y   = 'Y'
    nucToChar B   = 'B'
    nucToChar D   = 'D'
    nucToChar H   = 'H'
    nucToChar V   = 'V'
    nucToChar N   = 'N'
    nucToChar Gap = '-'

-- | Decode a 'Word8' into a 'NucleotideI'. Only the upper case one-letter codes
-- are recognized; both @-@ and @.@ are read as 'Gap'.
--
-- This function is partial: any other input calls 'error'.
fromWord :: Word8 -> NucleotideI
fromWord = charToNuc . w2c
  where
    charToNuc 'A' = A
    charToNuc 'C' = C
    charToNuc 'G' = G
    charToNuc 'T' = T
    charToNuc 'U' = U
    charToNuc 'W' = W
    charToNuc 'S' = S
    charToNuc 'M' = M
    charToNuc 'K' = K
    charToNuc 'R' = R
    charToNuc 'Y' = Y
    charToNuc 'B' = B
    charToNuc 'D' = D
    charToNuc 'H' = H
    charToNuc 'V' = V
    charToNuc 'N' = N
    charToNuc '-' = Gap
    charToNuc '.' = Gap
    charToNuc _   = error "fromWord: Cannot convert to NucleotideI."

-- Template Haskell splice: have 'derivingUnbox' generate the unboxed vector
-- instances for 'NucleotideI', with 'Word8' as the representation type and
-- 'toWord' / 'fromWord' as the conversions. 'toWord' and 'fromWord' have to be
-- defined above this splice.
derivingUnbox "NucleotideI"
  [t| NucleotideI -> Word8 |]
  [| toWord |]
  [| fromWord |]

-- | Conversion to and from 'Word8' is delegated to the top-level 'toWord' and
-- 'fromWord' of this module.
instance C.Character NucleotideI where
  toWord   = toWord
  fromWord = fromWord

-- | Expand a 'NucleotideI' into the list of standard nucleotides (@A@, @C@,
-- @G@, @T@) it represents. 'U' is mapped to 'T', and 'Gap' represents no
-- nucleotide and yields the empty list.
toStandard :: NucleotideI -> [NucleotideI]
toStandard nuc = case nuc of
  A   -> [A]
  C   -> [C]
  G   -> [G]
  T   -> [T]
  U   -> [T]
  W   -> [A, T]
  S   -> [G, C]
  M   -> [A, C]
  K   -> [G, T]
  R   -> [A, G]
  Y   -> [C, T]
  B   -> [C, G, T]
  D   -> [A, G, T]
  H   -> [A, C, T]
  V   -> [A, C, G]
  N   -> [A, C, G, T]
  Gap -> []

-- | The gap character is 'Gap'.
instance C.CharacterX NucleotideI where
  gap = Gap

-- | 'N' is the unknown character; 'iupac' lists every non-standard code
-- except the gap.
instance C.CharacterI NucleotideI where
  unknown    = N
  iupac      = [U, W, S, M, K, R, Y, B, D, H, V, N]
  toStandard = toStandard