{-# LANGUAGE MultiParamTypeClasses #-} {-# LANGUAGE TemplateHaskell #-} {-# LANGUAGE TypeFamilies #-} -- | -- Module : ELynx.Data.NucleotideI -- Description : Nucleotides with IUPAC characters -- Copyright : (c) Dominik Schrempf 2021 -- License : GPL-3.0-or-later -- -- Maintainer : dominik.schrempf@gmail.com -- Stability : unstable -- Portability : portable -- -- Creation date: Thu Oct 4 18:26:35 2018. -- -- See header of 'ELynx.Data.Alphabet.Alphabet'. -- -- Nucleotide IUPAC code. See also https://www.bioinformatics.org/sms/iupac.html or -- https://en.wikipedia.org/wiki/International_Union_of_Pure_and_Applied_Chemistry. -- -- Remarks: -- -- - Question marks (@?@) are interpreted as unknowns (same as @N@). However, when -- a sequence is printed/exported, @N@s will be used. -- -- - Full stops (@.@) are interpreted as gaps (same as @-@). However, when a -- sequence is printed/exported, @-@s will be used -- -- @ -- Symbol Description Bases represented Complement -- ------ ----------- ----------------- ---------- -- A Adenine A T -- C Cytosine C G -- G Guanine G C -- T Thymine T A -- ------ ----------- ----------------- ---------- -- U Uracil U A -- W Weak A T W -- S Strong C G S -- M aMino A C K -- K Keto G T M -- R puRine A G Y -- Y pYrimidine C T R -- B not A C G T V -- D not C A G T H -- H not G A C T D -- V not T A C G B -- ------ ----------- ----------------- ---------- -- N any A C G T N (preferred) -- ? any A C G T N -- ------ ----------- ----------------- ---------- -- - Gap (Zero) - (preferred) -- . Gap (Zero) - -- @ module ELynx.Data.Character.NucleotideI ( NucleotideI (..), ) where import Data.ByteString.Internal (c2w, w2c) import Data.Vector.Unboxed.Deriving import Data.Word8 import qualified ELynx.Data.Character.Character as C -- | NucleotideIs. 
data NucleotideI
  = A -- ^ Adenine.
  | C -- ^ Cytosine.
  | G -- ^ Guanine.
  | T -- ^ Thymine.
  | U -- ^ Uracil.
  | W -- ^ Weak: A or T.
  | S -- ^ Strong: C or G.
  | M -- ^ aMino: A or C.
  | K -- ^ Keto: G or T.
  | R -- ^ puRine: A or G.
  | Y -- ^ pYrimidine: C or T.
  | B -- ^ Not A: C, G, or T.
  | D -- ^ Not C: A, G, or T.
  | H -- ^ Not G: A, C, or T.
  | V -- ^ Not T: A, C, or G.
  | N -- ^ Any of A, C, G, or T.
  | Gap -- ^ Gap; stands for no base.
  deriving (Show, Read, Eq, Ord, Enum, Bounded)

-- Performance note (see https://stackoverflow.com/a/31527024): the two
-- conversions below deliberately rely on pattern matching only. Apparently,
-- pattern matches and case expressions are compiled to lookup tables, which
-- makes them faster than guards (which need equality checks) and faster than
-- lookups in sets.

-- | Encode a character as the byte of its canonical letter. 'Gap' is always
-- written as @-@ and 'N' always as @N@.
toWord :: NucleotideI -> Word8
toWord nuc = c2w $ case nuc of
  A -> 'A'
  C -> 'C'
  G -> 'G'
  T -> 'T'
  U -> 'U'
  W -> 'W'
  S -> 'S'
  M -> 'M'
  K -> 'K'
  R -> 'R'
  Y -> 'Y'
  B -> 'B'
  D -> 'D'
  H -> 'H'
  V -> 'V'
  N -> 'N'
  Gap -> '-'

-- | Decode a byte into a character. Besides the canonical letters produced by
-- 'toWord', @?@ is read as 'N' and @.@ is read as 'Gap'.
--
-- This function is partial: any other byte results in a call to 'error'.
fromWord :: Word8 -> NucleotideI
fromWord = fromChar . w2c
  where
    fromChar 'A' = A
    fromChar 'C' = C
    fromChar 'G' = G
    fromChar 'T' = T
    fromChar 'U' = U
    fromChar 'W' = W
    fromChar 'S' = S
    fromChar 'M' = M
    fromChar 'K' = K
    fromChar 'R' = R
    fromChar 'Y' = Y
    fromChar 'B' = B
    fromChar 'D' = D
    fromChar 'H' = H
    fromChar 'V' = V
    fromChar 'N' = N
    -- Alternative spelling of the unknown character.
    fromChar '?' = N
    fromChar '-' = Gap
    -- Alternative spelling of the gap character.
    fromChar '.' = Gap
    fromChar _ = error "fromWord: Cannot convert to NucleotideI."

-- Unboxed vectors of 'NucleotideI' are represented by 'Word8' values, using
-- 'toWord' and 'fromWord' as the conversions. This is a top-level splice, so
-- both conversions have to be defined above it.
derivingUnbox
  "NucleotideI"
  [t|NucleotideI -> Word8|]
  [|toWord|]
  [|fromWord|]

instance C.Character NucleotideI where
  toWord = toWord
  fromWord = fromWord

-- | Expand a character into the list of standard nucleotides (@A@, @C@, @G@,
-- @T@) it represents. 'U' is mapped to 'T', and 'Gap' yields the empty list.
toStandard :: NucleotideI -> [NucleotideI]
toStandard nuc = case nuc of
  A -> [A]
  C -> [C]
  G -> [G]
  T -> [T]
  U -> [T]
  W -> [A, T]
  S -> [G, C]
  M -> [A, C]
  K -> [G, T]
  R -> [A, G]
  Y -> [C, T]
  B -> [C, G, T]
  D -> [A, G, T]
  H -> [A, C, T]
  V -> [A, C, G]
  N -> [A, C, G, T]
  Gap -> []

instance C.CharacterX NucleotideI where
  gap = Gap

instance C.CharacterI NucleotideI where
  unknown = N

  -- Every constructor except the four standard nucleotides and the gap.
  iupac = [U, W, S, M, K, R, Y, B, D, H, V, N]

  toStandard = toStandard