{-# LANGUAGE MultiParamTypeClasses #-}
{-# LANGUAGE TemplateHaskell #-}
{-# LANGUAGE TypeFamilies #-}

-- |
-- Module      :  ELynx.Data.NucleotideI
-- Description :  Nucleotides with IUPAC characters
-- Copyright   :  (c) Dominik Schrempf 2021
-- License     :  GPL-3.0-or-later
--
-- Maintainer  :  dominik.schrempf@gmail.com
-- Stability   :  unstable
-- Portability :  portable
--
-- Creation date: Thu Oct  4 18:26:35 2018.
--
-- See header of 'ELynx.Data.Alphabet.Alphabet'.
--
-- Nucleotide IUPAC code. See also https://www.bioinformatics.org/sms/iupac.html or
-- https://en.wikipedia.org/wiki/International_Union_of_Pure_and_Applied_Chemistry.
--
-- Remarks:
--
-- - Question marks (@?@) are interpreted as unknowns (same as @N@). However, when
--   a sequence is printed/exported, @N@s will be used.
--
-- - Full stops (@.@) are interpreted as gaps (same as @-@). However, when a
--   sequence is printed/exported, @-@s will be used
--
-- @
-- Symbol  Description  Bases represented  Complement
-- ------  -----------  -----------------  ----------
-- A       Adenine      A                  T
-- C       Cytosine        C               G
-- G       Guanine            G            C
-- T       Thymine               T         A
-- ------  -----------  -----------------  ----------
-- U       Uracil                U         A
-- W       Weak         A        T         W
-- S       Strong          C  G            S
-- M       aMino        A  C               K
-- K       Keto               G  T         M
-- R       puRine       A     G            Y
-- Y       pYrimidine      C     T         R
-- B       not A           C  G  T         V
-- D       not C        A     G  T         H
-- H       not G        A  C     T         D
-- V       not T        A  C  G            B
-- ------  -----------  -----------------  ----------
-- N       any          A  C  G  T         N           (preferred)
-- ?       any          A  C  G  T         N
-- ------  -----------  -----------------  ----------
-- -       Gap (Zero)                      -           (preferred)
-- .       Gap (Zero)                      -
-- @
module ELynx.Data.Character.NucleotideI
  ( NucleotideI (..),
  )
where

import Data.ByteString.Internal (c2w, w2c)
import Data.Vector.Unboxed.Deriving
import Data.Word8
import qualified ELynx.Data.Character.Character as C

-- | NucleotideIs.
data NucleotideI
  = A
  | C
  | G
  | T
  | U
  | W
  | S
  | M
  | K
  | R
  | Y
  | B
  | D
  | H
  | V
  | N
  | Gap
  deriving (Int -> NucleotideI -> ShowS
[NucleotideI] -> ShowS
NucleotideI -> String
(Int -> NucleotideI -> ShowS)
-> (NucleotideI -> String)
-> ([NucleotideI] -> ShowS)
-> Show NucleotideI
forall a.
(Int -> a -> ShowS) -> (a -> String) -> ([a] -> ShowS) -> Show a
showList :: [NucleotideI] -> ShowS
$cshowList :: [NucleotideI] -> ShowS
show :: NucleotideI -> String
$cshow :: NucleotideI -> String
showsPrec :: Int -> NucleotideI -> ShowS
$cshowsPrec :: Int -> NucleotideI -> ShowS
Show, ReadPrec [NucleotideI]
ReadPrec NucleotideI
Int -> ReadS NucleotideI
ReadS [NucleotideI]
(Int -> ReadS NucleotideI)
-> ReadS [NucleotideI]
-> ReadPrec NucleotideI
-> ReadPrec [NucleotideI]
-> Read NucleotideI
forall a.
(Int -> ReadS a)
-> ReadS [a] -> ReadPrec a -> ReadPrec [a] -> Read a
readListPrec :: ReadPrec [NucleotideI]
$creadListPrec :: ReadPrec [NucleotideI]
readPrec :: ReadPrec NucleotideI
$creadPrec :: ReadPrec NucleotideI
readList :: ReadS [NucleotideI]
$creadList :: ReadS [NucleotideI]
readsPrec :: Int -> ReadS NucleotideI
$creadsPrec :: Int -> ReadS NucleotideI
Read, NucleotideI -> NucleotideI -> Bool
(NucleotideI -> NucleotideI -> Bool)
-> (NucleotideI -> NucleotideI -> Bool) -> Eq NucleotideI
forall a. (a -> a -> Bool) -> (a -> a -> Bool) -> Eq a
/= :: NucleotideI -> NucleotideI -> Bool
$c/= :: NucleotideI -> NucleotideI -> Bool
== :: NucleotideI -> NucleotideI -> Bool
$c== :: NucleotideI -> NucleotideI -> Bool
Eq, Eq NucleotideI
Eq NucleotideI
-> (NucleotideI -> NucleotideI -> Ordering)
-> (NucleotideI -> NucleotideI -> Bool)
-> (NucleotideI -> NucleotideI -> Bool)
-> (NucleotideI -> NucleotideI -> Bool)
-> (NucleotideI -> NucleotideI -> Bool)
-> (NucleotideI -> NucleotideI -> NucleotideI)
-> (NucleotideI -> NucleotideI -> NucleotideI)
-> Ord NucleotideI
NucleotideI -> NucleotideI -> Bool
NucleotideI -> NucleotideI -> Ordering
NucleotideI -> NucleotideI -> NucleotideI
forall a.
Eq a
-> (a -> a -> Ordering)
-> (a -> a -> Bool)
-> (a -> a -> Bool)
-> (a -> a -> Bool)
-> (a -> a -> Bool)
-> (a -> a -> a)
-> (a -> a -> a)
-> Ord a
min :: NucleotideI -> NucleotideI -> NucleotideI
$cmin :: NucleotideI -> NucleotideI -> NucleotideI
max :: NucleotideI -> NucleotideI -> NucleotideI
$cmax :: NucleotideI -> NucleotideI -> NucleotideI
>= :: NucleotideI -> NucleotideI -> Bool
$c>= :: NucleotideI -> NucleotideI -> Bool
> :: NucleotideI -> NucleotideI -> Bool
$c> :: NucleotideI -> NucleotideI -> Bool
<= :: NucleotideI -> NucleotideI -> Bool
$c<= :: NucleotideI -> NucleotideI -> Bool
< :: NucleotideI -> NucleotideI -> Bool
$c< :: NucleotideI -> NucleotideI -> Bool
compare :: NucleotideI -> NucleotideI -> Ordering
$ccompare :: NucleotideI -> NucleotideI -> Ordering
$cp1Ord :: Eq NucleotideI
Ord, Int -> NucleotideI
NucleotideI -> Int
NucleotideI -> [NucleotideI]
NucleotideI -> NucleotideI
NucleotideI -> NucleotideI -> [NucleotideI]
NucleotideI -> NucleotideI -> NucleotideI -> [NucleotideI]
(NucleotideI -> NucleotideI)
-> (NucleotideI -> NucleotideI)
-> (Int -> NucleotideI)
-> (NucleotideI -> Int)
-> (NucleotideI -> [NucleotideI])
-> (NucleotideI -> NucleotideI -> [NucleotideI])
-> (NucleotideI -> NucleotideI -> [NucleotideI])
-> (NucleotideI -> NucleotideI -> NucleotideI -> [NucleotideI])
-> Enum NucleotideI
forall a.
(a -> a)
-> (a -> a)
-> (Int -> a)
-> (a -> Int)
-> (a -> [a])
-> (a -> a -> [a])
-> (a -> a -> [a])
-> (a -> a -> a -> [a])
-> Enum a
enumFromThenTo :: NucleotideI -> NucleotideI -> NucleotideI -> [NucleotideI]
$cenumFromThenTo :: NucleotideI -> NucleotideI -> NucleotideI -> [NucleotideI]
enumFromTo :: NucleotideI -> NucleotideI -> [NucleotideI]
$cenumFromTo :: NucleotideI -> NucleotideI -> [NucleotideI]
enumFromThen :: NucleotideI -> NucleotideI -> [NucleotideI]
$cenumFromThen :: NucleotideI -> NucleotideI -> [NucleotideI]
enumFrom :: NucleotideI -> [NucleotideI]
$cenumFrom :: NucleotideI -> [NucleotideI]
fromEnum :: NucleotideI -> Int
$cfromEnum :: NucleotideI -> Int
toEnum :: Int -> NucleotideI
$ctoEnum :: Int -> NucleotideI
pred :: NucleotideI -> NucleotideI
$cpred :: NucleotideI -> NucleotideI
succ :: NucleotideI -> NucleotideI
$csucc :: NucleotideI -> NucleotideI
Enum, NucleotideI
NucleotideI -> NucleotideI -> Bounded NucleotideI
forall a. a -> a -> Bounded a
maxBound :: NucleotideI
$cmaxBound :: NucleotideI
minBound :: NucleotideI
$cminBound :: NucleotideI
Bounded)

-- See https://stackoverflow.com/a/31527024; apparently, pattern matching (and
-- case statements) are fast because they are compiled to lookup tables. Hence,
-- they are faster than guards (because equality has to be checked), and faster
-- than lookups with sets.
toWord :: NucleotideI -> Word8
toWord :: NucleotideI -> Word8
toWord NucleotideI
A = Char -> Word8
c2w Char
'A'
toWord NucleotideI
C = Char -> Word8
c2w Char
'C'
toWord NucleotideI
G = Char -> Word8
c2w Char
'G'
toWord NucleotideI
T = Char -> Word8
c2w Char
'T'
toWord NucleotideI
U = Char -> Word8
c2w Char
'U'
toWord NucleotideI
W = Char -> Word8
c2w Char
'W'
toWord NucleotideI
S = Char -> Word8
c2w Char
'S'
toWord NucleotideI
M = Char -> Word8
c2w Char
'M'
toWord NucleotideI
K = Char -> Word8
c2w Char
'K'
toWord NucleotideI
R = Char -> Word8
c2w Char
'R'
toWord NucleotideI
Y = Char -> Word8
c2w Char
'Y'
toWord NucleotideI
B = Char -> Word8
c2w Char
'B'
toWord NucleotideI
D = Char -> Word8
c2w Char
'D'
toWord NucleotideI
H = Char -> Word8
c2w Char
'H'
toWord NucleotideI
V = Char -> Word8
c2w Char
'V'
toWord NucleotideI
N = Char -> Word8
c2w Char
'N'
toWord NucleotideI
Gap = Char -> Word8
c2w Char
'-'

fromWord :: Word8 -> NucleotideI
fromWord :: Word8 -> NucleotideI
fromWord Word8
w = case Word8 -> Char
w2c Word8
w of
  Char
'A' -> NucleotideI
A
  Char
'C' -> NucleotideI
C
  Char
'G' -> NucleotideI
G
  Char
'T' -> NucleotideI
T
  Char
'U' -> NucleotideI
U
  Char
'W' -> NucleotideI
W
  Char
'S' -> NucleotideI
S
  Char
'M' -> NucleotideI
M
  Char
'K' -> NucleotideI
K
  Char
'R' -> NucleotideI
R
  Char
'Y' -> NucleotideI
Y
  Char
'B' -> NucleotideI
B
  Char
'D' -> NucleotideI
D
  Char
'H' -> NucleotideI
H
  Char
'V' -> NucleotideI
V
  Char
'N' -> NucleotideI
N
  Char
'?' -> NucleotideI
N
  Char
'-' -> NucleotideI
Gap
  Char
'.' -> NucleotideI
Gap
  Char
_ -> String -> NucleotideI
forall a. HasCallStack => String -> a
error String
"fromWord: Cannot convert to NucleotideI."

derivingUnbox
  "NucleotideI"
  [t|NucleotideI -> Word8|]
  [|toWord|]
  [|fromWord|]

instance C.Character NucleotideI where
  toWord :: NucleotideI -> Word8
toWord = NucleotideI -> Word8
toWord
  fromWord :: Word8 -> NucleotideI
fromWord = Word8 -> NucleotideI
fromWord

toStandard :: NucleotideI -> [NucleotideI]
toStandard :: NucleotideI -> [NucleotideI]
toStandard NucleotideI
A = [NucleotideI
A]
toStandard NucleotideI
C = [NucleotideI
C]
toStandard NucleotideI
G = [NucleotideI
G]
toStandard NucleotideI
T = [NucleotideI
T]
toStandard NucleotideI
U = [NucleotideI
T]
toStandard NucleotideI
W = [NucleotideI
A, NucleotideI
T]
toStandard NucleotideI
S = [NucleotideI
G, NucleotideI
C]
toStandard NucleotideI
M = [NucleotideI
A, NucleotideI
C]
toStandard NucleotideI
K = [NucleotideI
G, NucleotideI
T]
toStandard NucleotideI
R = [NucleotideI
A, NucleotideI
G]
toStandard NucleotideI
Y = [NucleotideI
C, NucleotideI
T]
toStandard NucleotideI
B = [NucleotideI
C, NucleotideI
G, NucleotideI
T]
toStandard NucleotideI
D = [NucleotideI
A, NucleotideI
G, NucleotideI
T]
toStandard NucleotideI
H = [NucleotideI
A, NucleotideI
C, NucleotideI
T]
toStandard NucleotideI
V = [NucleotideI
A, NucleotideI
C, NucleotideI
G]
toStandard NucleotideI
N = [NucleotideI
A, NucleotideI
C, NucleotideI
G, NucleotideI
T]
toStandard NucleotideI
Gap = []

instance C.CharacterX NucleotideI where
  gap :: NucleotideI
gap = NucleotideI
Gap

instance C.CharacterI NucleotideI where
  unknown :: NucleotideI
unknown = NucleotideI
N
  iupac :: [NucleotideI]
iupac = [NucleotideI
U, NucleotideI
W, NucleotideI
S, NucleotideI
M, NucleotideI
K, NucleotideI
R, NucleotideI
Y, NucleotideI
B, NucleotideI
D, NucleotideI
H, NucleotideI
V, NucleotideI
N]
  toStandard :: NucleotideI -> [NucleotideI]
toStandard = NucleotideI -> [NucleotideI]
toStandard