{-# LANGUAGE MultiParamTypeClasses #-}
{-# LANGUAGE TemplateHaskell #-}
{-# LANGUAGE TypeFamilies #-}

-- |
-- Module      :  ELynx.Data.Character.NucleotideX
-- Description :  Extended nucleotides including gaps
-- Copyright   :  (c) Dominik Schrempf 2021
--
-- License     :  GPL-3.0-or-later
--
-- Maintainer  :  dominik.schrempf@gmail.com
-- Stability   :  unstable
-- Portability :  portable
--
-- See header of 'ELynx.Data.Alphabet'.
--
-- Extended nucleotides with gaps. See also
-- https://www.bioinformatics.org/sms/iupac.html or
-- https://en.wikipedia.org/wiki/International_Union_of_Pure_and_Applied_Chemistry.
--
-- @
-- Symbol  Description  Bases represented  Complement
-- ------  -----------  -----------------  ----------
-- A       Adenine      A                  T
-- C       Cytosine        C               G
-- G       Guanine            G            C
-- T       Thymine               T         A
-- ------  -----------  -----------------  ----------
-- - or .  Gap (Zero)                      -
-- @
module ELynx.Data.Character.NucleotideX
  ( NucleotideX (..),
  )
where

import Data.ByteString.Internal (c2w, w2c)
import Data.Vector.Unboxed.Deriving
import Data.Word8
import qualified ELynx.Data.Character.Character as C

-- | Nucleotides extended by a gap character.
--
-- The order of the constructors fixes the derived 'Ord', 'Enum', and 'Bounded'
-- instances: 'A' is the smallest value and 'Gap' is the largest.
data NucleotideX
  = -- | Adenine.
    A
  | -- | Cytosine.
    C
  | -- | Guanine.
    G
  | -- | Thymine.
    T
  | -- | Gap, written as @-@ or @.@.
    Gap
  deriving (Eq, Ord, Enum, Bounded, Show, Read)

-- | Encode a nucleotide as the byte of its one-letter symbol; the gap is
-- encoded as @-@.
toWord :: NucleotideX -> Word8
toWord nuc = c2w $ case nuc of
  A -> 'A'
  C -> 'C'
  G -> 'G'
  T -> 'T'
  Gap -> '-'

-- | Decode a byte into a nucleotide.
--
-- Recognizes the upper-case symbols @A@, @C@, @G@, and @T@; both @-@ and @.@
-- decode to 'Gap'. Calls 'error' for every other byte.
fromWord :: Word8 -> NucleotideX
fromWord w = decode (w2c w)
  where
    decode 'A' = A
    decode 'C' = C
    decode 'G' = G
    decode 'T' = T
    decode sym
      | sym == '-' || sym == '.' = Gap
      | otherwise =
          error $ "fromWord: Cannot convert " ++ show sym ++ " to NucleotideX."

-- Generate the unboxed vector instances for 'NucleotideX': values are stored
-- as the 'Word8' produced by 'toWord' and read back with 'fromWord'.
derivingUnbox "NucleotideX" [t|NucleotideX -> Word8|] [|toWord|] [|fromWord|]

-- | Both class methods are implemented by the module-level functions of the
-- same name.
instance C.Character NucleotideX where
  -- The class module is imported qualified as @C@, so the unqualified names on
  -- the right-hand sides refer to the local 'toWord' and 'fromWord'.
  toWord = toWord
  fromWord = fromWord

-- | The gap character of 'NucleotideX' is the 'Gap' constructor.
instance C.CharacterX NucleotideX where
  gap = Gap