{-# LANGUAGE DeriveGeneric #-}

-- |
-- Module      :  ELynx.Alphabet.Alphabet
-- Description :  Alphabets store hereditary information
-- Copyright   :  (c) Dominik Schrempf 2021
-- License     :  GPL-3.0-or-later
--
-- Maintainer  :  dominik.schrempf@gmail.com
-- Stability   :  unstable
--
-- Portability :  portable
--
-- Creation date: Fri May 10 11:10:32 2019.
--
-- Hierarchy:
--
-- 1. 'Character' type.
--
-- 2. Sets of 'Character's form 'Alphabet's; each 'Alphabet' has a specification
-- 'AlphabetSpec'.
--
-- New alphabets have to be added manually to this module.
--
-- This way of handling characters and alphabets IS NOT TYPE SAFE, but much, much
-- faster. A second layer of modules such as 'ELynx.Character.Nucleotide'
-- depend on a 'ELynx.Character.Character.Character' type class. Hence, they
-- provide a type safe way of handling alphabets. Conversion is possible, for
-- instance, with 'ELynx.Alphabet.Character.fromCVec', and
-- 'ELynx.Alphabet.Character.toCVec'.
module ELynx.Alphabet.Alphabet
  ( Alphabet (..),
    AlphabetSpec (..),
    alphabetSpec,
    alphabetDescription,
    isStd,
    isGap,
    isUnknown,
    isIUPAC,
    isMember,
  )
where

import Data.Aeson
  ( FromJSON,
    ToJSON,
  )
import qualified Data.Set as S
import ELynx.Alphabet.Character
import GHC.Generics (Generic)
import Prelude hiding (all)

-- | Available alphabets; for details see 'alphabetSpec'.
data Alphabet
  = DNA
  | DNAX
  | DNAI
  | Protein
  | ProteinX
  | ProteinS
  | ProteinI
  deriving (Int -> Alphabet -> ShowS
[Alphabet] -> ShowS
Alphabet -> String
(Int -> Alphabet -> ShowS)
-> (Alphabet -> String) -> ([Alphabet] -> ShowS) -> Show Alphabet
forall a.
(Int -> a -> ShowS) -> (a -> String) -> ([a] -> ShowS) -> Show a
showList :: [Alphabet] -> ShowS
$cshowList :: [Alphabet] -> ShowS
show :: Alphabet -> String
$cshow :: Alphabet -> String
showsPrec :: Int -> Alphabet -> ShowS
$cshowsPrec :: Int -> Alphabet -> ShowS
Show, ReadPrec [Alphabet]
ReadPrec Alphabet
Int -> ReadS Alphabet
ReadS [Alphabet]
(Int -> ReadS Alphabet)
-> ReadS [Alphabet]
-> ReadPrec Alphabet
-> ReadPrec [Alphabet]
-> Read Alphabet
forall a.
(Int -> ReadS a)
-> ReadS [a] -> ReadPrec a -> ReadPrec [a] -> Read a
readListPrec :: ReadPrec [Alphabet]
$creadListPrec :: ReadPrec [Alphabet]
readPrec :: ReadPrec Alphabet
$creadPrec :: ReadPrec Alphabet
readList :: ReadS [Alphabet]
$creadList :: ReadS [Alphabet]
readsPrec :: Int -> ReadS Alphabet
$creadsPrec :: Int -> ReadS Alphabet
Read, Alphabet -> Alphabet -> Bool
(Alphabet -> Alphabet -> Bool)
-> (Alphabet -> Alphabet -> Bool) -> Eq Alphabet
forall a. (a -> a -> Bool) -> (a -> a -> Bool) -> Eq a
/= :: Alphabet -> Alphabet -> Bool
$c/= :: Alphabet -> Alphabet -> Bool
== :: Alphabet -> Alphabet -> Bool
$c== :: Alphabet -> Alphabet -> Bool
Eq, Eq Alphabet
Eq Alphabet
-> (Alphabet -> Alphabet -> Ordering)
-> (Alphabet -> Alphabet -> Bool)
-> (Alphabet -> Alphabet -> Bool)
-> (Alphabet -> Alphabet -> Bool)
-> (Alphabet -> Alphabet -> Bool)
-> (Alphabet -> Alphabet -> Alphabet)
-> (Alphabet -> Alphabet -> Alphabet)
-> Ord Alphabet
Alphabet -> Alphabet -> Bool
Alphabet -> Alphabet -> Ordering
Alphabet -> Alphabet -> Alphabet
forall a.
Eq a
-> (a -> a -> Ordering)
-> (a -> a -> Bool)
-> (a -> a -> Bool)
-> (a -> a -> Bool)
-> (a -> a -> Bool)
-> (a -> a -> a)
-> (a -> a -> a)
-> Ord a
min :: Alphabet -> Alphabet -> Alphabet
$cmin :: Alphabet -> Alphabet -> Alphabet
max :: Alphabet -> Alphabet -> Alphabet
$cmax :: Alphabet -> Alphabet -> Alphabet
>= :: Alphabet -> Alphabet -> Bool
$c>= :: Alphabet -> Alphabet -> Bool
> :: Alphabet -> Alphabet -> Bool
$c> :: Alphabet -> Alphabet -> Bool
<= :: Alphabet -> Alphabet -> Bool
$c<= :: Alphabet -> Alphabet -> Bool
< :: Alphabet -> Alphabet -> Bool
$c< :: Alphabet -> Alphabet -> Bool
compare :: Alphabet -> Alphabet -> Ordering
$ccompare :: Alphabet -> Alphabet -> Ordering
$cp1Ord :: Eq Alphabet
Ord, Int -> Alphabet
Alphabet -> Int
Alphabet -> [Alphabet]
Alphabet -> Alphabet
Alphabet -> Alphabet -> [Alphabet]
Alphabet -> Alphabet -> Alphabet -> [Alphabet]
(Alphabet -> Alphabet)
-> (Alphabet -> Alphabet)
-> (Int -> Alphabet)
-> (Alphabet -> Int)
-> (Alphabet -> [Alphabet])
-> (Alphabet -> Alphabet -> [Alphabet])
-> (Alphabet -> Alphabet -> [Alphabet])
-> (Alphabet -> Alphabet -> Alphabet -> [Alphabet])
-> Enum Alphabet
forall a.
(a -> a)
-> (a -> a)
-> (Int -> a)
-> (a -> Int)
-> (a -> [a])
-> (a -> a -> [a])
-> (a -> a -> [a])
-> (a -> a -> a -> [a])
-> Enum a
enumFromThenTo :: Alphabet -> Alphabet -> Alphabet -> [Alphabet]
$cenumFromThenTo :: Alphabet -> Alphabet -> Alphabet -> [Alphabet]
enumFromTo :: Alphabet -> Alphabet -> [Alphabet]
$cenumFromTo :: Alphabet -> Alphabet -> [Alphabet]
enumFromThen :: Alphabet -> Alphabet -> [Alphabet]
$cenumFromThen :: Alphabet -> Alphabet -> [Alphabet]
enumFrom :: Alphabet -> [Alphabet]
$cenumFrom :: Alphabet -> [Alphabet]
fromEnum :: Alphabet -> Int
$cfromEnum :: Alphabet -> Int
toEnum :: Int -> Alphabet
$ctoEnum :: Int -> Alphabet
pred :: Alphabet -> Alphabet
$cpred :: Alphabet -> Alphabet
succ :: Alphabet -> Alphabet
$csucc :: Alphabet -> Alphabet
Enum, Alphabet
Alphabet -> Alphabet -> Bounded Alphabet
forall a. a -> a -> Bounded a
maxBound :: Alphabet
$cmaxBound :: Alphabet
minBound :: Alphabet
$cminBound :: Alphabet
Bounded, (forall x. Alphabet -> Rep Alphabet x)
-> (forall x. Rep Alphabet x -> Alphabet) -> Generic Alphabet
forall x. Rep Alphabet x -> Alphabet
forall x. Alphabet -> Rep Alphabet x
forall a.
(forall x. a -> Rep a x) -> (forall x. Rep a x -> a) -> Generic a
$cto :: forall x. Rep Alphabet x -> Alphabet
$cfrom :: forall x. Alphabet -> Rep Alphabet x
Generic)

instance FromJSON Alphabet

instance ToJSON Alphabet

-- | Verbose alphabet name.
alphabetDescription :: Alphabet -> String
alphabetDescription :: Alphabet -> String
alphabetDescription Alphabet
DNA = String
"DNA (nucleotides)"
alphabetDescription Alphabet
DNAX = String
"DNAX (nucleotides; including gaps)"
alphabetDescription Alphabet
DNAI = String
"DNAI (nucleotides; including gaps, and IUPAC codes)"
alphabetDescription Alphabet
Protein = String
"Protein (amino acids)"
alphabetDescription Alphabet
ProteinX = String
"ProteinX (amino acids; including gaps)"
alphabetDescription Alphabet
ProteinS = String
"ProteinS (amino acids; including gaps, and translation stops)"
alphabetDescription Alphabet
ProteinI = String
"ProteinI (amino acids; including gaps, translation stops, and IUPAC codes)"

-- | Alphabet specification. 'S.Set' is used because it provides fast lookups.
data AlphabetSpec = AlphabetSpec
  { -- | Standard characters.
    AlphabetSpec -> Set Character
std :: !(S.Set Character),
    -- | Gap characters.
    AlphabetSpec -> Set Character
gap :: !(S.Set Character),
    -- | Unknown characters.
    AlphabetSpec -> Set Character
unknown :: !(S.Set Character),
    -- | Other IUPAC codes.
    AlphabetSpec -> Set Character
iupac :: !(S.Set Character),
    -- | All characters in the alphabet.
    AlphabetSpec -> Set Character
all :: !(S.Set Character),
    -- | Convert from IUPAC to the corresponding standard characters.
    AlphabetSpec -> Character -> [Character]
toStd :: Character -> [Character]
  }

-- Create alphabet spec.
fromChars ::
  String -> String -> String -> String -> (Char -> String) -> AlphabetSpec
fromChars :: String
-> String -> String -> String -> (Char -> String) -> AlphabetSpec
fromChars String
st String
ga String
un String
iu Char -> String
to =
  Set Character
-> Set Character
-> Set Character
-> Set Character
-> Set Character
-> (Character -> [Character])
-> AlphabetSpec
AlphabetSpec
    Set Character
st'
    Set Character
ga'
    Set Character
un'
    Set Character
iu'
    Set Character
al
    (String -> [Character]
fromString (String -> [Character])
-> (Character -> String) -> Character -> [Character]
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Char -> String
to (Char -> String) -> (Character -> Char) -> Character -> String
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Character -> Char
toChar)
  where
    st' :: Set Character
st' = [Character] -> Set Character
forall a. Ord a => [a] -> Set a
S.fromList ([Character] -> Set Character) -> [Character] -> Set Character
forall a b. (a -> b) -> a -> b
$ String -> [Character]
fromString String
st
    ga' :: Set Character
ga' = [Character] -> Set Character
forall a. Ord a => [a] -> Set a
S.fromList ([Character] -> Set Character) -> [Character] -> Set Character
forall a b. (a -> b) -> a -> b
$ String -> [Character]
fromString String
ga
    un' :: Set Character
un' = [Character] -> Set Character
forall a. Ord a => [a] -> Set a
S.fromList ([Character] -> Set Character) -> [Character] -> Set Character
forall a b. (a -> b) -> a -> b
$ String -> [Character]
fromString String
un
    iu' :: Set Character
iu' = [Character] -> Set Character
forall a. Ord a => [a] -> Set a
S.fromList ([Character] -> Set Character) -> [Character] -> Set Character
forall a b. (a -> b) -> a -> b
$ String -> [Character]
fromString String
iu
    al :: Set Character
al = [Set Character] -> Set Character
forall (f :: * -> *) a. (Foldable f, Ord a) => f (Set a) -> Set a
S.unions [Set Character
st', Set Character
ga', Set Character
un', Set Character
iu']

-- | Get the alphabet specification for a given alphabet.
alphabetSpec :: Alphabet -> AlphabetSpec
alphabetSpec :: Alphabet -> AlphabetSpec
alphabetSpec Alphabet
DNA = AlphabetSpec
dna
alphabetSpec Alphabet
DNAX = AlphabetSpec
dnaX
alphabetSpec Alphabet
DNAI = AlphabetSpec
dnaI
alphabetSpec Alphabet
Protein = AlphabetSpec
protein
alphabetSpec Alphabet
ProteinX = AlphabetSpec
proteinX
alphabetSpec Alphabet
ProteinS = AlphabetSpec
proteinS
alphabetSpec Alphabet
ProteinI = AlphabetSpec
proteinI

isWith :: (AlphabetSpec -> S.Set Character) -> Alphabet -> Character -> Bool
isWith :: (AlphabetSpec -> Set Character) -> Alphabet -> Character -> Bool
isWith AlphabetSpec -> Set Character
set Alphabet
alph Character
char = Character
char Character -> Set Character -> Bool
forall a. Ord a => a -> Set a -> Bool
`S.member` AlphabetSpec -> Set Character
set (Alphabet -> AlphabetSpec
alphabetSpec Alphabet
alph)

-- | Test if standard character.
isStd :: Alphabet -> Character -> Bool
isStd :: Alphabet -> Character -> Bool
isStd = (AlphabetSpec -> Set Character) -> Alphabet -> Character -> Bool
isWith AlphabetSpec -> Set Character
std

-- | Test if gap.
isGap :: Alphabet -> Character -> Bool
isGap :: Alphabet -> Character -> Bool
isGap = (AlphabetSpec -> Set Character) -> Alphabet -> Character -> Bool
isWith AlphabetSpec -> Set Character
gap

-- | Test if unknown.
isUnknown :: Alphabet -> Character -> Bool
isUnknown :: Alphabet -> Character -> Bool
isUnknown = (AlphabetSpec -> Set Character) -> Alphabet -> Character -> Bool
isWith AlphabetSpec -> Set Character
unknown

-- | Test if extended IUPAC character (excluding gaps and unknowns).
isIUPAC :: Alphabet -> Character -> Bool
isIUPAC :: Alphabet -> Character -> Bool
isIUPAC = (AlphabetSpec -> Set Character) -> Alphabet -> Character -> Bool
isWith AlphabetSpec -> Set Character
iupac

-- | Test if member of alphabet.
isMember :: Alphabet -> Character -> Bool
isMember :: Alphabet -> Character -> Bool
isMember = (AlphabetSpec -> Set Character) -> Alphabet -> Character -> Bool
isWith AlphabetSpec -> Set Character
all

dna :: AlphabetSpec
dna :: AlphabetSpec
dna = String
-> String -> String -> String -> (Char -> String) -> AlphabetSpec
fromChars String
"ACGT" [] [] [] Char -> String
toStdDNA

toStdDNA :: Char -> String
toStdDNA :: Char -> String
toStdDNA Char
'A' = String
"A"
toStdDNA Char
'C' = String
"C"
toStdDNA Char
'G' = String
"G"
toStdDNA Char
'T' = String
"T"
toStdDNA Char
_ = ShowS
forall a. HasCallStack => String -> a
error String
"tostdDNA: Cannot convert to standard nucleotide."

dnaX :: AlphabetSpec
dnaX :: AlphabetSpec
dnaX = String
-> String -> String -> String -> (Char -> String) -> AlphabetSpec
fromChars String
"ACGT" String
"-." [] [] Char -> String
toStdDNAX

toStdDNAX :: Char -> String
toStdDNAX :: Char -> String
toStdDNAX Char
'A' = String
"A"
toStdDNAX Char
'C' = String
"C"
toStdDNAX Char
'G' = String
"G"
toStdDNAX Char
'T' = String
"T"
toStdDNAX Char
'-' = []
toStdDNAX Char
'.' = []
toStdDNAX Char
_ = ShowS
forall a. HasCallStack => String -> a
error String
"toStdDNAX: Cannot convert to standard nucleotide."

dnaI :: AlphabetSpec
dnaI :: AlphabetSpec
dnaI = String
-> String -> String -> String -> (Char -> String) -> AlphabetSpec
fromChars String
"ACGT" String
"-." String
"N?" String
"UWSMKRYBDHV" Char -> String
toStdDNAI

toStdDNAI :: Char -> String
toStdDNAI :: Char -> String
toStdDNAI Char
'A' = String
"A"
toStdDNAI Char
'C' = String
"C"
toStdDNAI Char
'G' = String
"G"
toStdDNAI Char
'T' = String
"T"
toStdDNAI Char
'U' = String
"T"
toStdDNAI Char
'W' = String
"AT"
toStdDNAI Char
'S' = String
"GC"
toStdDNAI Char
'M' = String
"AC"
toStdDNAI Char
'K' = String
"GT"
toStdDNAI Char
'R' = String
"AG"
toStdDNAI Char
'Y' = String
"CT"
toStdDNAI Char
'B' = String
"CGT"
toStdDNAI Char
'D' = String
"AGT"
toStdDNAI Char
'H' = String
"ACT"
toStdDNAI Char
'V' = String
"ACG"
toStdDNAI Char
'N' = String
"ACGT"
toStdDNAI Char
'?' = String
"ACGT"
toStdDNAI Char
'-' = []
toStdDNAI Char
'.' = []
toStdDNAI Char
_ = ShowS
forall a. HasCallStack => String -> a
error String
"toStdDNAI: Cannot convert to standard nucleotide."

protein :: AlphabetSpec
protein :: AlphabetSpec
protein = String
-> String -> String -> String -> (Char -> String) -> AlphabetSpec
fromChars String
"ACDEFGHIKLMNPQRSTVWY" [] [] [] Char -> String
toStdP

toStdP :: Char -> String
toStdP :: Char -> String
toStdP Char
'A' = String
"A"
toStdP Char
'C' = String
"C"
toStdP Char
'D' = String
"D"
toStdP Char
'E' = String
"E"
toStdP Char
'F' = String
"F"
toStdP Char
'G' = String
"G"
toStdP Char
'H' = String
"H"
toStdP Char
'I' = String
"I"
toStdP Char
'K' = String
"K"
toStdP Char
'L' = String
"L"
toStdP Char
'M' = String
"M"
toStdP Char
'N' = String
"N"
toStdP Char
'P' = String
"P"
toStdP Char
'Q' = String
"Q"
toStdP Char
'R' = String
"R"
toStdP Char
'S' = String
"S"
toStdP Char
'T' = String
"T"
toStdP Char
'V' = String
"V"
toStdP Char
'W' = String
"W"
toStdP Char
'Y' = String
"Y"
toStdP Char
_ = ShowS
forall a. HasCallStack => String -> a
error String
"toStdP: Cannot convert to standard amino acid."

proteinX :: AlphabetSpec
proteinX :: AlphabetSpec
proteinX = String
-> String -> String -> String -> (Char -> String) -> AlphabetSpec
fromChars String
"ACDEFGHIKLMNPQRSTVWY" String
"-." [] [] Char -> String
toStdPX

toStdPX :: Char -> String
toStdPX :: Char -> String
toStdPX Char
'A' = String
"A"
toStdPX Char
'C' = String
"C"
toStdPX Char
'D' = String
"D"
toStdPX Char
'E' = String
"E"
toStdPX Char
'F' = String
"F"
toStdPX Char
'G' = String
"G"
toStdPX Char
'H' = String
"H"
toStdPX Char
'I' = String
"I"
toStdPX Char
'K' = String
"K"
toStdPX Char
'L' = String
"L"
toStdPX Char
'M' = String
"M"
toStdPX Char
'N' = String
"N"
toStdPX Char
'P' = String
"P"
toStdPX Char
'Q' = String
"Q"
toStdPX Char
'R' = String
"R"
toStdPX Char
'S' = String
"S"
toStdPX Char
'T' = String
"T"
toStdPX Char
'V' = String
"V"
toStdPX Char
'W' = String
"W"
toStdPX Char
'Y' = String
"Y"
toStdPX Char
'-' = String
""
toStdPX Char
'.' = String
""
toStdPX Char
_ = ShowS
forall a. HasCallStack => String -> a
error String
"toStdPX: Cannot convert to standard amino acid."

proteinS :: AlphabetSpec
proteinS :: AlphabetSpec
proteinS = String
-> String -> String -> String -> (Char -> String) -> AlphabetSpec
fromChars String
"ACDEFGHIKLMNPQRSTVWY" String
"-." [] String
"*" Char -> String
toStdPS

toStdPS :: Char -> String
toStdPS :: Char -> String
toStdPS Char
'A' = String
"A"
toStdPS Char
'C' = String
"C"
toStdPS Char
'D' = String
"D"
toStdPS Char
'E' = String
"E"
toStdPS Char
'F' = String
"F"
toStdPS Char
'G' = String
"G"
toStdPS Char
'H' = String
"H"
toStdPS Char
'I' = String
"I"
toStdPS Char
'K' = String
"K"
toStdPS Char
'L' = String
"L"
toStdPS Char
'M' = String
"M"
toStdPS Char
'N' = String
"N"
toStdPS Char
'P' = String
"P"
toStdPS Char
'Q' = String
"Q"
toStdPS Char
'R' = String
"R"
toStdPS Char
'S' = String
"S"
toStdPS Char
'T' = String
"T"
toStdPS Char
'V' = String
"V"
toStdPS Char
'W' = String
"W"
toStdPS Char
'Y' = String
"Y"
toStdPS Char
'-' = String
""
toStdPS Char
'.' = String
""
toStdPS Char
'*' = String
""
toStdPS Char
_ = ShowS
forall a. HasCallStack => String -> a
error String
"toStdPS: Cannot convert to standard amino acid."

proteinI :: AlphabetSpec
proteinI :: AlphabetSpec
proteinI = String
-> String -> String -> String -> (Char -> String) -> AlphabetSpec
fromChars String
"ACDEFGHIKLMNPQRSTVWY" String
"-." String
"X?" String
"*JBZ" Char -> String
toStdPI

toStdPI :: Char -> String
toStdPI :: Char -> String
toStdPI Char
'A' = String
"A"
toStdPI Char
'C' = String
"C"
toStdPI Char
'D' = String
"D"
toStdPI Char
'E' = String
"E"
toStdPI Char
'F' = String
"F"
toStdPI Char
'G' = String
"G"
toStdPI Char
'H' = String
"H"
toStdPI Char
'I' = String
"I"
toStdPI Char
'K' = String
"K"
toStdPI Char
'L' = String
"L"
toStdPI Char
'M' = String
"M"
toStdPI Char
'N' = String
"N"
toStdPI Char
'P' = String
"P"
toStdPI Char
'Q' = String
"Q"
toStdPI Char
'R' = String
"R"
toStdPI Char
'S' = String
"S"
toStdPI Char
'T' = String
"T"
toStdPI Char
'V' = String
"V"
toStdPI Char
'W' = String
"W"
toStdPI Char
'Y' = String
"Y"
toStdPI Char
'-' = String
""
toStdPI Char
'.' = String
""
toStdPI Char
'*' = String
""
toStdPI Char
'J' = String
"LI"
toStdPI Char
'B' = String
"DN"
toStdPI Char
'Z' = String
"EQ"
toStdPI Char
'X' = String
"ACDEFGHIKLMNPQRSTVWY"
toStdPI Char
'?' = String
"ACDEFGHIKLMNPQRSTVWY"
toStdPI Char
_ = ShowS
forall a. HasCallStack => String -> a
error String
"toStdPI: Cannot convert to standard amino acid."