-- | Degenerate base symbol representation. We use the same conventions as in
-- <<https://en.wikipedia.org/wiki/Nucleic_acid_notation>> which ignores
-- @U@racil, except if it stands alone for @Char@ and @XNA@ targets. If the
-- 'Degenerate' target is @RNA@, then we create @U@s instead of @T@s.
--
-- TODO Shall we handle 'Complement' for degenerates?

module Biobase.Primary.IUPAC where

import           Control.Arrow ((***))
import           Data.ByteString.Char8 (ByteString,unpack)
import           Data.Char (toUpper)
import           Data.FileEmbed (makeRelativeToProject, embedFile)
import           Data.List (nub,sort)
import           Data.String
import           Data.Tuple (swap)
import qualified Data.Vector.Unboxed as VU
import           Control.Category ((>>>))

import           Biobase.Types.BioSequence

import           Biobase.Primary.Letter
import           Biobase.Primary.Nuc
import qualified Biobase.Primary.Nuc.RNA as R


-- | Allow the full, including degenerates, alphabet.

data DEG

pattern $bA :: Letter DEG n
$mA :: forall r k (n :: k).
Letter DEG n -> (Void# -> r) -> (Void# -> r) -> r
A = Letter  0 :: Letter DEG n
pattern $bC :: Letter DEG n
$mC :: forall r k (n :: k).
Letter DEG n -> (Void# -> r) -> (Void# -> r) -> r
C = Letter  1 :: Letter DEG n
pattern $bG :: Letter DEG n
$mG :: forall r k (n :: k).
Letter DEG n -> (Void# -> r) -> (Void# -> r) -> r
G = Letter  2 :: Letter DEG n
pattern $bT :: Letter DEG n
$mT :: forall r k (n :: k).
Letter DEG n -> (Void# -> r) -> (Void# -> r) -> r
T = Letter  3 :: Letter DEG n
pattern $bU :: Letter DEG n
$mU :: forall r k (n :: k).
Letter DEG n -> (Void# -> r) -> (Void# -> r) -> r
U = Letter  4 :: Letter DEG n
pattern $bW :: Letter DEG n
$mW :: forall r k (n :: k).
Letter DEG n -> (Void# -> r) -> (Void# -> r) -> r
W = Letter  5 :: Letter DEG n
pattern $bS :: Letter DEG n
$mS :: forall r k (n :: k).
Letter DEG n -> (Void# -> r) -> (Void# -> r) -> r
S = Letter  6 :: Letter DEG n
pattern $bM :: Letter DEG n
$mM :: forall r k (n :: k).
Letter DEG n -> (Void# -> r) -> (Void# -> r) -> r
M = Letter  7 :: Letter DEG n
pattern $bK :: Letter DEG n
$mK :: forall r k (n :: k).
Letter DEG n -> (Void# -> r) -> (Void# -> r) -> r
K = Letter  8 :: Letter DEG n
pattern $bR :: Letter DEG n
$mR :: forall r k (n :: k).
Letter DEG n -> (Void# -> r) -> (Void# -> r) -> r
R = Letter  9 :: Letter DEG n
pattern $bY :: Letter DEG n
$mY :: forall r k (n :: k).
Letter DEG n -> (Void# -> r) -> (Void# -> r) -> r
Y = Letter 10 :: Letter DEG n
pattern $bB :: Letter DEG n
$mB :: forall r k (n :: k).
Letter DEG n -> (Void# -> r) -> (Void# -> r) -> r
B = Letter 11 :: Letter DEG n
pattern $bD :: Letter DEG n
$mD :: forall r k (n :: k).
Letter DEG n -> (Void# -> r) -> (Void# -> r) -> r
D = Letter 12 :: Letter DEG n
pattern $bH :: Letter DEG n
$mH :: forall r k (n :: k).
Letter DEG n -> (Void# -> r) -> (Void# -> r) -> r
H = Letter 13 :: Letter DEG n
pattern $bV :: Letter DEG n
$mV :: forall r k (n :: k).
Letter DEG n -> (Void# -> r) -> (Void# -> r) -> r
V = Letter 14 :: Letter DEG n
pattern $bN :: Letter DEG n
$mN :: forall r k (n :: k).
Letter DEG n -> (Void# -> r) -> (Void# -> r) -> r
N = Letter 15 :: Letter DEG n

instance Bounded (Letter DEG n) where
    minBound :: Letter DEG n
minBound = Letter DEG n
forall k (n :: k). Letter DEG n
A
    maxBound :: Letter DEG n
maxBound = Letter DEG n
forall k (n :: k). Letter DEG n
N

instance Enum (Letter DEG n) where
    succ :: Letter DEG n -> Letter DEG n
succ Letter DEG n
N           = [Char] -> Letter DEG n
forall a. HasCallStack => [Char] -> a
error [Char]
"succ/N:DEG"
    succ (Letter Int
x)  = Int -> Letter DEG n
forall k seqTy (nameTy :: k). Int -> Letter seqTy nameTy
Letter (Int -> Letter DEG n) -> Int -> Letter DEG n
forall a b. (a -> b) -> a -> b
$ Int
xInt -> Int -> Int
forall a. Num a => a -> a -> a
+Int
1
    pred :: Letter DEG n -> Letter DEG n
pred Letter DEG n
A           = [Char] -> Letter DEG n
forall a. HasCallStack => [Char] -> a
error [Char]
"pred/A:DEG"
    pred (Letter Int
x)  = Int -> Letter DEG n
forall k seqTy (nameTy :: k). Int -> Letter seqTy nameTy
Letter (Int -> Letter DEG n) -> Int -> Letter DEG n
forall a b. (a -> b) -> a -> b
$ Int
xInt -> Int -> Int
forall a. Num a => a -> a -> a
-Int
1
    toEnum :: Int -> Letter DEG n
toEnum Int
k | Int
kInt -> Int -> Bool
forall a. Ord a => a -> a -> Bool
>=Int
0 Bool -> Bool -> Bool
&& Int
kInt -> Int -> Bool
forall a. Ord a => a -> a -> Bool
<=Int
15 = Int -> Letter DEG n
forall k seqTy (nameTy :: k). Int -> Letter seqTy nameTy
Letter Int
k
    toEnum Int
k                 = [Char] -> Letter DEG n
forall a. HasCallStack => [Char] -> a
error ([Char] -> Letter DEG n) -> [Char] -> Letter DEG n
forall a b. (a -> b) -> a -> b
$ [Char]
"toEnum/Letter DEG " [Char] -> [Char] -> [Char]
forall a. [a] -> [a] -> [a]
++ Int -> [Char]
forall a. Show a => a -> [Char]
show Int
k
    fromEnum :: Letter DEG n -> Int
fromEnum (Letter Int
k) = Int
k

charDEG :: Char -> Letter DEG n
charDEG = Char -> Char
toUpper (Char -> Char) -> (Char -> Letter DEG n) -> Char -> Letter DEG n
forall k (cat :: k -> k -> *) (a :: k) (b :: k) (c :: k).
Category cat =>
cat a b -> cat b c -> cat a c
>>> \case
  Char
'A' -> Letter DEG n
forall k (n :: k). Letter DEG n
A
  Char
'C' -> Letter DEG n
forall k (n :: k). Letter DEG n
C
  Char
'G' -> Letter DEG n
forall k (n :: k). Letter DEG n
G
  Char
'T' -> Letter DEG n
forall k (n :: k). Letter DEG n
T
  Char
'U' -> Letter DEG n
forall k (n :: k). Letter DEG n
U
  Char
'W' -> Letter DEG n
forall k (n :: k). Letter DEG n
W
  Char
'S' -> Letter DEG n
forall k (n :: k). Letter DEG n
S
  Char
'M' -> Letter DEG n
forall k (n :: k). Letter DEG n
M
  Char
'K' -> Letter DEG n
forall k (n :: k). Letter DEG n
K
  Char
'R' -> Letter DEG n
forall k (n :: k). Letter DEG n
R
  Char
'Y' -> Letter DEG n
forall k (n :: k). Letter DEG n
Y
  Char
'B' -> Letter DEG n
forall k (n :: k). Letter DEG n
B
  Char
'D' -> Letter DEG n
forall k (n :: k). Letter DEG n
D
  Char
'H' -> Letter DEG n
forall k (n :: k). Letter DEG n
H
  Char
'V' -> Letter DEG n
forall k (n :: k). Letter DEG n
V
  Char
_   -> Letter DEG n
forall k (n :: k). Letter DEG n
N
{-# INLINE charDEG #-}

degChar :: Letter DEG n -> Char
degChar = \case
  Letter DEG n
A -> Char
'A'
  Letter DEG n
C -> Char
'C'
  Letter DEG n
G -> Char
'G'
  Letter DEG n
T -> Char
'T'
  Letter DEG n
U -> Char
'U'
  Letter DEG n
W -> Char
'W'
  Letter DEG n
S -> Char
'S'
  Letter DEG n
M -> Char
'M'
  Letter DEG n
K -> Char
'K'
  Letter DEG n
R -> Char
'R'
  Letter DEG n
Y -> Char
'Y'
  Letter DEG n
B -> Char
'B'
  Letter DEG n
D -> Char
'D'
  Letter DEG n
H -> Char
'H'
  Letter DEG n
V -> Char
'V'
  Letter DEG n
N -> Char
'N'
{-# INLINE degChar #-}            

instance Show (Letter DEG n) where
    show :: Letter DEG n -> [Char]
show Letter DEG n
c = [Letter DEG n -> Char
forall k (n :: k). Letter DEG n -> Char
degChar Letter DEG n
c]

degSeq :: MkPrimary p DEG n => p -> Primary DEG n
degSeq :: p -> Primary DEG n
degSeq = p -> Primary DEG n
forall k c t (n :: k). MkPrimary c t n => c -> Primary t n
primary

instance MkPrimary (VU.Vector Char) DEG n where
    primary :: Vector Char -> Primary DEG n
primary = (Char -> Letter DEG n) -> Vector Char -> Primary DEG n
forall a b. (Unbox a, Unbox b) => (a -> b) -> Vector a -> Vector b
VU.map Char -> Letter DEG n
forall k (n :: k). Char -> Letter DEG n
charDEG

instance IsString [Letter DEG n] where
    fromString :: [Char] -> [Letter DEG n]
fromString = (Char -> Letter DEG n) -> [Char] -> [Letter DEG n]
forall a b. (a -> b) -> [a] -> [b]
map Char -> Letter DEG n
forall k (n :: k). Char -> Letter DEG n
charDEG



-- * Conversions

class Degenerate x where
  fromDegenerate :: Char -> [x]
  toDegenerate   :: [x]  -> Maybe Char

instance Degenerate Char where
  fromDegenerate :: Char -> [Char]
fromDegenerate = [Char] -> ([Char] -> [Char]) -> Maybe [Char] -> [Char]
forall b a. b -> (a -> b) -> Maybe a -> b
maybe [] [Char] -> [Char]
forall a. a -> a
id (Maybe [Char] -> [Char])
-> (Char -> Maybe [Char]) -> Char -> [Char]
forall b c a. (b -> c) -> (a -> b) -> a -> c
. (Char -> [(Char, [Char])] -> Maybe [Char])
-> [(Char, [Char])] -> Char -> Maybe [Char]
forall a b c. (a -> b -> c) -> b -> a -> c
flip Char -> [(Char, [Char])] -> Maybe [Char]
forall a b. Eq a => a -> [(a, b)] -> Maybe b
lookup [(Char, [Char])]
iupacXDNAchars
  toDegenerate :: [Char] -> Maybe Char
toDegenerate   = ([Char] -> [([Char], Char)] -> Maybe Char)
-> [([Char], Char)] -> [Char] -> Maybe Char
forall a b c. (a -> b -> c) -> b -> a -> c
flip [Char] -> [([Char], Char)] -> Maybe Char
forall a b. Eq a => a -> [(a, b)] -> Maybe b
lookup (((Char, [Char]) -> ([Char], Char))
-> [(Char, [Char])] -> [([Char], Char)]
forall a b. (a -> b) -> [a] -> [b]
map (Char, [Char]) -> ([Char], Char)
forall a b. (a, b) -> (b, a)
swap [(Char, [Char])]
iupacXDNAchars) ([Char] -> Maybe Char)
-> ([Char] -> [Char]) -> [Char] -> Maybe Char
forall b c a. (b -> c) -> (a -> b) -> a -> c
. [Char] -> [Char]
forall a. Eq a => [a] -> [a]
nub ([Char] -> [Char]) -> ([Char] -> [Char]) -> [Char] -> [Char]
forall b c a. (b -> c) -> (a -> b) -> a -> c
. [Char] -> [Char]
forall a. Ord a => [a] -> [a]
sort

instance Degenerate (Letter RNA n) where
    fromDegenerate :: Char -> [Letter RNA n]
fromDegenerate Char
'T' = []
    fromDegenerate Char
x   = (Letter DNA Any -> Letter RNA n)
-> [Letter DNA Any] -> [Letter RNA n]
forall a b. (a -> b) -> [a] -> [b]
map Letter DNA Any -> Letter RNA n
forall k1 k2 (n1 :: k1) (n2 :: k2). Letter DNA n1 -> Letter RNA n2
dnaTrna ([Letter DNA Any] -> [Letter RNA n])
-> [Letter DNA Any] -> [Letter RNA n]
forall a b. (a -> b) -> a -> b
$ Char -> [Letter DNA Any]
forall x. Degenerate x => Char -> [x]
fromDegenerate Char
x
    toDegenerate :: [Letter RNA n] -> Maybe Char
toDegenerate   [Letter RNA n]
xs  | [Letter RNA n]
xs [Letter RNA n] -> [Letter RNA n] -> Bool
forall a. Eq a => a -> a -> Bool
== [Letter RNA n
forall k (n :: k). Letter RNA n
R.U] = Char -> Maybe Char
forall a. a -> Maybe a
Just Char
'U'
                       | Bool
otherwise  = [Letter DNA Any] -> Maybe Char
forall x. Degenerate x => [x] -> Maybe Char
toDegenerate ([Letter DNA Any] -> Maybe Char) -> [Letter DNA Any] -> Maybe Char
forall a b. (a -> b) -> a -> b
$ (Letter RNA n -> Letter DNA Any)
-> [Letter RNA n] -> [Letter DNA Any]
forall a b. (a -> b) -> [a] -> [b]
map Letter RNA n -> Letter DNA Any
forall k1 k2 (n1 :: k1) (n2 :: k2). Letter RNA n1 -> Letter DNA n2
rnaTdna [Letter RNA n]
xs

instance Degenerate (Letter DNA n) where
    fromDegenerate :: Char -> [Letter DNA n]
fromDegenerate Char
'U' = []
    fromDegenerate Char
x   = (Char -> Letter DNA n) -> [Char] -> [Letter DNA n]
forall a b. (a -> b) -> [a] -> [b]
map Char -> Letter DNA n
forall k (n :: k). Char -> Letter DNA n
charDNA ([Char] -> [Letter DNA n]) -> [Char] -> [Letter DNA n]
forall a b. (a -> b) -> a -> b
$ Char -> [Char]
forall x. Degenerate x => Char -> [x]
fromDegenerate Char
x
    toDegenerate :: [Letter DNA n] -> Maybe Char
toDegenerate       = [Char] -> Maybe Char
forall x. Degenerate x => [x] -> Maybe Char
toDegenerate ([Char] -> Maybe Char)
-> ([Letter DNA n] -> [Char]) -> [Letter DNA n] -> Maybe Char
forall b c a. (b -> c) -> (a -> b) -> a -> c
. (Letter DNA n -> Char) -> [Letter DNA n] -> [Char]
forall a b. (a -> b) -> [a] -> [b]
map Letter DNA n -> Char
forall k (n :: k). Letter DNA n -> Char
dnaChar

instance Degenerate (Letter XNA n) where
    fromDegenerate :: Char -> [Letter XNA n]
fromDegenerate = (Char -> Letter XNA n) -> [Char] -> [Letter XNA n]
forall a b. (a -> b) -> [a] -> [b]
map Char -> Letter XNA n
forall k (n :: k). Char -> Letter XNA n
charXNA ([Char] -> [Letter XNA n])
-> (Char -> [Char]) -> Char -> [Letter XNA n]
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Char -> [Char]
forall x. Degenerate x => Char -> [x]
fromDegenerate
    toDegenerate :: [Letter XNA n] -> Maybe Char
toDegenerate   = [Char] -> Maybe Char
forall x. Degenerate x => [x] -> Maybe Char
toDegenerate ([Char] -> Maybe Char)
-> ([Letter XNA n] -> [Char]) -> [Letter XNA n] -> Maybe Char
forall b c a. (b -> c) -> (a -> b) -> a -> c
. (Letter XNA n -> Char) -> [Letter XNA n] -> [Char]
forall a b. (a -> b) -> [a] -> [b]
map Letter XNA n -> Char
forall k (n :: k). Letter XNA n -> Char
xnaChar



-- * Raw embeddings

-- | list of characters, using the XNA alphabet, but degenerate chars
-- assume DNA characters.

iupacXDNAchars :: [(Char,String)]
iupacXDNAchars :: [(Char, [Char])]
iupacXDNAchars = ([Char] -> (Char, [Char])) -> [[Char]] -> [(Char, [Char])]
forall a b. (a -> b) -> [a] -> [b]
map ([[Char]] -> (Char, [Char])
forall a. [[a]] -> (a, [a])
go ([[Char]] -> (Char, [Char]))
-> ([Char] -> [[Char]]) -> [Char] -> (Char, [Char])
forall b c a. (b -> c) -> (a -> b) -> a -> c
. [Char] -> [[Char]]
words) ([[Char]] -> [(Char, [Char])])
-> (ByteString -> [[Char]]) -> ByteString -> [(Char, [Char])]
forall b c a. (b -> c) -> (a -> b) -> a -> c
. [Char] -> [[Char]]
lines ([Char] -> [[Char]])
-> (ByteString -> [Char]) -> ByteString -> [[Char]]
forall b c a. (b -> c) -> (a -> b) -> a -> c
. ByteString -> [Char]
unpack (ByteString -> [(Char, [Char])]) -> ByteString -> [(Char, [Char])]
forall a b. (a -> b) -> a -> b
$ ByteString
iupacNucleotides where
  go :: [[a]] -> (a, [a])
go [[a
c],[a]
cs] = (a
c,[a]
cs)
{-# NOINLINE iupacXDNAchars #-}

-- | Raw iupac data, embedded into the library.

iupacNucleotides :: ByteString
iupacNucleotides :: ByteString
iupacNucleotides = $(makeRelativeToProject "sources/iupac-nucleotides" >>= embedFile)