-- | Accession numbers. These /numbers/ are not really numbers because they
-- are made up of alphanumeric characters.

module Biobase.Types.Accession where

import Control.DeepSeq
import Data.Aeson
import Data.Binary
import Data.Char (isLetter)
import Data.Hashable (Hashable)
import Data.Ix (Ix)
import Data.Serialize
import Data.Serialize.Text
import Data.String
import Data.String.Conversions (ConvertibleStrings(..), cs)
import Data.String.Conversions.Monomorphic (toST, fromST)
import Data.Text.Binary
import Data.Text (Text, span, length)
import GHC.Generics (Generic)
import Prelude hiding (length,span)



-- * 'Accession' with phantom types.
--
-- <http://www.ncbi.nlm.nih.gov/Sequin/acc.html>
--
-- <http://www.uniprot.org/help/accession_numbers>
--
-- <http://en.wikipedia.org/wiki/Accession_number_%28bioinformatics%29>

-- | The accession number is a unique identifier in bioinformatics.
--
-- Depending on the source, accession numbers follow different alphanumeric
-- formats! While letters-then-numbers is quite common, swissprot uses
-- a mix. Hence, we just use a text string as accession.
--
-- A phantom type is provided to enable type safety annotations. Helper
-- functions provide smart construction from the @Accession@ tagged generic
-- type.
--
-- The tag @t@ does not occur in the payload: every @Accession t@ wraps
-- a plain 'Text', whatever @t@ is.

newtype Accession t = Accession
  { _getAccession :: Text
    -- ^ The raw accession string.
  }
  deriving (Eq,Ord,Read,Show,Generic)

-- | Generate an accession with an explicit phantom type: @accession'
-- Nucleotide "Bla"@ has type @:: Accession Nucleotide@.
--
-- The first argument only fixes the tag @t@; its value is never inspected.
-- The second argument is converted to strict 'Text' and wrapped as is.

accession' :: ConvertibleStrings s Text => t -> s -> Accession t
accession' _ = Accession . toST
{-# Inline accession' #-}

-- | Generate an accession when the type @Accession t@ is clear from the
-- context.
--
-- The input is converted to strict 'Text' and wrapped as is; no check of the
-- accession format is performed.

accession :: ConvertibleStrings s Text => s -> Accession t
accession = Accession . toST
{-# Inline accession #-}

-- | Retag an accession: the underlying text is kept unchanged, only the
-- phantom tag is replaced (@f@ becomes @t@). No validation is performed.

retagAccession :: Accession f -> Accession t
retagAccession = Accession . _getAccession
{-# Inline retagAccession #-}

-- | A 'String' (and hence, with @OverloadedStrings@, a string literal) is
-- turned into an accession via 'accession', for whatever tag @t@ the context
-- demands.
instance IsString (Accession t) where
  fromString = accession
  {-# Inline fromString #-}

-- Serialisation, JSON, hashing and deep-evaluation instances. All instance
-- bodies are empty, so every method comes from the class defaults
-- (presumably the 'Generic'-based ones, since 'Accession' derives 'Generic'
-- -- TODO confirm against the respective libraries).
instance Binary    (Accession t)
instance FromJSON  (Accession t)
instance Hashable  (Accession t)
instance Serialize (Accession t)
instance ToJSON    (Accession t)
instance NFData    (Accession t)



-- * Phantom types. All with an explicit data constructor to guide
-- 'accession''.

-- ** NCBI phantom types

-- | Tag for accessions of nucleotide sequences. The single nullary
-- constructor can be passed to 'accession'' to select this tag.

data Nucleotide = Nucleotide

-- | Tag for accessions of protein sequences. The single nullary constructor
-- can be passed to 'accession'' to select this tag.

data Protein = Protein

-- ** Rfam phantom types
--
-- The format is RFxxxxx, PFxxxxx, or CLxxxxx.

-- | Tag as being a clan.

data Clan = Clan

-- | Tag as being a Pfam model.

data Pfam = Pfam

-- | Tag as being an Rfam model. Used for Stockholm and CM files.

data Rfam = Rfam

-- | Tag for species accessions: species have an accession number, too.

data Species = Species



-- * Helper functions

-- | Guess the type of accession number. Returns @Nothing@ if unknown
-- structure.
--
-- The accession is split into its leading run of letters and the remainder.
-- Only the lengths of these two parts are inspected, following the NCBI
-- accession format table:
--
-- * 1 letter + 5, or 2 letters + 6 characters: @"Nucleotide"@
--
-- * 3 letters + 5 characters: @"Protein"@
--
-- * 4 letters + 8 to 10 characters: @"WGS"@
--
-- * 5 letters + 7 characters: @"MGA"@
--
-- The remainder is measured by length only; it is not checked to consist of
-- digits.

guessAccessionType :: Accession t -> Maybe Text
guessAccessionType (Accession a) = case (length letters, length rest) of
  (1,5)                     -> Just "Nucleotide"
  (2,6)                     -> Just "Nucleotide"
  (3,5)                     -> Just "Protein"
  -- WGS: 4 letters, then 2 digits of assembly version plus 6-8 digits.
  (4,k) | 8 <= k && k <= 10 -> Just "WGS"
  (5,7)                     -> Just "MGA"
  _                         -> Nothing
  where (letters,rest) = span isLetter a