{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE RecordWildCards #-}

-- | The Rfam.fasta.gz file provides useful information: (1) the conversion
-- between Rfam accession and Rfam identifier, (2) the species accession, (3)
-- the name of said species, and (4) the FASTA sequence data itself.

module Biobase.Infernal.RfamFasta where

import Bio.Core.Sequence
import Data.ByteString.Char8 as BS
import Data.Map as M
import qualified Data.ByteString.Lazy.Char8 as BSL
import Text.Printf

import Biobase.Infernal.Types



-- | Rfam FASTA entry: the sequence data of one entry together with the model
-- and species annotation from which its sequence label is built. All fields
-- are strict.

data RfamFasta = RfamFasta
  { modelAccession    :: !ModelAccession      -- ^ Rfam accession number RFxxxxx (only the numeric xxxxx part).
  , modelIdentifier   :: !ModelIdentification -- ^ Rfam identifier (like 5S_rRNA).
  , sequenceAccession :: !EmblAccession       -- ^ EMBL sequence accession identifier and position (a triple, shown as @accession/b-c@ in the label).
  , speciesAccession  :: !SpeciesAccession    -- ^ Rfam species accession.
  , speciesName       :: !SpeciesName         -- ^ Species name.
  , fastaData         :: !StrictSeqData       -- ^ FASTA sequence data, stored as a strict 'ByteString'.
  } deriving (Show)

-- | An 'RfamFasta' entry is an ordinary sequence entry with a structured
-- label, so it can be used through the generic 'BioSeq' interface. The label
-- is assembled as @RFxxxxx;identifier;accession/b-c   species:name@.

instance BioSeq RfamFasta where
  seqlabel RfamFasta{..} = SeqLabel (BSL.fromChunks [label])
    where
      -- The complete label, built as a single strict chunk.
      label   = BS.concat [model, ";", ident, ";", embl, "   ", species, ":", name]
      -- Accession number, zero-padded to five digits behind the @RF@ prefix.
      model   = BS.pack (printf "RF%05d" (unModelAccession modelAccession))
      ident   = unModelIdentification modelIdentifier
      -- Sequence accession followed by its two position components.
      embl    = BS.concat [acc, "/", BS.pack (show p1), "-", BS.pack (show p2)]
        where (acc, p1, p2) = unEmblAccession sequenceAccession
      species = BS.pack (show (unSpeciesAccession speciesAccession))
      name    = unSpeciesName speciesName
  -- The strict sequence data becomes a lazy sequence with exactly one chunk.
  seqdata RfamFasta{fastaData = fd} = SeqData (BSL.fromChunks [unStrictSeqData fd])
  -- Length in characters of the stored sequence data.
  seqlength RfamFasta{fastaData = fd} = Offset (fromIntegral (BS.length (unStrictSeqData fd)))



-- * Some in-memory lookup systems.

-- | Lookup table from a model accession to the corresponding model
-- identifier.

type ModelAC2ID = Map ModelAccession ModelIdentification

-- | Lookup table from a model identifier to the corresponding model
-- accession.

type ModelID2AC = Map ModelIdentification ModelAccession

-- | Two-level lookup table: the outer map is keyed by model accession, the
-- inner map by sequence accession, and the value is the 'RfamFasta' entry.
-- Looking up only the model accession yields all entries stored for that
-- accession.

type ACAC2RfamFasta = Map ModelAccession (Map EmblAccession RfamFasta)

-- | Two-level lookup table: the outer map is keyed by model identifier, the
-- inner map by sequence accession, and the value is the 'RfamFasta' entry.
-- Looking up only the model identifier yields all entries stored for that
-- identifier.

type IDAC2RfamFasta = Map ModelIdentification (Map EmblAccession RfamFasta)