Bio/Sequence.hs

{- |
   This is a meta-module importing and re-exporting sequence-related stuff.

   It encompasses the "Bio.Sequence.SeqData", "Bio.Sequence.Fasta", and "Bio.Sequence.TwoBit" modules.
-}

module Bio.Sequence 
    (
    -- * Data structures etc ("Bio.Sequence.SeqData")
      Sequence(..), Unknown, Offset, SeqData, Qual, QualData
    -- ** Accessor functions
    , seqlength, seqlabel, seqheader, seqdata, seqqual, (!)
    , appendHeader, setHeader

    -- ** Converting to and from String.
    , fromStr, toStr
    -- ** Nucleotide functionality.
    , compl, revcompl, revcompl', Nuc, castToNuc
    -- ** Protein sequence functionality
    , Amino(..), translate, fromIUPAC, toIUPAC, castToAmino
    -- ** Other utility functions
    , defragSeq, seqmap

    -- * File IO
    -- ** Generic sequence reading
    , readNuc, readProt

    -- ** The Fasta file format ("Bio.Sequence.Fasta")
    , readFasta, hReadFasta 
    , writeFasta, hWriteFasta
    -- ** Quality data 
    -- | Not part of the Fasta format, and treated separately.
    , readQual, writeQual, hWriteQual
    , readFastaQual
    , writeFastaQual, hWriteFastaQual

    -- ** The FastQ format ("Bio.Sequence.FastQ") 
    -- Combines sequence data and quality in one file.
    -- Warning: Solexa uses a different formula for the quality values!
    , readFastQ, writeFastQ, hReadFastQ, hWriteFastQ
    , readSangerQ, writeSangerQ, hReadSangerQ, hWriteSangerQ
    , readIllumina, writeIllumina, hReadIllumina, hWriteIllumina

    -- ** The phd file format ("Bio.Sequence.Phd")
    -- | These contain base (nucleotide) calling information,
    --   and are generated by @phred@.
    , readPhd, hReadPhd

    -- ** TwoBit file format support ("Bio.Seqeunce.TwoBit")
    -- | Used by @BLAT@ and related tools.
    , decode2Bit, read2Bit, hRead2Bit 
    -- ,encode2Bit, write2Bit, hWrite2Bit

    -- * Hashing functionality ("Bio.Sequence.HashWord")
    -- | Packing words from sequences into integral data types 
    , HashF (..)
    , contigous, rcontig, rcpacked

    -- * Entropy calculations
    , KWords(..), entropy
    ) where

-- basic sequence data structures
import Bio.Sequence.SeqData

-- file formats
import Bio.Sequence.Fasta
import Bio.Sequence.FastQ
import Bio.Sequence.Phd
import Bio.Sequence.TwoBit
import Bio.Sequence.SFF

-- sequence-oriented stuff
import Bio.Sequence.Entropy
import Bio.Sequence.HashWord

import Control.Monad (filterM)
import System.Directory (doesFileExist)

-- | Read nucleotide sequences in any format - Fasta, SFF, FastQ, 2bit, PHD...
--   Todo: detect Illumina vs Sanger FastQ, transparent compression
readNuc :: FilePath -> IO [Sequence Nuc]
readNuc fp  
  | ext `elem` ["fasta", "fna", "fa", "fst"] = do 
       ps <- findQual fp 
       ss <- (case ps of [q] -> readFastaQual fp q
                         []  -> readFasta fp
                         qs  -> error ("Ambigous quality file for "++show fp++": "++show qs))
       return (map castSeq ss)
  | ext == "2bit"                     = read2Bit                               $ fp
  | ext == "sff"                      = fmap sffToSequence . readSFF           $ fp
  | ext `elem` ["fq","fastq"]         = readSangerQ                            $ fp
  | ext == "txt"                      = readIllumina                           $ fp
  | ext2 == "phd"                     = fmap return . readPhd                  $ fp -- only a single sequence
  -- "ace" ?
  | otherwise                         = error "readNuc: unknown file suffix!"
  where
    ext = reverse . takeWhile (/='.') . reverse $ fp
    ext2 = reverse . takeWhile (/='.') . dropWhile (=='.') . dropWhile (/='.') . reverse $ fp
    basename = reverse . dropWhile (=='.') . dropWhile (/= '.') . reverse
    findQual = filterM doesFileExist . qualnames
    qualnames f = [f++".qual",basename f++".qual"]

-- | Read protein sequences in any supported format (i.e. Fasta)
readProt :: FilePath -> IO [Sequence Amino]
readProt xs = map castSeq `fmap` readFasta xs