-- | Functions for processing of genbank data -- Extraction of feature sequences (header,sequencedata) or sequence data module Bio.GenbankTools ( extractSpecificFeatureSeqData, extractSpecificFeatureSequence, module Bio.GenbankData ) where import Bio.GenbankData import Data.Maybe import Bio.Core.Sequence import Data.Int import Bio.Sequence.Fasta import qualified Data.ByteString.Lazy.Char8 as L -- | Extract nucleotide sequence data for all features of specified type, Nothing as specific feature extracts all feature sequence seqdatas extractSpecificFeatureSeqData :: Maybe String -> Genbank -> [SeqData] extractSpecificFeatureSeqData specificFeature genbank = seqdatas where currentFeatures = getCurrentFeatures specificFeature genbank coordinates = map featureCoordinates currentFeatures fullSequence = origin genbank seqdatas = concatMap (extractSeqDataList fullSequence) coordinates -- | Extract header (locus identifier, locus tag) and nucleotide sequence data for all features of specified type, Nothing as specific feature extracts all feature sequences extractSpecificFeatureSequence :: Maybe String -> Genbank -> [Sequence] extractSpecificFeatureSequence specificFeature genbank = sequences where currentAccession = L.unpack (locus genbank) currentFeatures = getCurrentFeatures specificFeature genbank fields = [ x | x@(Field {}) <- concatMap attributes currentFeatures] locusTags = map fieldValue (filter (\field -> fieldType field == L.pack "locus_tag") fields) currentHeaders = map (\locus_tag -> L.pack (L.unpack locus_tag ++ " " ++ currentAccession)) locusTags coordinates = map featureCoordinates currentFeatures fullSequence = origin genbank seqdata = concatMap (extractSeqDataList fullSequence) coordinates sequences = map (\(header,seqdata) -> Seq (SeqLabel header) seqdata Nothing) $ zip currentHeaders seqdata --------------------------- --Auxiliary functions: -- | Extracts features dependent on provided Feature type, Nothing as specific feature extracts all features getCurrentFeatures :: Maybe String -> Genbank -> [Feature] getCurrentFeatures specificFeature genbank | isNothing specificFeature = features genbank | otherwise = filter (\x -> featureType x == L.pack (fromJust specificFeature))(features genbank) -- | Extract the seqdata for a coordinateSet from the ORIGIN section of genbank data extractSeqDataList :: SeqData -> CoordinateSet -> [SeqData] extractSeqDataList genbankSeq seqCoordinates | isNothing (setType seqCoordinates) = [extractSeqData genbankSeq (head (setCoordinates seqCoordinates))] | fromJust (setType seqCoordinates) == "join" = extractJoinSeqData genbankSeq seqCoordinates | fromJust (setType seqCoordinates) == "order" = extractOrderSeqData genbankSeq seqCoordinates | otherwise = [] -- | Extract sequence data for CoordinateSets of type "join" extractJoinSeqData :: SeqData -> CoordinateSet -> [SeqData] extractJoinSeqData genbankSeq seqCoordinates = joinSequence where coordinateList = setCoordinates seqCoordinates partialSequences = map (extractByteStringFromSeqData genbankSeq) coordinateList joinSequence = [SeqData (L.concat partialSequences)] -- | Extract sequence data for CoordinateSets of type "order" extractOrderSeqData :: SeqData -> CoordinateSet -> [SeqData] extractOrderSeqData fullSeq seqCoordinates = orderSequences where coordinateList = setCoordinates seqCoordinates orderSequences = map (extractSeqData fullSeq) coordinateList -- | Extracts sequence according to a Coordinates pair extractSeqData :: SeqData -> Coordinates -> SeqData extractSeqData fullSequence seqCoordinates | complement seqCoordinates = SeqData (revcomplement' subsequence) | otherwise = SeqData subsequence where subsequence = extractByteStringFromSeqData fullSequence seqCoordinates -- | Extract partial ByteString from ORIGIN section of genbank data according to provided coordinates extractByteStringFromSeqData :: SeqData -> Coordinates -> L.ByteString extractByteStringFromSeqData fullSequence seqCoordinates = substring where endTruncatedSequence = L.take (fromIntegral (coordinatesFrom seqCoordinates + coordinatesTo seqCoordinates):: Int64) (unSD fullSequence) substring = L.drop (fromIntegral (coordinatesFrom seqCoordinates) :: Int64) endTruncatedSequence --The following two functions are copied from Ketil Maldes hackage bio package. -- The same functionality has not been reincluded into the biocore package -- | Calculate the reverse complement for SeqData only. revcomplement' :: L.ByteString -> L.ByteString revcomplement' = L.map complement' . L.reverse -- | Complement a single character. I.e. identify the nucleotide it -- can hybridize with. Note that for multiple nucleotides, you usually -- want the reverse complement (see 'revcompl' for that). complement' :: Char -> Char complement' 'A' = 'T' complement' 'T' = 'A' complement' 'C' = 'G' complement' 'G' = 'C' complement' 'a' = 't' complement' 't' = 'a' complement' 'c' = 'g' complement' 'g' = 'c' complement' x = x