{-# LANGUAGE OverloadedStrings #-} {-# LANGUAGE Arrows #-} -- | Interface for the NCBI Entrez REST webservice. -- -- The entrezHTTP function provides a interface to the NCBI Entrez REST service. -- -- A series of different eutilites and databases are provided by the REST interface. -- Response depends on the combination of eutil and database, as well requested returntype. -- Specific combinations have wrapper functions with corresponding parsing functions included (see Usage example). -- -- If you use this libary in a tool, please read carefully and register your tool at eutilities@ncbi.nlm.nih.gov. You can append your registration info generated -- with the included buildRegistration function to your query. -- -- == Usage example: -- -- Retrieve a nucleotide sequence for Escherichia coli -- -- > nucleotideFasta <- fetchNucleotideString "NC_000913.3" 50 1000 "+" Nothing module Bio.EntrezHTTP (-- * Datatypes module Bio.EntrezHTTPData, -- * Retrieval function entrezHTTP, retrieveGeneSymbolFasta, fetchNucleotideString, retrieveElementsEntrez, portionListElements, -- * Parsing functions readEntrezTaxonSet, readEntrezSimpleTaxons, readEntrezParentIds, readEntrezSummaries, readEntrezSearch, -- * auxiliary functions buildRegistration, maybeBuildRegistration, setStrand, convertCoordinatesToStrand ) where import Network.HTTP.Conduit import qualified Data.ByteString.Lazy.Char8 as L8 import Text.XML.HXT.Core import Network import Data.Maybe import Bio.EntrezHTTPData import Bio.TaxonomyData --(Rank,SimpleTaxon) import qualified Data.ByteString.Char8 as B import Network.HTTP.Base -- | Send query and parse return XML startSession :: String -> String -> String -> IO String startSession program' database' query' = do requestXml <- withSocketsDo $ sendQuery program' database' query' let requestXMLString = L8.unpack requestXml return requestXMLString -- | Send query and return response XML sendQuery :: String -> String -> String -> IO L8.ByteString sendQuery program' database' query' = simpleHttp ("http://eutils.ncbi.nlm.nih.gov/entrez/eutils/"++ program' ++ ".fcgi?" ++ "db=" ++ database' ++ "&" ++ query') -- | Function for querying the NCBI entrez REST interface. Input EntrezHTTPQuery datatype is used to select database, program of interest and contains the query string. -- Please note that query strings containing whitespace or special characters need to be urlencoded. The response format and content depends on the query type, the output -- therefore provided as String. entrezHTTP :: EntrezHTTPQuery -> IO String entrezHTTP (EntrezHTTPQuery program' database' query') = do let defaultProgram = "summary" let defaultDatabase = "nucleotide" let selectedProgram = fromMaybe defaultProgram program' let selectedDatabase = fromMaybe defaultDatabase database' startSession selectedProgram selectedDatabase query' -- | Wrapper function for eutils that accept a list of querys (e.g. a list of gene ids) that ensures that only chunks of 20 queries are sent per request. Sending to long queries otherwise results in a serverside exception. retrieveElementsEntrez :: [a] -> ([a] -> IO b) -> IO [b] retrieveElementsEntrez listElements retrievalfunction = do let splits = portionListElements listElements 20 mapM retrievalfunction splits -- Auxiliary function for retrieveElementsEntrez portionListElements :: [a] -> Int -> [[a]] portionListElements listElements elementsPerSplit | not (null listElements) = filter (not . null) result | otherwise = [] where (heads,xs) = splitAt elementsPerSplit listElements result = heads:portionListElements xs elementsPerSplit --------------------------------------- -- Parsing functions -- | Read entrez fetch for taxonomy database into a simplyfied datatype -- Result of e.g: http://eutils.ncbi.nlm.nih. readEntrezTaxonSet :: String -> [Taxon] readEntrezTaxonSet = runLA (xreadDoc >>> parseEntrezTaxonSet) parseEntrezTaxonSet :: ArrowXml a => a XmlTree Taxon parseEntrezTaxonSet = atTag "TaxaSet" >>> getChildren >>> proc entrezTaxons -> do _taxons <- parseEntrezTaxon -< entrezTaxons returnA -< _taxons parseEntrezTaxon :: ArrowXml a => a XmlTree Taxon parseEntrezTaxon = (isElem >>> hasName "Taxon") >>> proc entrezTaxon -> do _taxonomyId <- getChildren >>> (isElem >>> hasName "TaxId") >>> getChildren >>> getText -< entrezTaxon _scientificName <- getChildren >>> (isElem >>> hasName "ScientificName") >>> getChildren >>> getText -< entrezTaxon _parentTaxonomyId <- getChildren >>> (isElem >>> hasName "ParentTaxId") >>> getChildren >>> getText -< entrezTaxon _rank <- getChildren >>> (isElem >>> hasName "Rank") >>> getChildren >>> getText -< entrezTaxon _divison <- getChildren >>> (isElem >>> hasName "Division") >>> getChildren >>> getText -< entrezTaxon _geneticCode <- parseTaxonGeneticCode -< entrezTaxon _mitoGeneticCode <- parseTaxonMitoGeneticCode -< entrezTaxon _lineage <- getChildren >>> atTag "Lineage" >>> getChildren >>> getText -< entrezTaxon _lineageEx <- parseTaxonLineageEx -< entrezTaxon _createDate <- getChildren >>> (isElem >>> hasName "CreateDate") >>> getChildren >>> getText -< entrezTaxon _updateDate <- getChildren >>> (isElem >>> hasName "UpdateDate") >>> getChildren >>> getText -< entrezTaxon _pubDate <- getChildren >>> (isElem >>> hasName "PubDate") >>> getChildren >>> getText -< entrezTaxon returnA -< Taxon { taxonTaxId = read _taxonomyId :: Int, taxonScientificName = _scientificName, taxonParentTaxId = read _parentTaxonomyId :: Int, taxonRank = read _rank :: Rank, division = _divison, geneticCode = _geneticCode, mitoGeneticCode = _mitoGeneticCode, lineage = _lineage, lineageEx = _lineageEx, createDate = _createDate, updateDate = _updateDate, pubDate = _pubDate } parseTaxonGeneticCode :: ArrowXml a => a XmlTree TaxGenCode parseTaxonGeneticCode = getChildren >>> atTag "GeneticCode" >>> proc geneticcode -> do _gcId <- atTag "GCId" >>> getChildren >>> getText -< geneticcode _gcName <- atTag "GCName" >>> getChildren >>> getText -< geneticcode returnA -< TaxGenCode { geneticCodeId = read _gcId :: Int, abbreviation = Nothing, geneCodeName = _gcName, cde = [], starts = [] } parseTaxonMitoGeneticCode :: ArrowXml a => a XmlTree TaxGenCode parseTaxonMitoGeneticCode = getChildren >>> atTag "MitoGeneticCode" >>> proc mitogeneticcode -> do _mgcId <- atTag "MGCId" >>> getChildren >>> getText -< mitogeneticcode _mgcName <- atTag "MGCName" >>> getChildren >>> getText -< mitogeneticcode returnA -< TaxGenCode { geneticCodeId = read _mgcId :: Int, abbreviation = Nothing, geneCodeName = _mgcName, cde = [], starts = [] } parseTaxonLineageEx :: ArrowXml a => a XmlTree [LineageTaxon] parseTaxonLineageEx = getChildren >>> atTag "LineageEx" >>> proc taxonLineageEx -> do _lineageEx <- listA parseLineageTaxon -< taxonLineageEx returnA -< _lineageEx parseLineageTaxon :: ArrowXml a => a XmlTree LineageTaxon parseLineageTaxon = getChildren >>> atTag "Taxon" >>> proc lineageTaxon -> do _lineageTaxId <- atTag "TaxId" >>> getChildren >>> getText -< lineageTaxon _lineageScienticName <- atTag "ScientificName" >>> getChildren >>> getText -< lineageTaxon _lineageRank <- atTag "Rank" >>> getChildren >>> getText -< lineageTaxon returnA -< LineageTaxon { lineageTaxId = read _lineageTaxId :: Int, lineageScienticName = _lineageScienticName, lineageRank = read _lineageRank :: Rank } -- | Read entrez fetch for taxonomy database into a simplyfied datatype -- Result of e.g: http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=taxonomy&id=1406860 readEntrezSimpleTaxons :: String -> [SimpleTaxon] readEntrezSimpleTaxons = runLA (xreadDoc >>> parseEntrezSimpleTaxons) parseEntrezSimpleTaxons :: ArrowXml a => a XmlTree SimpleTaxon parseEntrezSimpleTaxons = getChildren >>> atTag "Taxon" >>> proc entrezSimpleTaxon -> do simple_TaxId <- atTag "TaxId" >>> getChildren >>> getText -< entrezSimpleTaxon simple_ScientificName <- atTag "ScientificName" >>> getChildren >>> getText -< entrezSimpleTaxon simple_ParentTaxId <- atTag "ParentTaxId" >>> getChildren >>> getText -< entrezSimpleTaxon simple_Rank <- atTag "Rank" >>> getChildren >>> getText -< entrezSimpleTaxon returnA -< SimpleTaxon { simpleTaxId = read simple_TaxId :: Int, simpleScientificName = B.pack simple_ScientificName, simpleParentTaxId = read simple_ParentTaxId :: Int, simpleRank = read simple_Rank :: Rank } readEntrezParentIds :: String -> [Int] readEntrezParentIds = runLA (xreadDoc >>> parseEntrezParentTaxIds) parseEntrezParentTaxIds :: ArrowXml a => a XmlTree Int parseEntrezParentTaxIds = getChildren >>> atTag "Taxon" >>> proc entrezSimpleTaxon -> do simple_ParentTaxId <- atTag "ParentTaxId" >>> getChildren >>> getText -< entrezSimpleTaxon returnA -< read simple_ParentTaxId :: Int -- | Read entrez summary from internal haskell string readEntrezSummaries :: String -> [EntrezSummary] readEntrezSummaries = runLA (xreadDoc >>> parseEntrezSummaries) -- | Parse entrez summary result parseEntrezSummaries :: ArrowXml a => a XmlTree EntrezSummary parseEntrezSummaries = atTag "eSummaryResult" >>> proc entrezSummary -> do document_Summaries <- listA parseEntrezDocSums -< entrezSummary returnA -< EntrezSummary { documentSummaries = document_Summaries } -- | parseEntrezDocSums :: ArrowXml a => a XmlTree EntrezDocSum parseEntrezDocSums = atTag "DocSum" >>> proc entrezDocSum -> do summary_Id <- atTag "Id" >>> getChildren >>> getText -< entrezDocSum summary_Items <- listA parseSummaryItems -< entrezDocSum returnA -< EntrezDocSum { summaryId = summary_Id, summaryItems = summary_Items } -- | parseSummaryItems :: ArrowXml a => a XmlTree SummaryItem parseSummaryItems = atTag "Item" >>> proc summaryItem -> do item_Name <- getAttrValue "Name" -< summaryItem item_Type <- getAttrValue "Type" -< summaryItem item_Content <- getText <<< getChildren -< summaryItem returnA -< SummaryItem { itemName = item_Name, itemType = item_Type, itemContent = item_Content } -- | Read entrez summary from internal haskell string readEntrezSearch :: String -> [EntrezSearch] readEntrezSearch = runLA (xreadDoc >>> parseEntrezSearch) -- | Parse entrez search result parseEntrezSearch :: ArrowXml a => a XmlTree EntrezSearch parseEntrezSearch = atTag "eSearchResult" >>> proc entrezSearch -> do _count <- atTag "Count" >>> getChildren >>> getText -< entrezSearch _retMax <- atTag "RetMax" >>> getChildren >>> getText -< entrezSearch _retStart <- atTag "RetStart" >>> getChildren >>> getText -< entrezSearch _searchIds <- atTag "IdList" >>> listA parseSearchId -< entrezSearch _translationStack <- listA parseTranslationStack -< entrezSearch _queryTranslation <- atTag "QueryTranslation" >>> getChildren >>> getText -< entrezSearch returnA -< EntrezSearch { count = readInt _count, retMax = readInt _retMax, retStart = readInt _retStart, searchIds = _searchIds, translationStack = _translationStack, queryTranslation = _queryTranslation } -- | Parse entrez TranslationStack parseSearchId :: ArrowXml a => a XmlTree Int parseSearchId = atTag "Id" >>> proc entrezSearchId -> do searchId <- getChildren >>> getText -< entrezSearchId returnA -< (readInt searchId) -- | Parse entrez TranslationStack parseTranslationStack :: ArrowXml a => a XmlTree TranslationStack parseTranslationStack = atTag "TranslationStack" >>> proc entrezTranslationStack -> do _termSets <- listA parseTermSet -< entrezTranslationStack _operation <- atTag "OP" >>> getChildren >>> getText -< entrezTranslationStack returnA -< TranslationStack { termSets = _termSets, operation = _operation } -- | Parse entrez TermSet parseTermSet :: ArrowXml a => a XmlTree TermSet parseTermSet = atTag "TermSet" >>> proc entrezTermSet -> do _term <- atTag "Term" >>> getChildren >>> getText -< entrezTermSet _field <- atTag "Field" >>> getChildren >>> getText -< entrezTermSet _termCount <- atTag "Count" >>> getChildren >>> getText -< entrezTermSet _explode <- atTag "Explode" >>> getChildren >>> getText -< entrezTermSet returnA -< TermSet { term = _term, field = _field, termCount = readInt _termCount, explode = _explode } -- | Read entrez summary from internal haskell string readEntrezGeneSummaries :: String -> [EntrezGeneSummary] readEntrezGeneSummaries = runLA (xreadDoc >>> parseEntrezGeneSummaries) -- | Parse entrez summary result parseEntrezGeneSummaries :: ArrowXml a => a XmlTree EntrezGeneSummary parseEntrezGeneSummaries = atTag "eSummaryResult" >>> getChildren >>> atTag "DocumentSummarySet" >>> proc entrezSummary -> do _geneSummaries <- listA parseEntrezGeneDocSums -< entrezSummary returnA -< EntrezGeneSummary { geneSummaries = _geneSummaries } -- | parseEntrezGeneDocSums :: ArrowXml a => a XmlTree EntrezGeneDocSummary parseEntrezGeneDocSums = atTag "DocumentSummary" >>> proc entrezDocSum -> do _geneId <- atTag "Name" >>> getChildren >>> getText -< entrezDocSum _geneName <- atTag "Description" >>> getChildren >>> getText -< entrezDocSum _geneStatus <- atTag "Status" >>> getChildren >>> getText -< entrezDocSum _geneCurrentID <- atTag "CurrentID" >>> getChildren >>> getText -< entrezDocSum _geneGeneticSource <- atTag "GeneticSource" >>> getChildren >>> getText -< entrezDocSum _geneOtherAliases <- atTag "OtherAliases" >>> getChildren >>> getText -< entrezDocSum _geneGenomicInfo <- parseEntrezGenomicInfo -< entrezDocSum returnA -< EntrezGeneDocSummary { geneId = _geneId, geneName = _geneName, geneStatus = _geneStatus, geneCurrentID = _geneCurrentID, geneGeneticSource = _geneGeneticSource, geneOtherAliases = _geneOtherAliases, geneGenomicInfo = _geneGenomicInfo } parseEntrezGenomicInfo :: ArrowXml a => a XmlTree EntrezGenomicInfo parseEntrezGenomicInfo = atTag "GenomicInfo" >>> getChildren >>> atTag "GenomicInfoType" >>> proc entrezGenomicInfo -> do _chrAccVer <- atTag "ChrAccVer" >>> getChildren >>> getText -< entrezGenomicInfo _chrStart <- atTag "ChrStart" >>> getChildren >>> getText -< entrezGenomicInfo _chrStop <- atTag "ChrStop" >>> getChildren >>> getText -< entrezGenomicInfo _exonCount <- atTag "ExonCount" >>> getChildren >>> getText -< entrezGenomicInfo returnA -< EntrezGenomicInfo { chrAccVer = _chrAccVer, chrStart = readInt _chrStart, chrStop = readInt _chrStop, exonCount = readInt _exonCount } -- | gets all subtrees with the specified tag name atTag :: ArrowXml a => String -> a XmlTree XmlTree atTag tag = deep (isElem >>> hasName tag) --------------------------------------- -- Retrieval functions -- | Retrieve sequence for gene symbol (e.g. yhfA) from accession number (e.g. NC_000913.3) and if available entrez registration (toolname,devemail) retrieveGeneSymbolFasta :: String -> String -> Maybe (String,String) -> IO String retrieveGeneSymbolFasta genesymbol accession registrationInfo = do let query1 = EntrezHTTPQuery (Just "esearch") (Just "gene") ("term=" ++ genesymbol ++ urlEncode ("[Gene Name] AND " ++ accession ++ "[Nucleotide Accession]")) --print query1 uniqueidresponse <- entrezHTTP query1 --print uniqueidresponse let uniqueid = head (searchIds (head (readEntrezSearch uniqueidresponse))) let query2 = EntrezHTTPQuery (Just "esummary") (Just "gene") ("id=" ++ show uniqueid) summaryresponse <- entrezHTTP query2 --print summaryresponse let parsedSummary = head (geneSummaries (head (readEntrezGeneSummaries summaryresponse))) let accessionVersion = chrAccVer (geneGenomicInfo parsedSummary) let seqStart = chrStart (geneGenomicInfo parsedSummary) let seqStop = chrStop (geneGenomicInfo parsedSummary) let strand = convertCoordinatesToStrand seqStart seqStop fetchNucleotideString accessionVersion seqStart seqStop strand registrationInfo -- | Fetches sequence strings from the nucleotide database. nucleotideId can be a NCBI accession number or gene id. -- Strand is 1 in case of plus strand (forward) or 2 minus (reverse) strand, the setStrand function can be used for conversion. fetchNucleotideString :: String -> Int -> Int -> Int -> Maybe (String,String) -> IO String fetchNucleotideString nucleotideId seqStart seqStop strand maybeRegistrationInfo = do let registrationInfo = maybeBuildRegistration maybeRegistrationInfo let program' = Just "efetch" let database' = Just "nucleotide" let query' = "id=" ++ nucleotideId ++ "&seq_start=" ++ show seqStart ++ "&seq_stop=" ++ show seqStop ++ "&rettype=fasta" ++ "&strand=" ++ show strand ++ registrationInfo let entrezQuery = EntrezHTTPQuery program' database' query' entrezHTTP entrezQuery convertCoordinatesToStrand :: Int -> Int -> Int convertCoordinatesToStrand start end | start <= end = 1 | otherwise = 2 setStrand :: String -> Int setStrand strandString | strandString == "+" = 1 | strandString == "-" = 2 | strandString == "forward" = 1 | strandString == "reverse" = 2 | strandString == "Forward" = 1 | strandString == "Reverse" = 2 | strandString == "plus" = 1 | strandString == "false" = 2 | strandString == "Plus" = 1 | strandString == "False" = 2 | otherwise = 1 readInt :: String -> Int readInt = read -- | Builds Entrez registration String if present maybeBuildRegistration :: Maybe (String,String) -> String maybeBuildRegistration maybeRegistration | isJust maybeRegistration = buildRegistration toolname developeremail | otherwise = "" where registration = fromJust maybeRegistration toolname = fst registration developeremail = snd registration -- | Builds Entrez registration String that has to be appended to query key buildRegistration :: String -> String -> String buildRegistration toolname developeremail = "&tool=" ++ toolname ++ "&email=" ++ developeremail