{- convert-annotation Gregory W. Schwartz Converts an unknown annotation to Ensembl's annotation, or other annotation. -} {-# LANGUAGE BangPatterns #-} {-# LANGUAGE DataKinds #-} {-# LANGUAGE DeriveGeneric #-} {-# LANGUAGE OverloadedStrings #-} {-# LANGUAGE TypeOperators #-} module Main where -- Standard import Data.Maybe import Data.Char import Data.List import Control.Monad import GHC.Generics import Data.Semigroup -- Cabal import qualified Data.Vector as V import qualified Data.ByteString.Lazy.Char8 as B import qualified Data.Text as T import qualified Data.Csv as CSV import qualified Control.Lens as L import Pipes import qualified Pipes.Prelude as P import qualified Pipes.ByteString as PB import Pipes.Csv import Options.Generic import qualified Foreign.R as R import Foreign.R (SEXP, SEXPTYPE) import Language.R.Instance as R import Language.R.QQ import Language.R.Literal as R -- Local import Types import EnsemblConvert import HUGOConvert import UniProtConvert import RGeneConvert import MSigDBRDataConvert -- | Command line arguments data Options = Info { delimiter :: Maybe String "([,] | CHAR) The delimiter of the CSV file." , database :: String "(Ensembl | HUGO TYPE | UniProt | RGene (SPECIES, TYPE, TYPE) | MSigDBRdata (FILE, RDATA, TYPE)) Which database to convert with. TYPE is the type of the original gene symbol. The compatible list for TYPE with HUGO is in http://www.genenames.org/help/rest-web-service-help. HUGO is only supported for Annotation. RGene (Annotation only) takes in a type of (SPECIES, FROM, TO) for the gene symbol origin and destination, where species is generally \"hsapiens_gene_ensembl\" or \"mmusculus_gene_ensembl\". MSigDBRdata (Info only) takes an rdata file (tested with http://bioinf.wehi.edu.au/software/MSigDB/), the name of the rdata object containing the named list, and the TYPE of symbol (compatible list at http://bioconductor.org/packages/release/bioc/manuals/biomaRt/man/biomaRt.pdf in getGene) which returns pathways separated by \"/\"." , descriptionField :: Maybe String "(Other TEXT | Description | Synonyms) The info to retrieve about the identifier. Description provides information about the identifier while synonyms provides alternate identifiers for the same entity. Returns a list of information (delimited by '/') for each match to Ensembl's cross references. For UniProt, enter a valid column (http://www.uniprot.org/help/programmatic_access)." , column :: T.Text "(COLUMN) The column containing the identifier. Must be a valid id for info." , newColumn :: Maybe T.Text "([Nothing] | COLUMN) The new column to put the results into. If unspecified, replaces the original column." , remove :: Bool "Whether to remove empty results (no matches to the database)." , strict :: Bool "Whether to load everything in memory, no streaming. Useful for the R conversions only." } | Annotation { delimiter :: Maybe String "([,] | CHAR) The delimiter of the CSV file." , database :: String "(Ensembl | HUGO TYPE | UniProt | RGene (SPECIES, TYPE, TYPE) | MSigDBRdata (FILE, RDATA, TYPE)) Which database to convert with. TYPE is the type of the original gene symbol. The compatible list for TYPE with HUGO is in http://www.genenames.org/help/rest-web-service-help. HUGO is only supported for Annotation. RGene (Annotation only) takes in a type of (SPECIES, FROM, TO) for the gene symbol origin and destination, where species is generally \"hsapiens_gene_ensembl\" or \"mmusculus_gene_ensembl\". MSigDBRdata (Info only) takes an rdata file (tested with http://bioinf.wehi.edu.au/software/MSigDB/), the name of the rdata object containing the named list, and the TYPE of symbol (compatible list at http://bioconductor.org/packages/release/bioc/manuals/biomaRt/man/biomaRt.pdf in getGene) which returns pathways separated by \"/\"." , column :: T.Text "(COLUMN) The column containing the identifier. Must be a valid id for info." , newColumn :: Maybe T.Text "([Nothing] | COLUMN) The new column to put the results into. If unspecified, replaces the original column." , remove :: Bool "Whether to remove empty results (no matches to the database)." , strict :: Bool "Whether to load everything in memory, no streaming. Useful for the R conversions only." } deriving (Generic) instance ParseRecord Options -- | Map the header column to the rest of the file for converting that -- column. pipeConvert :: Options -> Maybe (RMart s) -> Maybe (RData s) -> Pipe [T.Text] [T.Text] IO () pipeConvert opts rMart rData = do h <- await let c = col opts h newCol = unHelpful . newColumn $ opts yield . maybe h (\x -> h <> [x]) $ newCol forever $ do x <- await newX <- lift . convertSingle opts rMart rData . (!! c) $ x unless ((unHelpful . remove $ opts) && T.null newX) . maybe (yield . L.set (L.ix c) newX $ x) (const (yield (x <> [newX]))) $ newCol return () return () -- | Get the index of the column. col :: Options -> [T.Text] -> Int col opts = fromMaybe (error "Column not found.") . elemIndex (unHelpful $ column opts) -- | Convert the entire file at once, no streaming. strictConvert :: Options -> Maybe (RMart s) -> Maybe (RData s) -> [[T.Text]] -> IO () strictConvert _ _ _ [] = error "Empty file." strictConvert _ _ _ (_:[]) = error "Empty file." strictConvert opts rMart rData (h:body) = do let c = col opts h newCol = unHelpful . newColumn $ opts newH = maybe h (\x -> h <> [x]) $ newCol xs = fmap (!! c) body newXS <- convertMultiple opts rMart rData $ xs let addToRow newX row = if (unHelpful . remove $ opts) && (T.null newX) then Nothing else Just . maybe (L.set (L.ix c) newX row) (\x -> row <> [newX]) $ newCol newBody = catMaybes . zipWith addToRow newXS $ body B.putStrLn . CSV.encode . (:) newH $ newBody -- | The conversion process for streaming. convertSingle :: Options -> Maybe (RMart s) -> Maybe (RData s) -> T.Text -> IO T.Text convertSingle opts@(Info { descriptionField = df }) rMart rData = fmap (fromMaybe "" . fmap unDesc) . whichDesc (read . unHelpful . database $ opts) . UnknownAnn where whichDesc Ensembl = toEnsemblDesc ( read . fromMaybe (error "Needs description field.") . unHelpful $ df ) whichDesc (HUGO _) = error "HUGO description not yet supported." whichDesc UniProt = toUniProtDesc ( read . fromMaybe (error "Needs description field.") . unHelpful $ df ) whichDesc (RGene _) = error "RGene description not yet supported." whichDesc (MSigDBRData queryType) = toMSigDBPathways (fromJust rData) (fromJust rMart) (MSigDBType queryType) convertSingle opts@(Annotation {}) rMart rData = fmap (fromMaybe "" . fmap unAnn) . whichAnn (read . unHelpful . database $ opts) . UnknownAnn where whichAnn Ensembl = toEnsemblAnn whichAnn (HUGO queryType) = toHUGOAnn . HUGOType $ queryType whichAnn UniProt = toUniProtAnn whichAnn (RGene queryType) = toRGeneAnn (fromJust rMart) (RType queryType) whichAnn (MSigDBRData _) = error "MSigDBRData annotation not yet supported." -- | The conversion process for all in memory. convertMultiple :: Options -> Maybe (RMart s) -> Maybe (RData s) -> [T.Text] -> IO [T.Text] convertMultiple opts@(Info { descriptionField = df }) rMart rData = fmap (fmap (fromMaybe "" . fmap unDesc)) . whichDesc (read . unHelpful . database $ opts) . fmap UnknownAnn where whichDesc Ensembl = mapM ( toEnsemblDesc ( read . fromMaybe (error "Needs description field.") . unHelpful $ df ) ) whichDesc (HUGO _) = error "HUGO description not yet supported." whichDesc UniProt = mapM (toUniProtDesc ( read . fromMaybe (error "Needs description field.") . unHelpful $ df ) ) whichDesc (RGene _) = error "RGene description not yet supported." whichDesc (MSigDBRData queryType) = toMSigDBPathwaysMultiple (fromJust rData) (fromJust rMart) (MSigDBType queryType) convertMultiple opts@(Annotation {}) rMart rData = fmap (fmap (fromMaybe "" . fmap unAnn)) . whichAnn (read . unHelpful . database $ opts) . fmap UnknownAnn where whichAnn Ensembl = mapM toEnsemblAnn whichAnn (HUGO queryType) = mapM (toHUGOAnn . HUGOType $ queryType) whichAnn UniProt = mapM toUniProtAnn whichAnn (RGene queryType) = toRGeneAnnMultiple (fromJust rMart) (RType queryType) whichAnn (MSigDBRData _) = error "MSigDBRData annotation not yet supported." main :: IO () main = do opts <- getRecord "convert-annotation, Gregory W. Schwartz.\ \ Converts an unknown annotation to some other\ \ annotation." let delim = case unHelpful . delimiter $ opts of Nothing -> ',' (Just "\\t") -> '\t' (Just [x]) -> x (Just []) -> error "No delimiter set" _ -> error "Delimiter is one character" csvOpts = CSV.defaultDecodeOptions { CSV.decDelimiter = fromIntegral (ord delim) } R.withEmbeddedR R.defaultConfig $ R.runRegion $ do rMart <- case read . unHelpful . database $ opts of (RGene (species, _, _)) -> fmap Just $ getRMart species (MSigDBRData _) -> fmap Just $ getRMart "hsapiens_gene_ensembl" _ -> return Nothing rData <- case read . unHelpful . database $ opts of (MSigDBRData (!file, !object, _)) -> fmap Just . getRData (File file) $ object _ -> return Nothing if unHelpful . strict $ opts then do contents <- liftIO B.getContents liftIO . strictConvert opts rMart rData . V.toList . either error id $ ( CSV.decode NoHeader contents :: Either String (V.Vector [T.Text]) ) else liftIO $ runEffect $ decodeWith csvOpts NoHeader PB.stdin >-> P.concat >-> (pipeConvert opts rMart rData) >-> encode >-> PB.stdout return ()