{- Main by Gregory Schwartz -- | Computes a BLOSUM, but takes into account alternate amino acids -} module Main where -- Standard import Data.Maybe import qualified Data.Map.Strict as Map import qualified Data.Sequence as Seq import qualified Data.List.Split as Split import qualified Data.Monoid as Mon import Data.Semigroup ((<>)) import qualified System.IO as IO -- Cabal import qualified Data.Text as T import qualified Data.Text.IO as T import Data.Fasta.Text import Pipes import qualified Pipes.Prelude as P import qualified Pipes.Text.IO as PT import Options.Applicative -- Local import Types import Utility import Cluster import Matrix import Print -- Command line arguments data Options = Options { input :: Maybe String , identity :: Double , blockField :: Maybe Int , csv :: Maybe String , badChars :: Maybe String , allFlag :: Bool , output :: Maybe String } -- Command line options options :: Parser Options options = Options <$> optional ( strOption ( long "input" <> short 'i' <> metavar "FILE" <> help "The input fasta file with blocks labeled in a field.\ \ If empty, then assumed to use stdin to send the many, many\ \ filenames, where each file is a block." ) ) <*> option auto ( long "identity" <> short 'd' <> metavar "[100] | PERCENT" <> value 100 <> help "The percent identity of two sequences for clustering.\ \ BLOSUM's most famous identity is 62" ) <*> optional ( option auto ( long "field" <> short 'f' <> metavar "FIELD" <> help "The field in the fasta headers (1 indexed) which says\ \ the block it belongs to. Assumed to already be next to\ \ each other, so the block members should not be scattered" ) ) <*> optional ( strOption ( long "csv" <> short 'c' <> metavar "STRING" <> help "Print the output as a 2D csv using this ordering" ) ) <*> optional ( strOption ( long "remove-characters" <> short 'b' <> metavar "STRING" <> help "Whether to remove this character from individual sequences\ \ that have at least one of these characters" ) ) <*> switch ( long "all-remove-characters" <> short 'B' <> help "Whether to remove all positions that have at least one of the\ \ characters from remove-character" ) <*> optional ( strOption ( long "output" <> short 'o' <> metavar "FILE" <> help "The output file containing the matrix" ) ) -- | Get a frequency map from a single file getFrequencyMapSingleFile :: Options -> FilePath -> IO FrequencyMap getFrequencyMapSingleFile opts file = do h <- IO.openFile file IO.ReadMode fastaList <- runEffect $ P.toListM $ pipesFasta . PT.fromHandle $ h let clusters = getClusterIdentity (Identity $ identity opts) clusterMaps = Map.elems . Map.map getClusterFrequencyMap . unClusterMap blockMaps = fmap ( getBlockMap (allFlag opts) ((fmap . fmap) AA $ badChars opts) ) . fmap clusterMaps . fmap clusters . fmap Seq.fromList . groupBlocks (fmap Field $ blockField opts) $ fastaList frequencyMap = joinBlockMaps blockMaps return frequencyMap -- | Get a block from a filepath getBlock :: Options -> FilePath -> IO BlockMap getBlock opts file = do h <- IO.openFile file IO.ReadMode fastaList <- runEffect $ P.toListM $ pipesFasta . PT.fromHandle $ h let clusters = getClusterIdentity (Identity $ identity opts) clusterMap = Map.elems . Map.map getClusterFrequencyMap . unClusterMap blockMap = getBlockMap (allFlag opts) ((fmap . fmap) AA $ badChars opts) . clusterMap . clusters . Seq.fromList $ fastaList return blockMap blosum :: Options -> IO () blosum opts = do frequencyMap <- case input opts of Nothing -> fmap FrequencyMap . P.fold (Mon.<>) (AAMap Map.empty) id $ P.stdinLn >-> P.mapM (fmap unBlockMap . getBlock opts) (Just x) -> getFrequencyMapSingleFile opts x let blosumMatrix = getBlosum frequencyMap result = case csv opts of (Just x) -> printBlosumCSV (map AA x) blosumMatrix Nothing -> printBlosum blosumMatrix -- Save results case output opts of Nothing -> T.putStrLn result (Just x) -> T.writeFile x result main :: IO () main = execParser opts >>= blosum where opts = info (helper <*> options) ( fullDesc <> progDesc "Computes a BLOSUM, but takes into account\ \ alternate amino acids" )