{- heatitup-complete Gregory W. Schwartz Find duplications a sequence with assembly. Requires all to be in path: heatitup, Trinity, samtools (if input is a bam), rsem-prepare-reference, rsem-calculate-expression, and bowtie2. Cleans up at the end by deleting all files with INPUT.* pattern, the new period being the main difference. -} {-# LANGUAGE ViewPatterns #-} {-# LANGUAGE OverloadedStrings #-} module Main where -- Standard import Data.Bool import Data.Maybe import qualified Data.List.NonEmpty as NE import qualified Data.Map.Strict as Map import qualified Data.Set as Set -- Cabal import Data.Fasta.Text import Pipes import qualified Pipes.Prelude as P import qualified Pipes.Text as PT import qualified Pipes.Text.IO as PT import qualified Control.Foldl as Fold import qualified Data.ByteString.Lazy.Char8 as B import qualified Data.Csv as CSV import qualified Data.Text as T import qualified Data.Vector as V import qualified Options.Applicative as O import Safe (headMay) import Turtle import Turtle.Line -- Local import Types import Parse import Merge import Blast import Utility -- | Command line arguments data Options = Options { input :: String , spacerFlag :: Bool , refInput :: Maybe String , blacklistInput :: Maybe String , output :: Maybe String , outputPlot :: Maybe String , outputLabel :: String , refField :: Int , posField :: Maybe Int , ignoreField :: Maybe Int , ignoreChar :: Maybe String , inputMinSize :: Int , gaussWindow :: Int , gaussTime :: Double , gaussThreshold :: Double , inputMinMut :: Maybe Int , inputDistance :: Int , refCheckBlacklistFlag :: Bool , refRecBlacklistFlag :: Bool , minRichness :: Int , inputColorDupL :: String , inputColorDupR :: String , inputColorMut :: String , inputColorSpacer :: String , inputColorBackground :: String , inputColorForeground :: String , preprocessType :: Preprocess , blastCommand :: Maybe String , blastArgs :: Maybe String , trinityCommand :: String , trinityArgs :: TrinityArgs , trinityAbundance :: String , dirtyFlag :: Bool , cigarFlag :: Bool , headers :: Maybe String } -- | Command line options options :: O.Parser Options options = Options <$> O.strOption ( O.long "input" <> O.short 'i' <> O.metavar "FILE" <> O.help "The input file" ) <*> O.switch ( O.long "spacer" <> O.short 's' <> O.help "Whether to characterize the spacer. Requires\ \ reference-input and matching labels for the input and\ \ reference-input to compare." ) <*> O.optional ( O.strOption ( O.long "reference-input" <> O.short 'I' <> O.metavar "[Nothing] | FILE" <> O.help "The input file containing the reference sequences to compare\ \ to. The first entry in the field must be the accession and\ \ match the requested field from reference-field for the\ \ input. With no supplied file, no spacer will be annotated." ) ) <*> O.optional ( O.strOption ( O.long "blacklist-input" <> O.short 'b' <> O.metavar "[Nothing] | FILE" <> O.help "The input fasta file containing possible false positives --\ \ sequences which may be duplicate nucleotides in the\ \ reference sequence." ) ) <*> O.optional ( O.strOption ( O.long "output" <> O.short 'o' <> O.metavar "FILE" <> O.help "The output file" ) ) <*> O.optional ( O.strOption ( O.long "output-plot" <> O.short 'O' <> O.metavar "FILE" <> O.help "The output file for the plot. Each new plot gets a new number\ \ on it: output_1.svg, output_2.svg, etc. Each plot uses the\ \ first entry in the fasta header as the label." ) ) <*> O.strOption ( O.long "label" <> O.short 'l' <> O.metavar "STRING" <> O.value "" <> O.help "The label to use in the label column for the output" ) <*> O.option O.auto ( O.long "reference-field" <> O.short 'f' <> O.metavar "[1] | INT" <> O.value 1 <> O.help "The field in each input header that contains the reference\ \ accession number to compare to. Results in an out of bounds\ \ if this field does not exist." ) <*> O.optional ( O.option O.auto ( O.long "position-field" <> O.short 'p' <> O.metavar "[Nothing] | INT" <> O.help "The field in each input header that contains the starting\ \ position of the read. Added to the annotations. Results\ \ in out of bounds if this field does not exist." ) ) <*> O.optional ( O.option O.auto ( O.long "ignore-field" <> O.short 'g' <> O.metavar "[Nothing] | INT" <> O.help "The field in each input header that contains a 0 or a 1:\ \ 0 means to ignore this read (assign as Normal) and 1\ \ means to find a duplication in this read.\ \ Used for reads where there is known to be no duplication\ \ and thus helps remove false positives." ) ) <*> O.optional ( O.strOption ( O.long "ignore-char" <> O.metavar "[Nothing or NonAssembly CHAR] | CHAR" <> O.help "Character to ignore, useful for gaps.\ \ Defaults to nothing or the character used in NonAssembly." ) ) <*> O.option O.auto ( O.long "min-size" <> O.short 's' <> O.metavar "[15] | INT" <> O.value 15 <> O.help "The minimum size of a duplication" ) <*> O.option O.auto ( O.long "gaussian-window" <> O.short 'w' <> O.metavar "[4] | Double" <> O.value 3 <> O.help "The window for the discrete gaussian kernel atypical spacer\ \ determination" ) <*> O.option O.auto ( O.long "gaussian-time" <> O.short 't' <> O.metavar "[2] | Double" <> O.value 2 <> O.help "The time for the discrete gaussian kernel atypical spacer\ \ determination" ) <*> O.option O.auto ( O.long "gaussian-threshold" <> O.short 'T' <> O.metavar "[0.4] | Double" <> O.value 0.4 <> O.help "The cutoff to be considered a mutation for the discrete\ \ gaussian kernel atypical spacer determination" ) <*> O.optional ( O.option O.auto ( O.long "min-mutations" <> O.short 'm' <> O.metavar "INT" <> O.help "The minimum number of nucleotides between mutations" ) ) <*> O.option O.auto ( O.long "levenshtein-distance" <> O.short 'L' <> O.metavar "[2] | INT" <> O.value 2 <> O.help "The minimum Levenshtein distance to the false positive\ \ checker. If the distance to the false positive string\ \ is less than or equal to this number,\ \ the duplication is considered\ \ a false positive. Compares candidates against each sequence\ \ in --blacklist-input" ) <*> O.switch ( O.long "reference-check-blacklist" <> O.short 'c' <> O.help "Whether to use the reference as a blacklist in addition to\ \ the supplied blacklist. That is, we check if the duplication\ \ can be found twice or more in the reference input." ) <*> O.switch ( O.long "reference-recursive-blacklist" <> O.short 'r' <> O.help "Whether to use the reference as a recursive\ \ blacklist in addition to\ \ the supplied blacklist. That is, the reference sequences\ \ are inputed with the same parameters (except distance, which\ \ here is 0)\ \ to the duplication finder, and those duplications found are\ \ added to the blacklist. This process is recursive, executed\ \ until no more duplications are found in the reference.\ \ Beware, too many blacklist entries can slow down the finder\ \ significantly, as each blacklist entry is compared with each\ \ candidate." ) <*> O.option O.auto ( O.long "min-richness" <> O.short 'R' <> O.metavar "[1] | INT" <> O.value 1 <> O.help "The minimum nucleotide richness (number of different types of\ \ nucleotides) allowed in the duplication to be considered\ \ real. Useful if the user knows that a sequence like\ \ \"TTTTTTTTCTTTTTTTTC\" is not likely to be real." ) <*> O.strOption ( O.long "color-left-duplication" <> O.metavar "[#a6cae3] | COLOR" <> O.help "The color of the left side of the repeated sequence." <> O.value "#a6cae3" ) <*> O.strOption ( O.long "color-right-duplication" <> O.metavar "[#b0de8a] | COLOR" <> O.help "The color of the right side of the repeated sequence." <> O.value "#b0de8a" ) <*> O.strOption ( O.long "color-difference" <> O.metavar "[#1978b3] | COLOR" <> O.help "The color of discrepancies between the left and right side of\ \ the duplication." <> O.value "#1978b3" ) <*> O.strOption ( O.long "color-spacer" <> O.metavar "[#fdbd6e] | COLOR" <> O.help "The color of the exogenous nucleotides within the spacer." <> O.value "#fdbd6e" ) <*> O.strOption ( O.long "color-background" <> O.metavar "[#ffffff] | COLOR" <> O.help "The color of the background." <> O.value "#ffffff" ) <*> O.strOption ( O.long "color-foreground" <> O.metavar "[#000000] | COLOR" <> O.help "The color of the foreground." <> O.value "#000000" ) <*> O.option O.auto ( O.long "type" <> O.short 'P' <> O.metavar "Assembly | NonAssembly CHAR" <> O.help "The type of preprocessing before duplication finding.\ \ Assembly would be used on exome sequencing, RNA-seq, etc.\ \ while NonAssembly would be for certain paired end\ \ sequencing like amplicon based. Basically, are the reads\ \ fragmented across a location (Assembly) or are they\ \ all piled up (NonAssembly)? Paired end is required for\ \ NonAssembly, otherwise just use the duplication finding\ \ program directly on the reads for best results.\ \ NonAssembly additionally requires a character to use as\ \ filler between non-overlapping mate pairs, as the\ \ program will remove duplications containing that\ \ character. Input would look like \"NonAssembly 'X'\"." ) <*> O.optional ( O.strOption ( O.long "blast-command" <> O.short 'B' <> O.metavar "[Nothing] | PATH" <> O.help "The command used for blastn. Useful if not in path.\ \ Used for filtering out reads from irrelevant locations.\ \ If using small sequences, be sure to set blast-args\ \ to \"-task blastn-short\"." ) ) <*> O.optional ( O.strOption ( O.long "blast-args" <> O.short 'z' <> O.metavar "[Nothing] | STRING" <> O.help "The additional arguments used for blastn. Separated by space.\ \ Use \"-task blastn-short\" for small sequences." ) ) <*> O.strOption ( O.long "trinity-command" <> O.short 'C' <> O.metavar "[Trinity] | PATH" <> O.value "Trinity" <> O.help "The command used for Trinity. Useful if not in path." ) <*> O.option O.auto ( O.long "trinity-args" <> O.short 'A' <> O.metavar "[TrinityBase] | TrinityGenome | TrinityCustom STRING" <> O.value TrinityBase <> O.help "The arguments used for Trinity.\ \ TrinityBase is --seqType fq --run_as_paired --max_memory 10G --no_version_check --single,\ \ TrinityGenome is --genome_guided_max_intron 10000 --max_memory 10G --no_version_check --genome_guided_bam,\ \ and TrinityCustom STRING is STRING.\ \ Make sure the input argument\ \ is last and points to nothing (like in the default)." ) <*> O.strOption ( O.long "trinity-abundance" <> O.short 'a' <> O.metavar "[align_and_estimate_abundance.pl] | PATH" <> O.value "align_and_estimate_abundance.pl" <> O.help "The command used for align_and_estimate_abundance.pl\ \ in Trinity's util folder. Useful if not in path. Make sure\ \ kallisto is in the path." ) <*> O.switch ( O.long "dirty" <> O.short 'd' <> O.help "Leave behind the INPUT.* files at the end (but the trinity\ \ output is still deleted)." ) <*> O.switch ( O.long "no-cigar-filter" <> O.short 'G' <> O.help "Skip the CIGAR based filtering, that is, look at all reads\ \ and not just reads without all M's in the CIGAR." ) <*> O.optional ( O.strOption ( O.long "header" <> O.short 'H' <> O.metavar "[Nothing] | STRING" <> O.help "The headers used when converting bam to fasta.\ \ The order in the resulting header is\ \ \">FILE|ACCESSION|POSITION|IGNORE|HEADER\".\ \ So take that order\ \ into account for field options with positions and the\ \ rest. Also, make sure this string has fields separated by\ \ a pipe \"|\" character. So if you have HEADER as the\ \ \"ENSE00000SOMETHING\" reference accession that\ \ agrees with --input-reference, that would be\ \ field 5." ) ) -- | Get the Trinity arguments. defTrinityArgs :: TrinityArgs -> T.Text defTrinityArgs TrinityBase = T.pack "--seqType fq\ \ --run_as_paired\ \ --max_memory 10G\ \ --no_version_check\ \ --single" defTrinityArgs TrinityGenome = T.pack "--genome_guided_max_intron 10000\ \ --max_memory 10G\ \ --no_version_check\ \ --genome_guided_bam" defTrinityArgs (TrinityCustom x) = T.pack x -- | Run the trinity command. runTrinity :: Options -> Turtle.FilePath -> Turtle.FilePath -> IO () runTrinity opts tempFile tempDir = do let modifiedArgs = (T.words . defTrinityArgs . trinityArgs $ opts) <> [ format fp tempFile , "--output" , format fp tempDir ] stderr . inproc (T.pack $ trinityCommand opts) modifiedArgs $ mempty return () -- | Run samtools if the input is a bam file. runSamtools :: Options -> Turtle.FilePath -> IO Turtle.FilePath runSamtools opts tempFasta = do let inputFile = T.pack . Main.input $ opts outputType = case preprocessType opts of Assembly -> "fastq" (NonAssembly _) -> "fasta" fastFile <- if hasExtension (fromText inputFile) "bam" then strict $ do Turtle.output tempFasta . inproc "samtools" [outputType, inputFile] $ mempty return . NE.head . textToLines . format fp $ tempFasta else return inputFile return . fromText . T.strip $ fastFile -- | Run the abundance command. runAlignContig :: Options -> Turtle.FilePath -> Turtle.FilePath -> Shell Line runAlignContig opts tempDir trinityFastaFile = do tempFasta <- mktempfile tempDir "trinity.fasta" inputFasta <- liftIO . runSamtools opts $ tempFasta fastaMap <- fmap (fastaToMap ' ') -- Trinity outputs ' ' as a separator. . liftIO . PT.runSafeT . runEffect . P.toListM $ pipesFasta (PT.readFile . T.unpack . format fp $ trinityFastaFile) bamOutput <- strict . inproc "samtools" ["view", "-F", "4", "-"] -- No headers and unmapped reads . inproc (T.pack $ trinityAbundance opts) [ "--transcripts", format fp trinityFastaFile , "--seqType", "fa" , "--single", format fp inputFasta , "--est_method", "kallisto" , "--kallisto_add_opts", "--pseudobam" , "--trinity_mode" , "--prep_reference" , "--output_dir", format fp tempDir ] $ mempty let bamRows' = fmap (BamRow . T.splitOn "\t") . T.lines $ bamOutput bamRows <- case blastCommand opts of (Just cmd) -> filterBlastBamRows 2 (Command . T.pack $ cmd) (fmap Args . blastArgs $ opts) ( Subject . fromText . T.pack . fromMaybe (error "Need --reference-input for blast.") . refInput $ opts ) (QueryFile trinityFastaFile) bamRows' Nothing -> return bamRows' let matchMap = getMatchMap bamRows select . concatMap (NE.toList . textToLines) . concat . nub' . fmap ( bamToFasta (Just fastaMap) (bool (Just matchMap) Nothing . cigarFlag $ opts) (T.pack $ Main.input opts) (fmap (Headers . T.pack) . headers $ opts) ) $ bamRows -- | Find duplications in the Trinity output. runFindDuplication :: Options -> Shell Line -> Shell Line runFindDuplication opts streamIn = inproc "heatitup" commandList streamIn where ignoreCharArg (NonAssembly c) = Just . ("--ignore-char" :) . (\x -> [[x]]) . fromMaybe c . (=<<) headMay . ignoreChar $ opts ignoreCharArg _ = fmap (("--ignore-char" :) . (\x -> [[x]])) . (=<<) headMay . ignoreChar $ opts commandList = fmap T.pack . concat . catMaybes $ [ fmap (("--reference-input" :) . (:[])) . refInput $ opts , bool Nothing (Just ["--spacer"]) . spacerFlag $ opts , fmap (("--blacklist-input" :) . (:[])) . blacklistInput $ opts , fmap (("--output" :) . (:[])) . Main.output $ opts , fmap (("--output-plot" :) . (:[])) . outputPlot $ opts , Just ["--label", outputLabel opts] , Just ["--reference-field", show . refField $ opts] , fmap (("--position-field" :) . (:[]) . show) . posField $ opts , fmap (("--ignore-field" :) . (:[]) . show) . ignoreField $ opts , ignoreCharArg $ preprocessType opts , Just ["--min-size", show . inputMinSize $ opts] , Just ["--gaussian-window", show . gaussWindow $ opts] , Just ["--gaussian-time", show . gaussTime $ opts] , Just ["--gaussian-threshold", show . gaussThreshold $ opts] , fmap (("--min-mutations" :) . (:[]) . show) . inputMinMut $ opts , Just ["--levenshtein-distance", show . inputDistance $ opts] , bool Nothing (Just ["--reference-check-blacklist"]) . refCheckBlacklistFlag $ opts , bool Nothing (Just ["--reference-recursive-blacklist"]) . refRecBlacklistFlag $ opts , Just ["--min-richness", show . minRichness $ opts] , Just ["--color-left-duplication", inputColorDupL $ opts] , Just ["--color-right-duplication", inputColorDupR $ opts] , Just ["--color-difference", inputColorMut $ opts] , Just ["--color-spacer", inputColorSpacer $ opts] , Just ["--color-background", inputColorBackground $ opts] , Just ["--color-foreground", inputColorForeground $ opts] ] -- | Cleanup the intermediate files. cleanup :: Options -> Shell () cleanup opts = sh $ do removableHere <- fold ( grep (has . text $ T.pack (Main.input opts) <> ".") . join . fmap (select . textToLines . format fp) . ls $ "." ) Fold.list removableThere <- fold ( grep (has . text $ T.pack (Main.input opts) <> ".") . join . fmap (select . textToLines . format fp) . ls . directory . fromText . T.pack . Main.input $ opts ) Fold.list mapM_ (rm . fromText . lineToText) $ removableHere <> removableThere return () assembly :: Options -> IO () assembly opts = sh $ do tempDir <- mktempdir "." "trinity" tempFile' <- mktempfile "." "tmp.fq" tempFile <- case trinityArgs opts of TrinityBase -> liftIO . runSamtools opts $ tempFile' TrinityGenome -> return . fromText . T.pack . Main.input $ opts (TrinityCustom _) -> return . fromText . T.pack . Main.input $ opts liftIO . runTrinity opts tempFile $ tempDir trinityFastaFile <- fold (find (has "/Trinity" <> suffix ".fasta") tempDir) Fold.head _ <- if isNothing trinityFastaFile then error "Fasta not generated." else return () duplicationOutput <- strict . runFindDuplication opts . runAlignContig opts tempDir . fromJust $ trinityFastaFile unless (dirtyFlag opts) . cleanup $ opts abundances <- strict . maybe mempty (Turtle.input . const (tempDir "abundance.tsv")) $ trinityFastaFile unless (dirtyFlag opts) . cleanup $ opts let abundanceMap' = getAbundanceMap . B.pack . T.unpack $ abundances (dupHeader, dupRows) = parseDuplications . B.pack . T.unpack $ duplicationOutput accSet = getAccSet dupRows abundanceMap = --We only want the frequency of valid contigs AbundanceMap . Map.filterWithKey (\k _ -> Set.member k . unAccSet $ accSet) . unAbundanceMap $ abundanceMap' frequencyMap = getFrequencyMap abundanceMap newRows = fmap (mergeAbundance frequencyMap) dupRows finalOutput = CSV.encodeByName (V.snoc dupHeader "frequency") . fmap unDuplicationRow $ newRows liftIO . B.putStrLn $ finalOutput -- | Remove rows where the duplication contains the filler. removeFillDups :: Fill -> WithHeader Line -> Maybe Line removeFillDups _ (Header x) = Just x removeFillDups (Fill fill) (Row h row) = if T.isInfixOf (T.singleton fill) . fromMaybe (error "dSubstring column not found.") . lookup "dSubstring" . zip (getCols h) . getCols $ row then Nothing else Just row where getCols = T.splitOn "," . lineToText nonAssembly :: Options -> Fill -> IO () nonAssembly opts fill = sh $ do bamOutput <- strict . inproc "samtools" ["view", "-"] -- No headers . inproc "samtools" [ "sort" , "-n" , "-O", "sam" , T.pack $ Main.input opts ] $ mempty let bamRows' = parseBAM bamOutput query = QueryShell . select . concatMap (NE.toList . textToLines) . concatMap blastBamToFasta $ bamRows' -- Get rid of sequences that don't blast to the reference. bamRows <- case blastCommand opts of (Just cmd) -> filterBlastBamRows 0 (Command . T.pack $ cmd) (fmap Args . blastArgs $ opts) ( Subject . fromText . T.pack . fromMaybe (error "Need --reference-input for blast.") . refInput $ opts ) query bamRows' Nothing -> return bamRows' let mergedMates = mergeMates fill bamRows fastaOutput = select . concatMap (NE.toList . textToLines) . concatMap ( bamToFasta Nothing Nothing (T.pack $ Main.input opts) (fmap (Headers . T.pack) . headers $ opts) ) $ mergedMates stdout . fmap fromJust . mfilter isJust . fmap (removeFillDups fill) . Turtle.header . runFindDuplication opts $ fastaOutput mainFunc :: Options -> IO () mainFunc opts@(preprocessType -> Assembly) = assembly opts mainFunc opts@(preprocessType -> NonAssembly fill) = nonAssembly opts (Fill fill) main :: IO () main = O.execParser opts >>= mainFunc where opts = O.info (O.helper <*> Main.options) ( O.fullDesc <> O.progDesc "Finds duplications in a sequence with assembly" <> O.header "heatitup-complete, Gregory W. Schwartz" )