-- modify-fasta -- By Gregory W. Schwartz -- Takes a fasta file filters the fasta file in several ways. {-# LANGUAGE BangPatterns #-} -- Built-in import Data.Maybe import qualified Data.Map as M import qualified System.IO as IO import Control.Monad -- Cabal import qualified Data.Text as T import qualified Data.Text.IO as T import Options.Applicative import Data.Fasta.Text import Pipes import qualified Pipes.Prelude as P import qualified Pipes.Text as PT import qualified Pipes.Text.IO as PT import qualified Data.List.Split as Split -- Local import Types import Utility import FilterCloneMap import FilterFastaList import FilterCloneList import TransformFastaList import TransformCloneList import Print -- Command line arguments data Options = Options { input :: String , aminoAcidsFlag :: GeneticUnit , legacyFlag :: Bool , clipFastaFlag :: Bool , convertToAminoAcidsFlag :: Bool , inputCodonTable :: CodonTable , inputFillIn :: FillInValue , inputStart :: Maybe Int , inputStop :: Maybe Int , inputMinSequenceMutation :: Maybe Int , inputMutationCount :: Maybe Int , inputMutationPercent :: Maybe Double , addLengthFlag :: Bool , removeTheNsFlag :: Bool , removeGermlinesPreFlag :: Bool , removeHighlyMutatedFlag :: Bool , removeStopsFlag :: Bool , removeDuplicatesFlag :: Bool , removeOutOfFrameFlag :: Bool , removeUnknownNuc :: Bool , inputInFrame :: Maybe Field , inputOutFrame :: Maybe Field , trimFrame :: Bool , trackMutations :: Maybe Field , translateTrackMutations :: Bool , inputStopRange :: Int , inputCodonMut :: CodonMut , inputCodonMutType :: String , inputMutType :: String , inputChangeField :: String , inputCustomFilter :: String , customGermlineFlag :: Bool , customRemoveFlag :: Bool , inputGeneAlleleField :: Int , countFlag :: Bool , output :: String } -- Command line options options :: Parser Options options = Options <$> strOption ( long "input" <> short 'i' <> metavar "FILE" <> value "" <> help "The input fasta file or CLIP fasta file" ) <*> option auto ( long "unit" <> short 'u' <> metavar "AminoAcid | Nucleotide" <> help "Whether these sequences are composed of\ \ amino acids (AminoAcid) or nucleotides (Nucleotide)" ) <*> switch ( long "legacy" <> short 'L' <> help "Whether to use the legacy version with no pipes. Note: The\ \ legacy version supports more features but is greedy\ \ in terms of speed and memory. Use only if really needed.\ \ Features that are legacy only are noted in this\ \ documentation" ) <*> switch ( long "clip-fasta" <> short 'A' <> help "Whether the input is a clip fasta file (has germline >>\ \ sequences)." ) <*> switch ( long "convert-to-amino-acids" <> short 'C' <> help "Whether to convert the filtered sequences to amino acids\ \ in the output. Applied last, even after add length." ) <*> option auto ( long "codon-table" <> metavar "[(CODON, AA)]" <> value [] <> help "The codon table used for converting to amino acids. This\ \ is a list of codon AA pairs, so for example:\ \ [(\"AAT\", '@'), (\"ATC\", '#')] for AATATCGGG would result\ \ in @#G as defaults are used if the codon is not in\ \ the custom table." ) <*> option auto ( long "fill-in" <> short 'F' <> metavar "(FIELD, START, 'CHARACTER')" <> value (-1, -1, 'X') <> help "Use the FIELD index (1 indexed, split by '|') in the\ \ header to fill in the unknown character CHARACTER with the\ \ respective character in that field. Use the START value\ \ to let the program know where the string in the field starts\ \ from in the sequence. For instance, a header of >H2O|HELLO\ \ for the sequence HIMANHELXODUDE and a value of fill-in of\ \ (2, 6, 'X') would change the sequence to HIMANHELLODUDE. If\ \ the string in the field has the unknown character as well,\ \ i.e. >H2O|HELXO, then the sequence is considered bad and is\ \ removed" ) <*> optional ( option auto ( long "start" <> short 't' <> metavar "[ ] | INT" <> help "Remove everything before this position (1 indexed).\ \ Done first just after filtering." ) ) <*> optional ( option auto ( long "stop" <> short 'p' <> metavar "[ ] | INT" <> help "Remove everything after this position (1 indexed).\ \ Done first just after filtering." ) ) <*> optional ( option auto ( long "frequent-mutation-min" <> metavar "[ ] | INT" <> help "Minimum number of sequences required for a clone to be valid\ \ in the calculation of frequent mutations, replaces\ \ with gaps otherwise" ) ) <*> optional ( option auto ( long "frequent-mutation-count" <> metavar "[ ] | INT" <> help "Only include codons containing a mutation present in this\ \ many sequences in the clone or more. 0 is all sequences.\ \ Converts the unincluded codons to gaps." ) ) <*> optional ( option auto ( long "frequent-mutation-percent" <> metavar "[ ] | PERCENT" <> help "Only include codons containing a mutation present in this\ \ percentage of sequences in the clone or more.\ \ 0 is all sequences. Converts the unincluded codons\ \ to gaps." ) ) <*> switch ( long "add-length" <> short 'l' <> help "Whether to append the length of the sequence to the end of\ \ the header, calculated after --convert-to-amino-acids\ \ if enabled" ) <*> switch ( long "remove-N" <> short 'N' <> help "Whether to replace N or n in the sequence with a gap, '-'" ) <*> switch ( long "remove-germlines" <> short 'g' <> help "Whether to remove germlines." ) <*> switch ( long "remove-highly-mutated" <> short 'h' <> help "Whether to remove highly mutated clone sequences (a third\ \ of their sequence are different amino acids). Optionally\ \ uses codon-table." ) <*> switch ( long "remove-stops" <> short 's' <> help "Whether to remove sequences with stop codons (\"*\").\ \ Optionally uses codon-table." ) <*> switch ( long "legacy-remove-duplicates" <> short 'd' <> help "Whether to remove duplicate sequences. LEGACY ONLY" ) <*> switch ( long "remove-out-of-frame" <> short 'O' <> help "Whether to remove sequences that are out of frame--if the\ \ sequences or number of gaps is not divisible by 3" ) <*> switch ( long "remove-unknown-nucleotides" <> short 'Y' <> help "Convert unknown nucleotides (not ACGTN-.) to gaps (-)" ) <*> optional ( option auto ( long "input-inframe" <> metavar "[ ] | FIELD" <> help "Represents the 1 indexed field split by '|'\ \ containing the inframe\ \ value (frames are 0, 1, or 2 like\ \ USCS definitions). For use with trim-frame." ) ) <*> optional ( option auto ( long "input-outframe" <> metavar "[ ] | FIELD" <> help "Represents the 1 indexed field split by '|'\ \ containing the outframe\ \ value (frames are 0, 1, or 2 like\ \ USCS definitions). For use with trim-frame." ) ) <*> switch ( long "trim-frame" <> short 'y' <> help "Trim each sequence to be in frame by remove extra nucleotides\ \ at the end. If input-inframe or input-outframe is\ \ specified, follow those rules instead." ) <*> optional ( option auto ( long "track-mutations" <> metavar "[ ] | FIELD" <> help "Represents the 1 indexed field split by '|'\ \ containing the sequence to compare to. Takes the differences\ \ from the two sequences and puts them in the header." ) ) <*> switch ( long "translate-track-mutations" <> help "Whether to translate the field in track-mutations before\ \ finding differences." ) <*> option auto ( long "input-stop-range" <> short 'r' <> metavar "[106]|INT" <> value 106 <> help "Only search for stops with remove-stops up to this\ \ amino acid position" ) <*> option auto ( long "input-codon-mut" <> short 'c' <> metavar "[-1]|0|1|2|3" <> value (-1) <> help "Only include codons with this many mutations or less or more,\ \ depending on input-codon-mut-type (-1 is the same as include\ \ all codons). Converts the unincluded codon to gaps." ) <*> strOption ( long "input-codon-mut-type" <> short 'T' <> metavar "[=]|>|<" <> value "=" <> help "Only include codons with this many mutations (=)\ \ (or lesser (<) or greater (>), depending on\ \ input-codon-mut). Converts the unincluded codon to gaps.\ \ For use with input-codon-mut." ) <*> strOption ( long "input-mut-type" <> short 'M' <> metavar "[All]|Silent|Replacement" <> value "All" <> help "Only include codons with all mutations (All),\ \ (or silent (Silent) or replacement (Replacement)).\ \ For use with input-codon-mut." ) <*> strOption ( long "input-change-field" <> short 'e' <> metavar "((FIELD (Int), VALUE (String))" <> value "" <> help "Change a field to a match, so a regex \"ch.*_\" to field 2\ \ of \">abc|brie_cheese_dude\" would result in\ \ \">abc|cheese_\". Useful for getting specific properties\ \ from a field. Can take a list of format\ \ \"(Int, String)&&(Int, String)&& ...\" and so on. The String\ \ is in regex format (POSIX extended).\ \ The first in the tuple is the location of the field\ \ (1 indexed, split by '|')." ) <*> strOption ( long "input-custom-filter" <> short 'f' <> metavar "((FIELD (Int), VALUE (String))" <> value "" <> help "A custom filter. Can take a list of format\ \ \"(Int, String)&&(Int, String)&& ...\" and so on. The String\ \ is in regex format (POSIX extended), so if the entire\ \ string 'a' is in the whole field, then you need to input\ \ '^a$' for the beginning to the end!\ \ The first in the tuple is the location of the field\ \ (1 indexed, split by '|'). If you want to apply to\ \ the entire header, either have the location as 0 or\ \ exclude the location altogether (, Day 3|IGHV3) for instance\ \ will match if the entire header is '>Day 3|IGHV3'.\ \ This list will be filtered one at a time, so you cannot\ \ get multiple filters, but you can remove multiple filters." ) <*> switch ( long "legacy-custom-germline" <> short 'G' <> help "Whether to apply the custom filter to germlines (>>)\ \ instead of sequences (>). LEGACY ONLY" ) <*> switch ( long "custom-remove" <> short 'm' <> help "Whether to remove the sequences containing the custom filter\ \ as opposed to remove the sequences that don't contain the\ \ filter" ) <*> option auto ( long "legacy-gene-allele-field" <> short 'V' <> metavar "[1]|INT" <> value 1 <> help "The field (1 indexed) of the gene allele name. LEGACY ONLY" ) <*> switch ( long "legacy-count" <> short 'v' <> help "Do not save output, just count genes and alleles from\ \ the results. Requires gene-allele-field. LEGACY ONLY" ) <*> strOption ( long "output" <> short 'o' <> metavar "FILE" <> value "" <> help "The output fasta file" ) -- | Parse the argument fieldInt fieldIntParser :: String -> [(Maybe Int, T.Text)] fieldIntParser "" = [] fieldIntParser s = map (\x -> (first x, second x)) . Split.splitOn "&&" $ s where first x | (head . Split.splitOn "," $ x) == "(" = Nothing | otherwise = Just (read (tail . head . Split.splitOn "," $ x) :: Int) second = T.pack . init . dropWhile (== ' ') . last . Split.splitOn "," -- | Check for amino acid isAminoAcid :: GeneticUnit -> Bool isAminoAcid AminoAcid = True isAminoAcid _ = False modifyFastaList :: Options -> IO () modifyFastaList opts = do hIn <- if null . input $ opts then return IO.stdin else IO.openFile (input opts) IO.ReadMode hOut <- if null . output $ opts then return IO.stdout else IO.openFile (output opts) IO.WriteMode let genUnit = aminoAcidsFlag opts stopRange = inputStopRange opts customFilters = fieldIntParser . inputCustomFilter $ opts changeFields = fieldIntParser . inputChangeField $ opts codonMut = inputCodonMut opts codonMutType = T.pack . inputCodonMutType $ opts mutType = T.pack . inputMutType $ opts -- Remove out of frame sequences seqInFrame x = not ( removeOutOfFrameFlag opts && (not . isAminoAcid $ genUnit) ) || isInFrame x -- Start filtering out sequences -- Include only custom filter sequences customFilter x = null customFilters || hasAllCustomFilters (customRemoveFlag opts) customFilters x -- Remove clones with stops in the range noStops x = not (removeStopsFlag opts) || hasNoStops genUnit (inputCodonTable opts) stopRange x -- Remove Ns from fasta list noNs = if removeTheNsFlag opts && (not . isAminoAcid $ genUnit) then removeN else id -- Change fasta headers with match changeHeader x = if not . null $ changeFields then changeAllFields x changeFields else x -- Get a specific region of the sequence cutSequence = case (inputStart opts, inputStop opts) of (Nothing, Nothing) -> id (start, stop) -> getRegionSequence start stop -- Remove non standard nucleotides removeUnknown = if removeUnknownNuc opts then removeUnknownNucs else id -- Trim sequence trim fs = if trimFrame opts then trimFasta genUnit ((read . T.unpack . flip getField fs) <$> inputInFrame opts) ((read . T.unpack . flip getField fs) <$> inputOutFrame opts) fs else fs -- Put mutations in the header includeMutations fs = case trackMutations opts of Nothing -> fs (Just x) -> addMutationsHeader (translateTrackMutations opts) x fs -- Fill in bad characters at the requested section with possible -- replacements fillIn = case inputFillIn opts of (-1, -1, 'X') -> id (f, s, c) -> fillInSequence f s c -- Convert to amino acids ntToaa = if convertToAminoAcidsFlag opts then convertToAminoAcidsFastaSequence (inputCodonTable opts) else id -- Include sequence length in header at the end includeLength = if addLengthFlag opts then addLengthHeader else id -- CLIP fasta specific filters and transformations -- Remove highly mutated sequences removeHighMutations = if removeHighlyMutatedFlag opts then filterHighlyMutatedEntry genUnit (inputCodonTable opts) else id -- Extract mutations to a certain degree getMutations = if codonMut > -1 then onlyMutations codonMut codonMutType mutType else id -- Extract mutations to a certain degree getFrequentMutations = if isJust (inputMutationCount opts) || isJust (inputMutationPercent opts) then frequentMutations (inputMinSequenceMutation opts) (inputMutationCount opts) (inputMutationPercent opts) else id -- Final order filterOrder x = seqInFrame x && customFilter x && noStops x transformOrder = includeLength . includeMutations . ntToaa . changeHeader . removeUnknown . trim . noNs . fillIn . cutSequence -- Specifically for germlines, as we don't want to change header or -- fill in the germline because that would make no sense in this -- case transformGermline = includeLength . includeMutations . ntToaa . removeUnknown . trim . noNs . cutSequence -- Specifically for CLIP fasta files transformOrderCLIP = getFrequentMutations . getMutations . removeHighMutations executePrintFasta x = mappend (showFasta x) (T.pack "\n") executePrintCLIPFasta x = mappend (printCloneEntry (removeGermlinesPreFlag opts) x) (T.pack "\n") -- Execute pipes if clipFastaFlag opts then runEffect $ ( ( pipesCLIPFasta (PT.fromHandle hIn) >-> P.map ( \(!germline, !fseqs) -> (germline, filter filterOrder fseqs) ) -- Filter sequences >-> P.map transformOrderCLIP -- Transform specifically for CLIP fasta >-> P.map ( \(!germline, !fseqs) -> ( transformGermline germline , map transformOrder fseqs ) ) -- Transform sequences, only do some for germline >-> P.map ( \(!germline, !fs) -> ( germline , filter (not . T.null . fastaSeq) fs ) ) -- Remove empty sequences >-> P.filter (not . null . snd) -- Remove empty clones >-> P.map executePrintCLIPFasta ) -- Print the results >> yield (T.pack "\n") ) -- want that newline at the end >-> PT.toHandle hOut else runEffect $ ( ( pipesFasta (PT.fromHandle hIn) >-> P.filter filterOrder -- Filter >-> P.map transformOrder -- Transform >-> P.filter (not . T.null . fastaSeq) -- Remove empty sequences >-> P.map executePrintFasta ) -- Print the results >> yield (T.pack "\n") ) -- want that newline at the end >-> PT.toHandle hOut -- Finish up by closing file if written unless (null . input $ opts) (IO.hClose hIn) unless (null . output $ opts) (IO.hClose hOut) -- Legacy function modifyFastaCloneMap :: Options -> IO () modifyFastaCloneMap opts = do contents <- if null . input $ opts then T.getContents else T.readFile . input $ opts -- No redundant newlines in sequence let genUnit = aminoAcidsFlag opts stopRange = inputStopRange opts codonMut = inputCodonMut opts codonMutType = T.pack . inputCodonMutType $ opts mutType = T.pack . inputMutType $ opts customFilters = fieldIntParser $ inputCustomFilter opts removeGermlinesFlag = if not . clipFastaFlag $ opts then True else removeGermlinesPreFlag opts -- Initiate CloneMap cloneMapFrames = if not . clipFastaFlag $ opts then addFillerGermlines . parsecFasta $ contents else parsecCLIPFasta contents -- Remove out of frame sequences cloneMapInFrame = if (removeOutOfFrameFlag opts && ( not . isAminoAcid $ genUnit ) ) then removeOutOfFrameSeqs cloneMapFrames else cloneMapFrames -- Remove Ns from CloneMap cloneMap = if (removeTheNsFlag opts && ( not . isAminoAcid $ genUnit ) ) then removeCLIPNs cloneMapInFrame else cloneMapInFrame -- Start filtering out sequences -- Include only custom filter sequences cloneMapCustom = if not . null $ customFilters then removeAllCustomFilters (customGermlineFlag opts) (customRemoveFlag opts) cloneMap customFilters else cloneMap -- Remove clones with stops in the range (cloneMapNoStops, errorString) = if removeStopsFlag opts then removeStopsCloneMap genUnit (inputCodonTable opts) stopRange cloneMapCustom else (cloneMapCustom, Nothing) -- Output Error if necessary case errorString of Nothing -> return () Just x -> error x -- Remove duplicate sequences let cloneMapNoDuplicates = if removeDuplicatesFlag opts then removeDuplicatesCloneMap cloneMapNoStops else cloneMapNoStops -- Remove clones that are highly mutated (cloneMapLowMutation, errorString2) = if (removeHighlyMutatedFlag opts) && ( not . isAminoAcid $ genUnit ) then filterHighlyMutated genUnit (inputCodonTable opts) cloneMapNoDuplicates else ( cloneMapNoDuplicates , Nothing ) -- Output Error if necessary case errorString2 of Nothing -> return () Just x -> error x -- Remove codons with codons with a certain number of mutations let cloneMapNoCodonMut = if codonMut > -1 then removeCodonMutCount codonMut codonMutType mutType cloneMapLowMutation else cloneMapLowMutation -- Remove empty clones cloneMapNoEmptyClones = removeEmptyClone cloneMapNoCodonMut -- Convert sequences to amino acids (cloneMapAA, errorString3) = if (convertToAminoAcidsFlag opts) && (not . isAminoAcid $ genUnit) then convertToAminoAcidsCloneMap (inputCodonTable opts) cloneMapNoEmptyClones else (cloneMapNoEmptyClones, Nothing) -- Output Error if necessary case errorString3 of Nothing -> return () Just x -> error x -- Break if there are no sequences to output case M.null cloneMapAA of True -> error "No sequences left! Nothing written." False -> return () -- What to do with results case countFlag opts of True -> do -- Print results let outputText = printSequenceCount (clipFastaFlag opts) (inputGeneAlleleField opts) cloneMapAA -- Print results to stdout T.putStrLn outputText False -> do -- Print results let outputText = if removeGermlinesFlag then printFastaNoGermline cloneMapAA else printFasta cloneMapAA -- Save results if null . output $ opts then T.putStrLn outputText else T.writeFile (output opts) outputText modifyFasta :: Options -> IO () modifyFasta opts = if legacyFlag opts then modifyFastaCloneMap opts else modifyFastaList opts main :: IO () main = execParser opts >>= modifyFasta where opts = info (helper <*> options) ( fullDesc <> progDesc "Modify fasta (and CLIP) files in several optional ways.\ \ Order of transformation goes: seqInFrame -> customFilter\ \ -> noStops -> removeHighMutations -> getMutations ->\ \ getFrequentMutations -> cutSequence -> fillIn -> noNs\ \ -> changeHeader -> ntToaa -> includeLength,\ \ so if you require a different\ \ order (which can change results dramatically), then do\ \ so one at a time through the wonderful world of piping." <> header "modify-fasta, Gregory W. Schwartz" )