{- collapse-duplication
Gregory W. Schwartz

Collapse the duplication output into clones and return their frequencies.
-}

{-# LANGUAGE BangPatterns #-}
{-# LANGUAGE DuplicateRecordFields #-}
{-# LANGUAGE DataKinds #-}
{-# LANGUAGE DeriveGeneric #-}
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE TypeOperators #-}

module Main where

-- Standard
import Data.Bool
import Data.Function (on)
import Data.List
import Text.Read (readMaybe)
import qualified Data.Foldable as F
import qualified Data.Map.Strict as Map

-- Cabal
import Control.Lens
import Data.Csv
import Options.Generic
import qualified Data.ByteString.Lazy.Char8 as B
import qualified Data.List.Split as Split

-- Local
import Types
import Collapse
import Filter

-- | Command line arguments. Each field pairs its value type with the help
-- text shown to the user through the '<?>' type operator, so values must be
-- unwrapped with 'unHelpful' before use.
data Options = Options
    { output :: Maybe String
        <?> "(FILE) The output file."
    , collapseClone :: Bool
        <?> "Collapse the clone into a representative sequence instead of appending clone IDs to the reads."
    , wiggle :: Maybe Double
        <?> "([0] | DOUBLE) Highly recommended to play around with! The amount of wiggle room for defining clones. Instead of grouping exactly by same duplication and spacer location and length, allow for a position distance of this much (so no two reads have a difference of more than this number)."
    , filterCloneFrequency :: Double
        <?> "([0.01] | DOUBLE) Filter reads (or clones) from clones with too low a frequency. Default is 0.01 (1%)."
    , filterReadFrequency :: Maybe Double
        <?> "([Nothing] | DOUBLE) Filter duplications with too high a frequency (probably false positive if very high, for instance if over half of reads or 0.5). Converts these duplications to \"Normal\" sequences. Frequencies and counts are taken place before collapsing and filtering."
    , absolute :: Bool
        <?> "Whether to filter reads (or clones) from clones with too low an absolute number for filterReadFrequency instead frequency."
    , filterType :: Maybe String
        <?> "([Substring] | Position) Whether to filter reads with filterReadFrequency using the dSubstring field or the dLocations field."
    , method :: Maybe String
        <?> "([CompareAll] | Hierarchical) The method used to group together wiggle room reads. Compare all compares the current element with all elements in the previous sublist. Hierarchical is for clustering, but is most likely worse at this point in time."
    } deriving (Generic)

instance ParseRecord Main.Options

-- | Parse the textual value of a command line option through its 'Read'
-- instance. Unlike a bare 'read', a value that does not parse aborts with a
-- message naming both the option and the rejected value.
parseOption :: Read a => String -> String -> a
parseOption optionName raw =
    case readMaybe raw of
        Just parsed -> parsed
        Nothing     -> error
                     $ "Invalid value for option '"
                    ++ optionName
                    ++ "': "
                    ++ show raw

-- | Read the duplication CSV (with a header row) from standard input, group
-- the reads into clones, filter them by frequency and write the resulting
-- CSV to the requested file, or to standard output when no file is given.
main :: IO ()
main = do
    opts <- getRecord "collapse-duplication, Gregory W. Schwartz.\
                      \ Collapse the duplication output into clones and return\
                      \ their frequencies or clone IDs. Make sure format\
                      \ of the label field is SUBJECT_SAMPLE"

    -- A malformed CSV aborts the program with the decoder's error message.
    contents <- fmap (F.toList . snd . either error id . decodeByName)
                B.getContents

    let -- The option values below are only parsed when they are demanded.
        inputMethod =
            maybe CompareAll (parseOption "method") . unHelpful . method $ opts
        absOrFrac = bool Fraction Absolute . unHelpful . absolute $ opts
        entity = bool Read Clone . unHelpful . collapseClone $ opts
        inputFilterType =
            maybe Substring (parseOption "filterType")
                . unHelpful
                . filterType
                $ opts

        -- When a read frequency threshold is supplied, the input rows pass
        -- through convertHighFreqToNormal before being converted with
        -- printToInfo; otherwise they are converted directly.
        reads :: [ITDInfo]
        reads =
            maybe
                (fmap printToInfo contents)
                (\ readFreq ->
                    fmap printToInfo
                        . convertHighFreqToNormal
                            inputFilterType
                            absOrFrac
                            readFreq
                        $ contents
                )
                . fmap Frequency
                . unHelpful
                . filterReadFrequency
                $ opts

        freq = Frequency . unHelpful . filterCloneFrequency $ opts

        -- Without a wiggle value use gather; with one use gatherWiggle and
        -- the chosen method.
        grouped = case unHelpful . wiggle $ opts of
                    Nothing  -> gather reads
                    (Just x) -> gatherWiggle inputMethod (Wiggle x) reads

        labelMap = getLabelMap reads

        -- Look up the labelMap entry for the label of the first read of a
        -- group. Groups come from groupBy below and are therefore non-empty.
        -- NOTE(review): presumably every label is a key of labelMap, since
        -- it is built from the same reads -- confirm in getLabelMap.
        countFromGrouped :: [ITDInfo] -> Int
        countFromGrouped =
            (Map.!) labelMap . Label . (\x -> label (x :: ITDInfo)) . head

        clones :: [PrintWithCloneID]
        clones =
            filterFrequency freq
                . concat
                . concatMap
                    (\ (!cloneID, !xs) ->
                        fmap
                            (\ys ->
                                addCloneIDs
                                    entity
                                    cloneID
                                    (countFromGrouped ys)
                                    ys
                            )
                            xs
                    )
                -- Split every clone into runs of reads sharing a label.
                . fmap
                    ( over _2
                        ( groupBy
                            ((==) `on` (label :: ITDInfo -> B.ByteString))
                        . sortBy
                            (compare `on` (label :: ITDInfo -> B.ByteString))
                        )
                    )
                . concat
                -- Pair every inner group with a running ID, starting at 1.
                . over (unsafePartsOf (each . each)) (zip (fmap ID [1..]))
                $ grouped

        result = encodeDefaultOrderedByName clones

    case unHelpful . output $ opts of
        Nothing  -> B.putStr result
        (Just x) -> B.writeFile x result