{-# LANGUAGE NoImplicitPrelude #-} {-# LANGUAGE OverloadedStrings #-} {-# LANGUAGE TupleSections #-} {-| Module : Hyax.Abif.Generate Description : Generate AB1 from a weighted FASTA Copyright : (c) HyraxBio, 2018 License : BSD3 Maintainer : andre@hyraxbio.co.za, andre@andrevdm.com Functionality for generating AB1 files from an input FASTA. These AB1s are supported by both PHRED and recall, if you are using other software you may need to add additional required sections. = Weighted reads The input FASTA files have "weighted" reads. The name for each read is an value between 0 and 1 which specifies the height of the peak relative to a full peak. == Single read The most simple example is a single FASTA with a single read with a weight of 1 @ > 1 ACTG @ <> The chromatogram for this AB1 shows perfect traces for the input `ACTG` nucleotides with a full height peak. == Mixes & multiple reads The source FASTA can have multiple reads, which results in a chromatogram with mixes @ > 1 ACAG > 0.3 ACTG @ <> There is an `AT` mix at the third nucleotide. The first read has a weight of 1 and the second a weight of 0.3. The chromatogram shows the mix and the `T` with a lower peak (30% of the `A` peak) == Summing weights - The weigh of a read specifies the intensity of the peak from 0 to 1. - Weights for each position are added to a maximum of 1 per nucleotide - You can use `_` as a "blank" nucleotide, in which only the nucleotides from other reads will be considered E.g. @ > 1 ACAG > 0.3 _GT > 0.2 _G @ <> See README.md for additional details and examples -} module Hyrax.Abif.Generate ( generateAb1s , generateAb1 , readWeightedFasta , iupac , unIupac ) where import Protolude import qualified Data.Text as Txt import qualified Data.Text.Encoding as TxtE import qualified Data.List as Lst import qualified Data.Binary.Put as B import qualified Data.ByteString as BS import qualified Data.ByteString.Lazy as BSL import qualified System.FilePath as FP import System.FilePath (()) import qualified System.Directory as Dir import Hyrax.Abif import Hyrax.Abif.Write import Hyrax.Abif.Fasta data TraceData = TraceData { trData09G :: ![Int16] , trData10A :: ![Int16] , trData11T :: ![Int16] , trData12C :: ![Int16] , trValsPerBase :: !Int , trFasta :: !Text } deriving (Show) -- | Generate a set of AB1s. One for every FASTA found in the source directory generateAb1s :: FilePath -> FilePath -> IO () generateAb1s source dest = do Dir.createDirectoryIfMissing True dest weighted <- readWeightedFastas source case weighted of Left e -> putText e Right rs -> do let ab1s = (\(n, r) -> (n, generateAb1 (n, r))) <$> rs traverse_ (\(name, ab1) -> BS.writeFile (dest Txt.unpack name <> ".ab1") $ BSL.toStrict ab1) ab1s -- | Create the 'ByteString' data for an AB1 given the data from a weighted FASTA (see 'readWeightedFasta') generateAb1 :: (Text, [(Double, Text)]) -> BSL.ByteString generateAb1 (fName, sourceFasta) = let tr = generateTraceData sourceFasta valsPerBase = trValsPerBase tr generatedFastaLen = (Txt.length $ trFasta tr) -- The point that is the peak of the trace, i.e. mid point of trace for a single base midPeek = valsPerBase `div` 2 -- Get the peak locations for all bases peakLocations = take generatedFastaLen [midPeek, valsPerBase + midPeek..] -- Sample name (from the FASTA name) sampleName = fst . Txt.breakOn "_" $ fName -- Create the ABIF directories dirs = [ mkData 9 $ trData09G tr -- G , mkData 10 $ trData10A tr -- A , mkData 11 $ trData11T tr -- T , mkData 12 $ trData12C tr -- C , mkBaseOrder BaseG BaseA BaseT BaseC -- Base order, should be GATC for 3500 , mkLane 1 -- Lane or capliary number , mkCalledBases $ trFasta tr -- Called bases , mkMobilityFileName 1 "KB_3500_POP7_BDTv3.mob" -- Mobility file name , mkMobilityFileName 2 "KB_3500_POP7_BDTv3.mob" -- Mobility file name , mkPeakLocations $ fromIntegral <$> peakLocations -- Peak locations , mkDyeSignalStrength 53 75 79 48 -- Signal strength per dye , mkSampleName sampleName -- Sample name , mkComment "Generated by HyraxBio AB1 generator" ] -- The ABIF abif = Abif { aHeader = mkHeader , aRootDir = mkRoot , aDirs = dirs } in -- Generate the data B.runPut (putAbif abif) -- | Generate the traces for the AB1 from the parsed weighted FASTA generateTraceData :: [(Double, Text)] -> TraceData generateTraceData weighted = let weightedNucs' = (\(w, ns) -> (w,) . unIupac <$> Txt.unpack ns) <$> weighted weightedNucs = Lst.transpose weightedNucs' -- Values for a base that was present. This defines the shape of the chromatogram curve, and defines the number of values per base curve = [0, 0, 128, 512, 1024, 1024, 512, 128, 0, 0] valsPerBase = length curve -- Create the G, A, T and C traces data09G = concat $ getWeightedTrace curve 'G' <$> weightedNucs data10A = concat $ getWeightedTrace curve 'A' <$> weightedNucs data11T = concat $ getWeightedTrace curve 'T' <$> weightedNucs data12C = concat $ getWeightedTrace curve 'C' <$> weightedNucs -- Create fasta sequence for the trace fastaSeq = concat <$> (snd <<$>> weightedNucs) fasta = Txt.pack $ iupac fastaSeq in TraceData { trData09G = data09G , trData10A = data10A , trData11T = data11T , trData12C = data12C , trFasta = fasta , trValsPerBase = valsPerBase } where getWeightedTrace :: [Int] -> Char -> [(Double, [Char])] -> [Int16] getWeightedTrace curve nuc ws = let found = filter ((nuc `elem`) . snd) ws score' = foldl' (+) 0 $ fst <$> found score = min 1 . max 0 $ score' wave = floor . (score *) . fromIntegral <$> curve in wave -- | Read a weighted FASTA file. See the module comments for the expected format. -- See the module documentation for details on the format of the weighted FASTA -- -- e.g. weighted FASTA -- -- @ -- > 1 -- ACAG -- > 0.3 -- _GT -- > 0.2 -- _G -- @ -- -- -- The result data has the type -- -- @ -- ('Text', [('Double', 'Text')]) -- ^ ^ ^ -- | | | -- file name -------------+ | +---- read -- | -- +---- weight -- @ -- readWeightedFasta :: ByteString -> Either Text [(Double, Text)] readWeightedFasta fastaData = case parseFasta $ TxtE.decodeUtf8 fastaData of Left e -> Left e Right fs -> getWeightedFasta fs where getWeightedFasta :: [Fasta] -> Either Text [(Double, Text)] getWeightedFasta fs = case sequenceA $ readWeighted <$> fs of Left e -> Left e Right r -> Right r readWeighted :: Fasta -> Either Text (Double, Text) readWeighted (Fasta hdr dta) = case (readMaybe . Txt.unpack $ hdr :: Maybe Double) of Just weight -> Right (min 1 . max 0 $ weight, Txt.strip dta) Nothing -> Left $ "Invalid header reading, expecting numeric weight, got: " <> hdr -- | Read all FASTA files in a directory readWeightedFastas :: FilePath -> IO (Either Text [(Text, [(Double, Text)])]) readWeightedFastas source = do files <- filter (Txt.isSuffixOf ".fasta" . Txt.pack) <$> getFiles source let names = Txt.pack . FP.takeBaseName <$> files contents <- traverse BS.readFile files case sequenceA $ readWeightedFasta <$> contents of Left e -> pure . Left $ e Right rs -> pure . Right $ zip names rs -- | Find all files in a directory getFiles :: FilePath -> IO [FilePath] getFiles p = do entries <- (p ) <<$>> Dir.listDirectory p filterM Dir.doesFileExist entries -- | Convert a IUPAC ambiguity code to the set of nucleotides it represents unIupac :: Char -> [Char] unIupac c = case c of 'T' -> "T" 'C' -> "C" 'A' -> "A" 'G' -> "G" 'U' -> "T" 'M' -> "AC" 'R' -> "AG" 'W' -> "AT" 'S' -> "CG" 'Y' -> "CT" 'K' -> "GT" 'V' -> "ACG" 'H' -> "ACT" 'D' -> "AGT" 'B' -> "CGT" 'N' -> "GATC" 'X' -> "GATC" _ -> "" -- | Given a set of nucleotides get the IUPAC ambiguity code iupac :: [[Char]] -> [Char] iupac ns = go <$> ns where go cs = let a = 'A' `elem` cs c = 'C' `elem` cs g = 'G' `elem` cs t = 'T' `elem` cs in case (a, c, g, t) of (True, False, False, False) -> 'A' (False, True, False, False) -> 'C' (False, False, True, False) -> 'G' (False, False, False, True ) -> 'T' (True, True, False, False) -> 'M' (True, False, True, False) -> 'R' (True, False, False, True ) -> 'W' (False, True, True, False) -> 'S' (False, True, False, True ) -> 'Y' (False, False, True, True ) -> 'K' (True, True, True, False) -> 'V' (True, True, False, True ) -> 'H' (True, False, True, True ) -> 'D' (False, True, True, True ) -> 'B' (True, True, True, True ) -> 'N' _ -> '_'