{-# LANGUAGE NoImplicitPrelude #-} {-# LANGUAGE OverloadedStrings #-} {-# LANGUAGE TupleSections #-} module Hyrax.Abi.Write ( generateAb1s , generateAb1 , writeAb1 , putAbi , putTextStr , putHeader , putDirectory , readWeightedFasta ) where import Protolude import qualified Data.Text as Txt import qualified Data.Text.Encoding as TxtE import qualified Data.List as Lst import qualified Data.Binary as B import qualified Data.Binary.Put as B import qualified Data.ByteString as BS import qualified Data.ByteString.Lazy as BSL import qualified System.FilePath as FP import System.FilePath (()) import qualified System.Directory as Dir import Hyrax.Abi import Hyrax.Abi.Fasta data TraceData = TraceData { trData09G :: ![Int16] , trData10A :: ![Int16] , trData11T :: ![Int16] , trData12C :: ![Int16] , trValsPerBase :: !Int , trFasta :: !Text } deriving (Show) generateAb1s :: FilePath -> FilePath -> IO () generateAb1s source dest = do Dir.createDirectoryIfMissing True dest weighted <- readWeightedFastas source case weighted of Left e -> putText e Right rs -> do let ab1s = (\(n, r) -> (n, generateAb1 (n, r))) <$> rs traverse_ (\(name, ab1) -> BS.writeFile (dest Txt.unpack name <> ".ab1") $ BSL.toStrict ab1) ab1s generateTraceData :: [(Double, Text)] -> TraceData generateTraceData weighted = let weightedNucs' = (\(w, ns) -> (w,) . unIupac <$> Txt.unpack ns) <$> weighted weightedNucs = Lst.transpose weightedNucs' -- Values for a base that was present. This defines the shape of the chromatogram curve, and defines the number of values per base curve = [0, 0, 128, 512, 1024, 1024, 512, 128, 0, 0] valsPerBase = length curve -- Create the G & A & T & C traces data09G = concat $ getWeightedTrace curve 'G' <$> weightedNucs data10A = concat $ getWeightedTrace curve 'A' <$> weightedNucs data11T = concat $ getWeightedTrace curve 'T' <$> weightedNucs data12C = concat $ getWeightedTrace curve 'C' <$> weightedNucs -- Create fasta sequence for the trace fastaSeq = concat <$> (snd <<$>> weightedNucs) fasta = Txt.pack $ iupac fastaSeq in TraceData { trData09G = data09G , trData10A = data10A , trData11T = data11T , trData12C = data12C , trFasta = fasta , trValsPerBase = valsPerBase } where getWeightedTrace :: [Int] -> Char -> [(Double, [Char])] -> [Int16] getWeightedTrace curve nuc ws = let found = filter ((nuc `elem`) . snd) ws score' = foldl' (+) 0 $ fst <$> found score = min 1 . max 0 $ score' wave = floor . (score *) . fromIntegral <$> curve in wave generateAb1 :: (Text, [(Double, Text)]) -> BSL.ByteString generateAb1 (fName, sourceFasta) = let -- Create the G & A & T & C traces tr = generateTraceData sourceFasta data09G = B.runPut . createBytes $ trData09G tr data10A = B.runPut . createBytes $ trData10A tr data11T = B.runPut . createBytes $ trData11T tr data12C = B.runPut . createBytes $ trData12C tr valsPerBase = trValsPerBase tr generatedFastaLen = (Txt.length $ trFasta tr) -- The point that is the peak of the trace, i.e. mid point of trace for a single base midPeek = valsPerBase `div` 2 -- Get the peak locations for all bases peakLocations' = take generatedFastaLen [midPeek, valsPerBase + midPeek..] peakLocations = B.runPut (traverse_ (B.putInt16be . fromIntegral) peakLocations') -- PBAS = called bases pbas = BSL.fromStrict . TxtE.encodeUtf8 $ trFasta tr -- FWO_ = Base order, should be GACT for 3500 fwo = BSL.fromStrict . TxtE.encodeUtf8 $ "GATC" -- PDMF = Mobility file name pdfm = B.runPut $ putPStr "KB_3500_POP7_BDTv3.mob" -- Sample name (from the FASTA name) sampleName = B.runPut . putPStr . fst . Txt.breakOn "_" $ fName -- Dye signal strength sigStrength = B.runPut $ do B.putInt16be 53 B.putInt16be 75 B.putInt16be 79 B.putInt16be 48 comment = B.runPut $ putPStr "Generated by HyraxBio AB1 generator" header = Header { hName = "ABIF" , hVersion = 101 } root = Directory { dTagName = "tdir" , dTagNum = 1 , dElemTypeCode = 1023 , dElemTypeDesc = "root" , dElemType = ElemRoot , dElemSize = 28 , dDataOffset = 0 , dDataDebug = [] , dData = data09G , dDataSize = fromIntegral (BSL.length data09G) , dElemNum = generatedFastaLen * valsPerBase } dirs = [ Directory { dTagName = "DATA" -- G , dTagNum = 9 , dElemTypeCode = 4 , dElemTypeDesc = "short" , dElemType = ElemShort , dElemSize = 2 , dDataOffset = 0 , dDataDebug = [] , dData = data09G , dDataSize = fromIntegral (BSL.length data09G) , dElemNum = generatedFastaLen * valsPerBase } , Directory { dTagName = "DATA" -- A , dTagNum = 10 , dElemTypeCode = 4 , dElemTypeDesc = "short" , dElemType = ElemShort , dElemSize = 2 , dDataOffset = 0 , dDataDebug = [] , dData = data10A , dDataSize = fromIntegral (BSL.length data10A) , dElemNum = generatedFastaLen * valsPerBase } , Directory { dTagName = "DATA" -- T , dTagNum = 11 , dElemTypeCode = 4 , dElemTypeDesc = "short" , dElemType = ElemShort , dElemSize = 2 , dDataOffset = 0 , dDataDebug = [] , dData = data11T , dDataSize = fromIntegral (BSL.length data11T) , dElemNum = generatedFastaLen * valsPerBase } , Directory { dTagName = "DATA" -- C , dTagNum = 12 , dElemTypeCode = 4 , dElemTypeDesc = "short" , dElemType = ElemShort , dElemSize = 2 , dDataOffset = 0 , dDataDebug = [] , dData = data12C , dDataSize = fromIntegral (BSL.length data12C) , dElemNum = generatedFastaLen * valsPerBase } , Directory { dTagName = "FWO_" -- Base order , dTagNum = 1 , dElemTypeCode = 2 , dElemTypeDesc = "char" , dElemType = ElemChar , dElemSize = 1 , dDataOffset = 0 , dDataDebug = [] , dData = fwo , dDataSize = 4 , dElemNum = 4 } , Directory { dTagName = "LANE" -- Lane or capliary number , dTagNum = 1 , dElemTypeCode = 4 , dElemTypeDesc = "short" , dElemType = ElemShort , dElemSize = 2 , dElemNum = 1 , dDataSize = 2 , dDataOffset = 0 , dData = B.runPut $ B.putInt16be 1 , dDataDebug = [] } , Directory { dTagName = "PBAS" -- Called bases , dTagNum = 1 , dElemTypeCode = 2 , dElemTypeDesc = "char" , dElemType = ElemChar , dElemSize = 1 , dDataOffset = 0 , dDataDebug = [] , dData = pbas , dDataSize = generatedFastaLen , dElemNum = generatedFastaLen } , Directory { dTagName = "PDMF" -- Mobility file name , dTagNum = 1 , dElemTypeCode = 18 , dElemTypeDesc = "pString" , dElemType = ElemPString , dElemSize = 1 , dDataOffset = 0 , dDataDebug = [] , dData = pdfm , dDataSize = fromIntegral (BSL.length pdfm) , dElemNum = fromIntegral (BSL.length pdfm) } , Directory { dTagName = "PDMF" -- Mobility file name , dTagNum = 2 , dElemTypeCode = 18 , dElemTypeDesc = "pString" , dElemType = ElemPString , dElemSize = 1 , dDataOffset = 0 , dDataDebug = [] , dData = pdfm , dDataSize = fromIntegral (BSL.length pdfm) , dElemNum = fromIntegral (BSL.length pdfm) } , Directory { dTagName = "PLOC" -- Peak locations , dTagNum = 1 , dElemTypeCode = 4 , dElemTypeDesc = "short" , dElemType = ElemShort , dElemSize = 2 , dDataOffset = 0 , dDataDebug = [] , dData = peakLocations , dDataSize = fromIntegral (BSL.length peakLocations) , dElemNum = generatedFastaLen } , Directory { dTagName = "S/N%" -- Signal strength per dye , dTagNum = 1 , dElemTypeCode = 4 , dElemTypeDesc = "short" , dElemType = ElemShort , dElemSize = 2 , dElemNum = 4 , dDataOffset = 0 , dDataDebug = [] , dData = sigStrength , dDataSize = fromIntegral (BSL.length sigStrength) } , Directory { dTagName = "SMPL" -- Sample name , dTagNum = 1 , dElemTypeCode = 18 , dElemTypeDesc = "pString" , dElemType = ElemPString , dElemSize = 1 , dElemNum = 10 , dDataOffset = 0 , dDataDebug = [] , dData = sampleName , dDataSize = fromIntegral (BSL.length sampleName) } , Directory { dTagName = "CMNT" -- Comment , dTagNum = 1 , dElemTypeCode = 18 , dElemTypeDesc = "pString" , dElemType = ElemPString , dElemSize = 1 , dElemNum = 1 , dDataOffset = 0 , dDataDebug = [] , dData = comment , dDataSize = fromIntegral (BSL.length comment) } ] abi = Abi { aHeader = header , aRootDir = root , aDirs = dirs } in B.runPut (putAbi abi) createBytes :: [Int16] -> B.PutM () createBytes = traverse_ B.putInt16be writeAb1 :: Abi -> BSL.ByteString writeAb1 ab1 = B.runPut (putAbi ab1) putAbi :: Abi -> B.Put putAbi (Abi header root dirs) = do let startDataOffset = 128 let dataSize = foldl' (\acc i -> if i > 4 then acc + i else acc) 0 $ dDataSize <$> dirs putHeader header putDirectory (startDataOffset + dataSize) $ root { dDataSize = 28 * length dirs , dElemNum = length dirs } traverse_ B.putInt16be $ replicate 47 0 traverse_ (B.putLazyByteString . dData) $ filter (\d -> dDataSize d > 4) dirs foldM_ writeDir startDataOffset dirs where writeDir offset dir = do putDirectory offset dir pure $ if dDataSize dir > 4 then offset + dDataSize dir else offset readWeightedFastas :: FilePath -> IO (Either Text [(Text, [(Double, Text)])]) readWeightedFastas source = do files <- getFiles source let names = Txt.pack . FP.takeBaseName <$> files contents <- traverse BS.readFile files case sequenceA $ readWeightedFasta <$> contents of Left e -> pure . Left $ e Right rs -> pure . Right $ zip names rs readWeightedFasta :: ByteString -> Either Text [(Double, Text)] readWeightedFasta fastaData = case parseFasta $ TxtE.decodeUtf8 fastaData of Left e -> Left e Right fs -> getWeightedFasta fs where getWeightedFasta :: [Fasta] -> Either Text [(Double, Text)] getWeightedFasta fs = case sequenceA $ readWeighted <$> fs of Left e -> Left e Right r -> Right r readWeighted :: Fasta -> Either Text (Double, Text) readWeighted (Fasta hdr dta) = case (readMaybe . Txt.unpack $ hdr :: Maybe Double) of Just weight -> Right (min 1 . max 0 $ weight, Txt.strip dta) Nothing -> Left $ "Invalid header reading, expecting numeric weight, got: " <> hdr putTextStr :: Text -> B.Put putTextStr t = B.putByteString $ TxtE.encodeUtf8 t putPStr :: Text -> B.Put putPStr t = do B.putInt8 . fromIntegral $ Txt.length t B.putByteString $ TxtE.encodeUtf8 t putHeader :: Header -> B.Put putHeader h = do putTextStr $ hName h B.putInt16be . fromIntegral $ hVersion h putDirectory :: Int -> Directory -> B.Put putDirectory dirOffset d = do let name = Txt.justifyLeft 4 ' ' . Txt.take 4 $ dTagName d putTextStr name B.putInt32be . fromIntegral $ dTagNum d B.putInt16be . fromIntegral $ dElemTypeCode d B.putInt16be . fromIntegral $ dElemSize d B.putInt32be . fromIntegral $ dElemNum d B.putInt32be . fromIntegral $ dDataSize d -- data with a size >= 4 are written in the offset if dDataSize d > 4 then B.putInt32be . fromIntegral $ dirOffset else B.putLazyByteString . BSL.take 4 $ dData d <> "\0\0\0\0" B.putInt32be 0 -- reserved / datahandle getFiles :: FilePath -> IO [FilePath] getFiles p = do entries <- (p ) <<$>> Dir.listDirectory p filterM Dir.doesFileExist entries unIupac :: Char -> [Char] unIupac c = case c of 'T' -> "T" 'C' -> "C" 'A' -> "A" 'G' -> "G" 'U' -> "T" 'M' -> "AC" 'R' -> "AG" 'W' -> "AT" 'S' -> "CG" 'Y' -> "CT" 'K' -> "GT" 'V' -> "ACG" 'H' -> "ACT" 'D' -> "AGT" 'B' -> "CGT" 'N' -> "GATC" 'X' -> "GATC" _ -> "" iupac :: [[Char]] -> [Char] iupac ns = go <$> ns where go cs = let a = 'A' `elem` cs c = 'C' `elem` cs g = 'G' `elem` cs t = 'T' `elem` cs in case (a, c, g, t) of (True, False, False, False) -> 'A' (False, True, False, False) -> 'C' (False, False, True, False) -> 'G' (False, False, False, True ) -> 'T' (True, True, False, False) -> 'M' (True, False, True, False) -> 'R' (True, False, False, True ) -> 'W' (False, True, True, False) -> 'S' (False, True, False, True ) -> 'Y' (False, False, True, True ) -> 'K' (True, True, True, False) -> 'V' (True, True, False, True ) -> 'H' (True, False, True, True ) -> 'D' (False, True, True, True ) -> 'B' (True, True, True, True ) -> 'N' _ -> '_'