{-# LANGUAGE ViewPatterns #-}

module Bio.Tools.Sequence.Primers.Optimization
 ( designPrimer
 ) where

import           Bio.Chain                            (fromList)
import           Bio.Chain.Alignment                  (AffineGap (..),
                                                       AffineGap2,
                                                       AlignmentResult (..),
                                                       LocalAlignment (..),
                                                       Operation (..), align)
import           Bio.NucleicAcid.Chain                (NucleicAcidChain (..))
import           Bio.NucleicAcid.Nucleotide           (Complementary (..),
                                                       DNA (..), symbol)
import           Bio.Tools.Sequence.Primers.Constants as Constants (annealingTemp,
                                                                    bindingRate,
                                                                    eTgtEFoldRel,
                                                                    maxPrimerGCContent,
                                                                    maxPrimerLength,
                                                                    maxPrimerMeltingTemp,
                                                                    minPrimerGCContent,
                                                                    minPrimerLength,
                                                                    minPrimerMeltingTemp,
                                                                    topN)
import           Bio.Tools.Sequence.Primers.Types     (Primer,
                                                       ScoredPrimer (..), eTgt,
                                                       eTgtEFold, gcContent,
                                                       meltingTemp, seq')
import           Bio.Tools.Sequence.ViennaRNA.Cofold  (cofold)
import           Bio.Tools.Sequence.ViennaRNA.Fold    (fold)
import           Control.Lens                         ((&), (.~), (^.))
import           Control.Monad                        (when)
import           Control.Monad.Except                 (MonadError, throwError)
import           Data.List                            (sortOn)
import           Data.Maybe                           (catMaybes)
import           Data.Text                            (Text)

-- | Given 'DNA' sequence and position in that sequence designs forward primer
-- for that sequence. Primer will start at the given position. @isCyclic@ marks
-- whether the sequence that we design primer for is cyclic or not.
--
-- Flag @isCyclic@ also defines type of algorithm that will be used to design primer.
--
-- If @isCyclic@ is set to False, ViennaRNA will be used to check that primer
-- has no off-target interactions with given sequence. This is pretty accurate method,
-- but when it is used to process long sequences (more than 1000 bps), it's quite slow.
--
-- If @isCyclic@ is set to True, algorithm that uses several heuristics will be
-- used to check that primer has no off-target interactions with given sequence.
-- This method is less accurate than using ViennaRNA to calculate energy of primer's interaction
-- with sequence, but much more faster.
--
-- We did such segregation of algorithms, because cyclic DNA sequences (plasmids)
-- are very long and non-cyclic sequences that we work with are no longer than 1000 bps.
--
designPrimer :: MonadError Text m => Bool -> [DNA] -> Int -> m ScoredPrimer
designPrimer isCyclic dna' pos = do
    when (pos >= length dna') $ throwError "Bio.Tools.Sequence.Primers.Optimization: given position is out of range."

    withE <- energyFilter . gcContentFilter . lengthFilter $ candidates

    case gcOnEndFilter withE of
      [] -> throwError badPrimersError
      l  -> pure $ head $ sortOn (fmap negate . (^. gcContent)) l
  where
    dna        = if isCyclic then dna' <> take Constants.maxPrimerLength dna' else dna'
    candidates = genTemperatureCandidates (drop pos dna) []

    badPrimersError :: Text
    badPrimersError = "Bio.Tools.Sequence.Primers.Optimization: all primers designed from given position are seriously flawed."

    -- | Generates candidate primers with melting temperature in needed range.
    --
    genTemperatureCandidates :: [DNA] -> [DNA] -> [ScoredPrimer]
    genTemperatureCandidates [] _          = []
    genTemperatureCandidates (x : xs) base = res
      where
        newBase        = base <> [x]
        newPrimer      = toScored newBase
        primerWithTemp = calcMeltingTemp newPrimer

        Just mTemp = primerWithTemp ^. meltingTemp

        res | mTemp < Constants.minPrimerMeltingTemp = genTemperatureCandidates xs newBase
            | mTemp > Constants.maxPrimerMeltingTemp = []
            | otherwise                              = primerWithTemp : genTemperatureCandidates xs newBase

        toScored :: Primer -> ScoredPrimer
        toScored p = ScoredPrimer p Nothing Nothing Nothing Nothing

    -- | Filters 'ScoredPrimer's based on their length.
    --
    lengthFilter :: [ScoredPrimer] -> [ScoredPrimer]
    lengthFilter = filter lengthPred
      where
        lengthPred :: ScoredPrimer -> Bool
        lengthPred sp = Constants.minPrimerLength <= ls && ls <= Constants.maxPrimerLength
          where
            ls = length $ sp ^. seq'

    -- | Leaves only 'ScoredPrimer's that have GC on their 3' end.
    -- If no such primers are found, filtering doesn't happen.
    --
    gcOnEndFilter :: [ScoredPrimer] -> [ScoredPrimer]
    gcOnEndFilter sps | null filtered = sps
                      | otherwise     = filtered
      where
        filtered = filter gcOnEndPred sps

        gcOnEndPred :: ScoredPrimer -> Bool
        gcOnEndPred = (`elem` [DC, DG]) . last . (^. seq')

    -- | Leaves primers whose GC-content is in needed range. If no such primers
    -- are found, leaves @Constants.topN@ primers with GC-content nearest to needed range.
    --
    gcContentFilter :: [ScoredPrimer] -> [ScoredPrimer]
    gcContentFilter sps | null filtered = take Constants.topN sorted
                        | otherwise     = filtered
      where
        primersWithGC = fmap calcGCContent sps
        filtered      = filter gcContentPred primersWithGC

        sorted = sortOn scoringFunc primersWithGC

        gcContentPred :: ScoredPrimer -> Bool
        gcContentPred sp = Constants.minPrimerGCContent <= gcc && gcc <= Constants.maxPrimerGCContent
          where
            Just gcc = sp ^. gcContent

        scoringFunc :: ScoredPrimer -> Float
        scoringFunc sp = min (abs $ Constants.minPrimerGCContent - gcc) (abs $ Constants.maxPrimerGCContent - gcc)
          where
            Just gcc = sp ^. gcContent

    -- | Filters primers using energy characteristics.
    --
    energyFilter :: MonadError Text m => [ScoredPrimer] -> m [ScoredPrimer]
    energyFilter sps = fmap eTgtEFoldFilter . eTgtFilter $ sps

    -- | Leaves only primers whose relation of energy of interaction with target
    -- to energy of forming a secondary structure is higher then @Constants.eTgtEFoldRel@.
    --
    eTgtEFoldFilter :: [ScoredPrimer] -> [ScoredPrimer]
    eTgtEFoldFilter s = filter eTgtEFoldPred . fmap calcTargetFold $ s
      where
        eTgtEFoldPred :: ScoredPrimer -> Bool
        eTgtEFoldPred sp = etef >= Constants.eTgtEFoldRel
          where
            Just etef = sp ^. eTgtEFold

    -- | Leaves only primers that bind to target on source sequence.
    --
    eTgtFilter :: MonadError Text m => [ScoredPrimer] -> m [ScoredPrimer]
    eTgtFilter s | null res  = throwError badPositionError
                 | otherwise = pure res
      where
        res = catMaybes $ fmap (calcTarget isCyclic dna pos) s

        badPositionError :: Text
        badPositionError = "Bio.Tools.Sequence.Primers.Optimization: all primers designed from given position don't bind to target."

-- | Calculates melting temperature for 'ScoredPrimer'.
-- Temperature is calculated using the following formula: 4 * (C + G) + 2 * (A + T).
--
calcMeltingTemp :: ScoredPrimer -> ScoredPrimer
calcMeltingTemp sp = sp & meltingTemp .~ Just temperature
  where
    temperature = sum $ fmap tempForNuc (sp ^. seq')

    tempForNuc :: DNA -> Int
    tempForNuc DC = 4
    tempForNuc DG = 4
    tempForNuc DA = 2
    tempForNuc DT = 2

-- | Calculates GC-content for 'ScoredPrimer'.
-- GC-content is calulcated using the following formula: (G + C) / (A + T + G + C).
--
calcGCContent :: ScoredPrimer -> ScoredPrimer
calcGCContent sp = sp & gcContent .~ Just content
  where
    nGC     = length $ filter (\x -> x == DC || x == DG) $ sp ^. seq'
    content = fromIntegral nGC / fromIntegral (length $ sp ^. seq')

-- | Calculates relation of 'ScoredPrimers's target interaction energy to
-- its folding energy.
--
calcTargetFold :: ScoredPrimer -> ScoredPrimer
calcTargetFold sp = sp & eTgtEFold .~ Just tgtToFold
  where
    foldingEnergy = fst $ fold Constants.annealingTemp $ sp ^. seq'

    -- @calcTargetFold@ is used only after 'ScoredPrimer's target interaction energy
    -- has been calculated
    Just tgt  = sp ^. eTgt
    tgtToFold = abs $ tgt / toEps foldingEnergy

-- | Calculates energy of interaction of given 'DNA' and given 'DNA' sequence's
-- complementary strand.
--
cofoldEnergy :: [DNA] -> [DNA] -> Float
cofoldEnergy sp s = abs $ fst $ cofold Constants.annealingTemp (reverse sp, fmap cNA s)

-- | Local alignment algorithm that aligns arguments based on their complementarity
-- and doesn't allow gaps on the second argument of the alignment (that is primer in our terms).
--
-- We don't allow gaps on the primer during alignment, because this situation
-- is not physical: it means that hairpins can appear on the plasmid.
--
localAlignment :: LocalAlignment AffineGap2 DNA DNA
localAlignment = LocalAlignment complementScoring (plasmidGap, primerGap)
  where
    plasmidGap :: AffineGap
    plasmidGap = AffineGap (-5) (-1)

    primerGap :: AffineGap
    primerGap = AffineGap (-1000) (-1000)

    complementScoring :: DNA -> DNA -> Int
    complementScoring DA (cNA -> DT) = 3
    complementScoring DT (cNA -> DA) = 3
    complementScoring DC (cNA -> DG) = 5
    complementScoring DG (cNA -> DC) = 5
    complementScoring _ _            = -3

-- | Calculates energy of interaction of primer @sp@ with sequence @dna@.
-- If @sp@ doesn't bind to target area on @dna@ (target area starts at @sourceInd@),
-- then Nothing is returned.
--
-- There are different algorithms to check off-target interaction depending of whether
-- @dna@ is cyclic or not. It is defined by the first parameter.
--
calcTarget :: Bool -> [DNA] -> Int -> ScoredPrimer -> Maybe ScoredPrimer
-- algorithm for cyclic sequences
calcTarget True dna sourceInd sp | null offTargets || tgtE > maxOffTarget = res
                                 | otherwise                              = Nothing
  where
    primerSeq     = sp ^. seq'
    primerLen     = length primerSeq

    -- position in the @dna@, where primer reaches half of its length
    primerLenHalf = sourceInd + primerLen `div` 2

    -- cut all @matchingSeqs@ into k-mers of all sizes in range [@primerLenHalf@; @primerLen@].
    offTargets = concatMap seqToKMers matchingSeqs

    tgtE         = cofoldEnergy primerSeq primerSeq
    maxOffTarget = maximum $ fmap (cofoldEnergy primerSeq) offTargets

    res = pure $ sp & eTgt .~ Just tgtE

    seqToKMers :: [DNA] -> [[DNA]]
    seqToKMers s = concatMap (toKMers s) [primerLenHalf .. primerLen]
      where
        toKMers :: [a] -> Int -> [[a]]
        toKMers l k | length l < k = []
                    | otherwise    = take k l : toKMers (tail l) k

    -- | All parts of @dna@ that align the best on given @sp@ using algorithm @localAlignment@.
    -- We consider complementary strands of these parts to be most energetically preferrable
    -- off-targets for our primer.
    --
    matchingSeqs :: [[DNA]]
    matchingSeqs = fmap matchingSeq [invertedDna, dnaRevComp]
      where
        -- we invert the cyclic @dna@ in such way that target area is being cut in half,
        -- so that it won't be considered as off-target interaction
        invertedDna | [x]    <- invertPoints = drop x dna <> drop Constants.maxPrimerLength (take x dna)
                    | [x, y] <- invertPoints = take (y - x) . drop x $ dna
                    | otherwise              = error "This branch is never visited."

        -- here we check that @primerLenHalf@ is not in the part of sequence,
        -- that was appended at the end to create cyclic sequence.
        -- If it's not in this part, invert at @primerLenHalf@.
        -- Otherwise consider everything in between @modPos@ and (@modPos@ + (length @dna@ - @Constants.maxPrimerLength@))
        -- as inverted sequence.
        modPos       = primerLenHalf `mod` (length dna - Constants.maxPrimerLength)
        invertPoints | modPos >= Constants.maxPrimerLength = [primerLenHalf]
                     | otherwise                           = [modPos, modPos + (length dna - Constants.maxPrimerLength)]

        -- off-target interaction could happen in reversed direction
        dnaRevComp = reverse $ fmap cNA dna

        matchingSeq :: [DNA] -> [DNA]
        matchingSeq dna' = res'
          where
            ar = alignment $ alignmentFunc dna' primerSeq

            traceStart = toCoord $ last ar
            traceEnd   = toCoord $ head ar

            -- (traceStart, traceEnd) is inclusive range that describes off-target area
            -- in @dna'@. We want to extend that range by @primerLen@ in both directions
            -- to consider different possibilities
            (l, r)  = (max 0 (traceEnd - primerLen), min (length dna' - 1) (traceStart + primerLen))
            res'    = take (r - l + 1) $ drop l dna'

        toCoord :: Operation Int Int -> Int
        toCoord = getI

        alignmentFunc :: [DNA] -> [DNA] -> AlignmentResult (NucleicAcidChain Int DNA) (NucleicAcidChain Int DNA)
        alignmentFunc plasmid primer = alRes
          where
            plasmidC = toChain plasmid
            primerC  = toChain primer

            alRes = align localAlignment plasmidC primerC

            toChain :: [DNA] -> NucleicAcidChain Int DNA
            toChain = NucleicAcidChain . fromList
-- algorithm for linear sequences
calcTarget False dna sourceInd sp | Just e <- primerTargetEnergy = pure $ sp & eTgt .~ Just e
                                  | otherwise = Nothing
  where
    primerTargetEnergy :: Maybe Float
    primerTargetEnergy | abs e < revE = Nothing
                       | Just cnt <- actualBindingRateM, checkBindingRate cnt = Just e
                       | otherwise    = Nothing
      where
        primerSeq = sp ^. seq'
        spStr     = symbol <$> primerSeq

        compPrimerSeq = fmap cNA dna

        (e, bindStr) = cofold Constants.annealingTemp (reverse primerSeq, compPrimerSeq)
        primerLength = length spStr

        -- energy of primer's interaction revesed dna strand
        revE = cofoldEnergy primerSeq (reverse $ fmap cNA dna)

        -- inclusive range in which target binding site is being contained in @bindStr@
        (lInd, rInd) = (primerLength + sourceInd, lInd + primerLength - 1)

        bindWithInds       = zip bindStr [0..]
        actualBindingRateM = calcMatchesInBindingSite [] 0 bindWithInds

        -- | Calculates number of nucleotides in target binding site that bind
        -- to primer. Also checks that last nucleotide of primer binds to last nucleotide
        -- of target binding site. If this condition is not satisfied, Nothing is returned.
        --
        -- @bindStr@ is used for these calculations. @bindStr@ represents interaction between primer
        -- and sequence in form of a dot plot. Dot plot also contains balanced bracket sequence
        -- that shows how nucleotides bind to each other.
        --
        calcMatchesInBindingSite :: [Int] -> Int -> [(Char, Int)] -> Maybe Int
        calcMatchesInBindingSite _ cnt []           = Just cnt
        calcMatchesInBindingSite stack cnt (x : xs) -- if we encounter close bracket, then we check that its position is in target binding site
                                                    -- and position of open bracket that corresponds to it is in primer
                                                    | c == closeBracket, lInd <= i && i < rInd && i' < primerLength = calcMatchesInBindingSite l (cnt + 1) xs
                                                    -- close bracket in target corresponds to non-primer interaction, we skip this bracket
                                                    | c == closeBracket, lInd <= i && i < rInd = calcMatchesInBindingSite l cnt xs
                                                    -- next two conditions check that 3' end of primer binds to end of the target binding site
                                                    | c == closeBracket, i == rInd, i' == 0 = Just $ cnt + 1
                                                    | c == closeBracket, i == rInd, i' /= 0 = Nothing
                                                    -- close bracket is out of target range, we skip it
                                                    | c == closeBracket = calcMatchesInBindingSite l cnt xs
                                                    -- open brackets are put on stack
                                                    | c == openBracket = calcMatchesInBindingSite (i : stack) cnt xs
                                                    -- we ignore dots
                                                    | otherwise        = calcMatchesInBindingSite stack cnt xs
          where
            (c, i) = x

            -- since the bracket sequence is balanced, we will get to this pattern-matching only if
            -- there is something on top of the stack
            (i' : l) = stack

        openBracket :: Char
        openBracket = '('

        closeBracket :: Char
        closeBracket = ')'

        -- | Checks that not less then @Constants.bindingRate@ * 100 precents of primer's nucleotides
        -- interact with target.
        --
        checkBindingRate :: Int -> Bool
        checkBindingRate cnt = fromIntegral cnt / fromIntegral primerLength >= Constants.bindingRate

--------------------------------------------------------------------------------
-- Utility functions.
--------------------------------------------------------------------------------

-- | This function is used to avoid dividing by zero.
--
toEps :: Float -> Float
toEps x | abs x < eps = eps
        | otherwise   = x
  where
    eps = 0.01