module Bio.SelectSequencesLibrary (
preprocessClustalForRNAz,
preprocessClustalForRNAzExternal,
preprocessClustalForRNAcodeExternal
)
where
import System.Process
import Data.List
import Bio.ClustalParser
import Data.Either.Unwrap
import Data.Maybe
import qualified Data.Vector as V
import Data.Matrix
import qualified Data.Text as T
import qualified Data.Text.IO as TI
import Text.Printf
import qualified Data.Text.Metrics as TM
textIdentity :: T.Text -> T.Text -> Double
textIdentity text1 text2 = identityPercent
where distance = TM.hamming text1 text2
maximumDistance = maximum [T.length text1, T.length text2]
distanceDouble = toInteger ( fromJust distance )
identityPercent = 1 (fromIntegral distanceDouble/fromIntegral maximumDistance)
preprocessClustalForRNAzExternal :: String -> String -> Int -> Int -> Int -> Bool -> IO (Either String (String,String))
preprocessClustalForRNAzExternal clustalFilepath outputPath seqenceNumber optimalIdentity maximalIdenity referenceSequence = do
clustalText <- TI.readFile clustalFilepath
let reformatedClustalPath = outputPath ++ "result.reformated"
let reformatedClustalText = T.map reformatAln clustalText
TI.writeFile reformatedClustalPath reformatedClustalText
let selectedClustalpath = outputPath ++ "result.selected"
let sequenceNumberOption = " -n " ++ show seqenceNumber ++ " "
let optimalIdentityOption = " -i " ++ show optimalIdentity ++ " "
let maximalIdentityOption = " --max-id=" ++ show maximalIdenity ++ " "
let referenceSequenceOption = if referenceSequence then " " else " -x "
let syscall = "rnazSelectSeqs.pl " ++ reformatedClustalPath ++ " " ++ sequenceNumberOption ++ optimalIdentityOption ++ maximalIdentityOption ++ referenceSequenceOption ++ " >" ++ selectedClustalpath
system syscall
selectedClustalText <- readFile selectedClustalpath
return (Right ([],selectedClustalText))
preprocessClustalForRNAcodeExternal :: String -> String -> Int -> Int -> Int -> Bool -> IO (Either String (String,String))
preprocessClustalForRNAcodeExternal clustalFilepath outputPath seqenceNumber optimalIdentity maximalIdenity referenceSequence = do
clustalText <- TI.readFile clustalFilepath
let reformatedClustalPath = outputPath ++ "result.reformated"
let clustalTextLines = T.lines clustalText
let headerClustalTextLines = T.unlines (take 2 clustalTextLines)
let headerlessClustalTextLines = T.unlines (drop 2 clustalTextLines)
let reformatedClustalText = T.map reformatRNACodeAln headerlessClustalTextLines
TI.writeFile reformatedClustalPath (headerClustalTextLines `T.append` T.singleton '\n' `T.append` reformatedClustalText)
let selectedClustalpath = outputPath ++ "result.selected"
let sequenceNumberOption = " -n " ++ show seqenceNumber ++ " "
let optimalIdentityOption = " -i " ++ show optimalIdentity ++ " "
let maximalIdentityOption = " --max-id=" ++ show maximalIdenity ++ " "
let referenceSequenceOption = if referenceSequence then " " else " -x "
let syscall = "rnazSelectSeqs.pl " ++ reformatedClustalPath ++ " " ++ sequenceNumberOption ++ optimalIdentityOption ++ maximalIdentityOption ++ referenceSequenceOption ++ " >" ++ selectedClustalpath
system syscall
selectedClustalText <- readFile selectedClustalpath
return (Right ([],selectedClustalText))
preprocessClustalForRNAz :: String -> String -> Int -> Double -> Double -> Bool -> String -> IO (Either String (String,String))
preprocessClustalForRNAz clustalFilepath outputPath seqenceNumber optimalIdentity maximalIdenity referenceSequence reformatOption = do
clustalText <- TI.readFile clustalFilepath
let clustalTextLines = T.lines clustalText
parsedClustalInput <- readClustalAlignment clustalFilepath
let selectedClustalpath = outputPath ++ "result.selected"
if length clustalTextLines > 5
then
if isRight parsedClustalInput
then do
let (idMatrix,filteredClustalInput) = rnaCodeSelectSeqs2 (fromRight parsedClustalInput) seqenceNumber optimalIdentity maximalIdenity referenceSequence reformatOption
writeFile selectedClustalpath (show filteredClustalInput)
let formatedIdMatrix = show (fmap formatIdMatrix idMatrix)
return (Right (formatedIdMatrix,selectedClustalpath))
else return (Left (show (fromLeft parsedClustalInput)))
else do
let clustalLines = T.lines clustalText
let headerClustalTextLines = T.unlines (take 2 clustalLines)
let headerlessClustalTextLines = T.unlines (drop 2 clustalLines)
let reformatedClustalText = T.map reformatRNACodeAln headerlessClustalTextLines
TI.writeFile selectedClustalpath (headerClustalTextLines `T.append` T.singleton '\n' `T.append` reformatedClustalText)
return (Right ([],clustalFilepath))
formatIdMatrix :: Maybe (Int,Int,Double) -> String
formatIdMatrix (Just (_,_,c)) = printf "%.2f" c
formatIdMatrix _ = "-"
rnaCodeSelectSeqs2 :: ClustalAlignment -> Int -> Double -> Double -> Bool -> String -> (Matrix (Maybe (Int,Int,Double)),ClustalAlignment)
rnaCodeSelectSeqs2 currentClustalAlignment targetSeqNumber optimalIdentity maximalIdentity referenceSequence reformatOption = (identityMatrix,newClustalAlignment)
where entryVector = V.fromList (alignmentEntries currentClustalAlignment)
entrySequences = V.map entryAlignedSequence entryVector
entryReformatedSequences = V.map (T.map reformatRNACodeAln) entrySequences
totalSeqNumber = V.length entryVector
identityMatrix = computeSequenceIdentityMatrix entryReformatedSequences
entryIdentityVector = V.map fromJust (V.filter isJust (getMatrixAsVector identityMatrix))
entryIdentities = V.toList entryIdentityVector
entriesToDiscard = preFilterIdentityMatrix maximalIdentity targetSeqNumber totalSeqNumber [] entryIdentities
allEntries = [1..totalSeqNumber]
prefilteredEntries = allEntries \\ entriesToDiscard
costList = map (computeEntryCost optimalIdentity entryIdentityVector) prefilteredEntries
sortedCostList = sortBy compareEntryCost2 costList
sortedIndices = map fst sortedCostList
selectedEntryIndices = selectEntryIndices referenceSequence targetSeqNumber sortedIndices
selectedEntries = map (\ind -> entryVector V.! (ind1)) selectedEntryIndices
selectedEntryHeader = map entrySequenceIdentifier selectedEntries
selectedReformatFunction = selectReformatFunction reformatOption
reformatedSelectedEntryHeader = map (T.map selectedReformatFunction) selectedEntryHeader
selectedEntrySequences = map (\ind -> entryReformatedSequences V.! (ind1)) selectedEntryIndices
gapfreeEntrySequences = T.transpose (filter (not . T.all isGap) (T.transpose selectedEntrySequences))
gapfreeEntries = map (uncurry ClustalAlignmentEntry)(zip reformatedSelectedEntryHeader gapfreeEntrySequences)
emptyConservationTrack = setEmptyConservationTrack gapfreeEntries (conservationTrack currentClustalAlignment)
newClustalAlignment = currentClustalAlignment {alignmentEntries = gapfreeEntries, conservationTrack = emptyConservationTrack}
selectReformatFunction :: String -> (Char -> Char)
selectReformatFunction reformatOption
| reformatOption == "RNAcode" = reformatRNACodeId
| otherwise = id
selectEntryIndices :: Bool -> Int -> [Int] -> [Int]
selectEntryIndices referenceSequence targetSeqNumber sortedIndices
| referenceSequence = if (1 :: Int) `elem` firstX then firstRefX else 1:firstXm1
| otherwise = firstX
where firstXm1 = take (targetSeqNumber 1) sortedIndices
firstX = take targetSeqNumber sortedIndices
firstRefX =(1 :: Int):(filter (\i -> i /= (1 :: Int)) firstX)
setEmptyConservationTrack :: [ClustalAlignmentEntry] -> T.Text -> T.Text
setEmptyConservationTrack alnentries currentConservationTrack
| null alnentries = currentConservationTrack
| otherwise = newConservationTrack
where trackLength = T.length (entryAlignedSequence (head alnentries))
newConservationTrack = T.replicate (trackLength + 0) (T.pack " ")
isGap :: Char -> Bool
isGap a
| a == '-' = True
| otherwise = False
computeEntryCost :: Double -> V.Vector (Int,Int,Double) -> Int -> (Int,Double)
computeEntryCost optimalIdentity allIdentities currentIndex = (currentIndex,entryCost)
where entryCost = V.sum (V.map (computeCost optimalIdentity) entryIdentities)
entryIdentities = getEntryIdentities currentIndex allIdentities
getEntryIdentities :: Int -> V.Vector (Int,Int,Double) -> V.Vector (Int,Int,Double)
getEntryIdentities currentIndex allIdentities = V.filter (isIIdx currentIndex) allIdentities V.++ V.filter (isJIdx currentIndex) allIdentities
isIIdx :: Int -> (Int,Int,Double) -> Bool
isIIdx currentIdx (i,_,_) = currentIdx == i
isJIdx :: Int -> (Int,Int,Double) -> Bool
isJIdx currentIdx (_,j,_) = currentIdx == j
computeCost :: Double -> (Int,Int,Double) -> Double
computeCost optimalIdentity (_,_,c) = (c optimalIdentity) * (c optimalIdentity)
compareEntryCost2 :: (Int, Double) -> (Int, Double) -> Ordering
compareEntryCost2 (_,costA) (_,costB) = compare costA costB
preFilterIdentityMatrix :: Double -> Int -> Int-> [Int] -> [(Int,Int,Double)] -> [Int]
preFilterIdentityMatrix identityCutoff minSeqNumber totalSeqNumber filteredIds entryIdentities
| (totalSeqNumber length filteredIds) <= minSeqNumber = []
| identityCutoff == (100 :: Double) = []
| Prelude.null entryIdentities = []
| otherwise = entryresult ++ preFilterIdentityMatrix identityCutoff minSeqNumber totalSeqNumber (filteredIds ++ entryresult) (tail entryIdentities)
where currentEntry = head entryIdentities
entryresult = checkIdentityEntry identityCutoff filteredIds currentEntry
checkIdentityEntry :: Double -> [Int] -> (Int,Int,Double) -> [Int]
checkIdentityEntry identityCutoff filteredIds (i,j,ident)
| i `elem` filteredIds = []
| j `elem` filteredIds = []
| ident > identityCutoff = [j]
| otherwise = []
computeSequenceIdentityMatrix :: V.Vector T.Text -> Matrix (Maybe (Int,Int,Double))
computeSequenceIdentityMatrix entryVector = matrix (V.length entryVector) (V.length entryVector) (computeSequenceIdentityEntry entryVector)
computeSequenceIdentityEntry :: V.Vector T.Text -> (Int,Int) -> Maybe (Int,Int,Double)
computeSequenceIdentityEntry entryVector (row,col)
| i < j = Just (row,col,ident)
| otherwise = Nothing
where i=row1
j=col1
ientry = entryVector V.! i
jentry = entryVector V.! j
(gfi,gfj) = unzip (filter notDoubleGap (T.zip ientry jentry))
gfitext = T.pack gfi
gfjtext = T.pack gfj
ident=textIdentity gfitext gfjtext
notDoubleGap :: (Char,Char) -> Bool
notDoubleGap (a,b)
| a == '-' && b == '-' = False
| otherwise = True
reformatRNACodeId :: Char -> Char
reformatRNACodeId c
| c == ':' = '-'
| c == '|' = '-'
| c == '.' = '-'
| c == '~' = '-'
| c == '_' = '-'
| c == '/' = '-'
| otherwise = c
reformatRNACodeAln :: Char -> Char
reformatRNACodeAln c
| c == ':' = '-'
| c == '|' = '-'
| c == '.' = '-'
| c == '~' = '-'
| c == '_' = '-'
| c == 'u' = 'U'
| c == 't' = 'T'
| c == 'g' = 'G'
| c == 'c' = 'C'
| c == 'a' = 'A'
| otherwise = c
reformatAln :: Char -> Char
reformatAln c
| c == '.' = '-'
| c == '~' = '-'
| c == '_' = '-'
| c == 'u' = 'U'
| c == 't' = 'T'
| c == 'g' = 'G'
| c == 'c' = 'C'
| c == 'a' = 'A'
| otherwise = c