{-# LANGUAGE RecordWildCards #-}

-- | Prospective 'TrainingData' elements need to be filtered as there are a
-- number of entries which do not provide good training.

module Biobase.TrainingData.Filter where

import Data.List
import qualified Data.Vector.Unboxed as VU

import Biobase.TrainingData
import Biobase.TrainingData.Manip



-- | Filter out elements containing not enough base pairs (in relative terms)

fMinRelPairs :: Maybe Double -> TDmanip -> TDmanip
fMinRelPairs Nothing x    = x
fMinRelPairs _ l@(Left _) = l
fMinRelPairs (Just rel) r@(Right td@(TrainingData{..}))
  | rel > numps * 2 / lenpri = Left td
  | otherwise = r
  where
    lenpri = genericLength primary
    numps  = genericLength secondary

-- | Error-checking filter.

fErrorCheck :: TDmanip -> TDmanip
fErrorCheck l@(Left _) = l
fErrorCheck r@(Right td@(TrainingData{..}))
  | any (<0) ixs = Left td -- that one is really strange
  | any (>=l) ixs = Left td -- indices out of bounds
  | any ((=='&') . (v VU.!)) ixs = Left td -- index pointing to intramolecular symbol
  | otherwise = r
  where
    ixs = concatMap (\((i,j),_) -> [i,j]) secondary
    l = length primary
    v = VU.fromList primary