{-# LANGUAGE BangPatterns #-} {-# LANGUAGE ExplicitNamespaces #-} {-# LANGUAGE FlexibleContexts #-} {-# LANGUAGE FlexibleInstances #-} {-# LANGUAGE GADTs #-} {-# LANGUAGE MultiParamTypeClasses #-} {-# LANGUAGE OverloadedStrings #-} {-# LANGUAGE RankNTypes #-} {-# LANGUAGE ScopedTypeVariables #-} {-# LANGUAGE TypeApplications #-} {-# LANGUAGE UndecidableInstances #-} module DataFrame.Synthesis where import qualified DataFrame.Functions as F import DataFrame.Internal.Column import DataFrame.Internal.DataFrame ( DataFrame (..), ) import DataFrame.Internal.Expression ( Expr (..), eSize, interpret, ) import DataFrame.Internal.Statistics import DataFrame.Operations.Core (columnAsDoubleVector) import qualified DataFrame.Operations.Statistics as Stats import DataFrame.Operations.Subset (exclude) import Control.Exception (throw) import Data.Containers.ListUtils import Data.Function import qualified Data.List as L import qualified Data.Map as M import Data.Maybe (listToMaybe) import qualified Data.Set as S import qualified Data.Text as T import Data.Type.Equality import qualified Data.Vector.Unboxed as VU import DataFrame.Functions ((.&&), (.<=), (.>), (.||)) import qualified DataFrame.Operations.Core as D import Debug.Trace (trace) import Type.Reflection (typeRep) generateConditions :: TypedColumn Double -> [Expr Bool] -> [Expr Double] -> DataFrame -> [Expr Bool] generateConditions labels conds ps df = let newConds = [ p .<= q | p <- filter (not . isLiteral) ps , q <- ps , p /= q ] ++ [ F.not p | p <- conds ] expandedConds = conds ++ newConds ++ [p .&& q | p <- newConds, q <- conds, p /= q] ++ [p .|| q | p <- newConds, q <- conds, p /= q] in pickTopNBool df labels (deduplicate df expandedConds) generatePrograms :: Bool -> [Expr Bool] -> [Expr Double] -> [Expr Double] -> [Expr Double] -> [Expr Double] generatePrograms _ _ vars' constants [] = vars' ++ constants generatePrograms includeConds conds vars constants ps = let existingPrograms = ps ++ vars ++ constants in existingPrograms ++ [ transform p | p <- ps ++ vars , Prelude.not (isConditional p) , transform <- [ sqrt , abs , log . (+ Lit 1) , exp , sin , cos , F.relu , signum ] ] ++ [ F.pow p i | p <- existingPrograms , Prelude.not (isConditional p) , i <- [2 .. 6] ] ++ [ p + q | (i, p) <- zip [0 ..] existingPrograms , (j, q) <- zip [0 ..] existingPrograms , Prelude.not (isLiteral p && isLiteral q) , Prelude.not (isConditional p || isConditional q) , i >= j ] ++ [ p - q | (i, p) <- zip [0 ..] existingPrograms , (j, q) <- zip [0 ..] existingPrograms , Prelude.not (isLiteral p && isLiteral q) , Prelude.not (isConditional p || isConditional q) , i /= j ] ++ ( if includeConds then [ F.min p q | (i, p) <- zip [0 ..] existingPrograms , (j, q) <- zip [0 ..] existingPrograms , Prelude.not (isLiteral p && isLiteral q) , Prelude.not (isConditional p || isConditional q) , p /= q , i > j ] ++ [ F.max p q | (i, p) <- zip [0 ..] existingPrograms , (j, q) <- zip [0 ..] existingPrograms , Prelude.not (isLiteral p && isLiteral q) , Prelude.not (isConditional p || isConditional q) , p /= q , i > j ] ++ [ F.ifThenElse cond r s | cond <- conds , r <- existingPrograms , s <- existingPrograms , Prelude.not (isConditional r || isConditional s) , r /= s ] else [] ) ++ [ p * q | (i, p) <- zip [0 ..] existingPrograms , (j, q) <- zip [0 ..] existingPrograms , Prelude.not (isLiteral p && isLiteral q) , Prelude.not (isConditional p || isConditional q) , i >= j ] ++ [ p / q | p <- existingPrograms , q <- existingPrograms , Prelude.not (isLiteral p && isLiteral q) , Prelude.not (isConditional p || isConditional q) , p /= q ] isLiteral :: Expr a -> Bool isLiteral (Lit _) = True isLiteral _ = False isConditional :: Expr a -> Bool isConditional (If{}) = True isConditional _ = False deduplicate :: forall a. (Columnable a) => DataFrame -> [Expr a] -> [(Expr a, TypedColumn a)] deduplicate df = go S.empty . nubOrd . L.sortBy (\e1 e2 -> compare (eSize e1) (eSize e2)) where go _ [] = [] go seen (x : xs) | hasInvalid = go seen xs | S.member res seen = go seen xs | otherwise = (x, res) : go (S.insert res seen) xs where res = case interpret @a df x of Left e -> throw e Right v -> v hasInvalid = case res of (TColumn (UnboxedColumn (col :: VU.Vector b))) -> case testEquality (typeRep @Double) (typeRep @b) of Just Refl -> VU.any (\n -> isNaN n || isInfinite n) col Nothing -> False _ -> False -- | Checks if two programs generate the same outputs given all the same inputs. equivalent :: DataFrame -> Expr Double -> Expr Double -> Bool equivalent df p1 p2 = case (==) <$> interpret df p1 <*> interpret df p2 of Left e -> throw e Right v -> v synthesizeFeatureExpr :: -- | Target expression T.Text -> BeamConfig -> DataFrame -> Either String (Expr Double) synthesizeFeatureExpr target cfg df = let df' = exclude [target] df t = case interpret df (Col target) of Left e -> throw e Right v -> v in case beamSearch df' cfg t (percentiles df') [] [] of Nothing -> Left "No programs found" Just p -> Right p f1FromBinary :: VU.Vector Double -> VU.Vector Double -> Maybe Double f1FromBinary trues preds = let (!tp, !fp, !fn) = VU.foldl' step (0 :: Int, 0 :: Int, 0 :: Int) $ VU.zip (VU.map (> 0) preds) (VU.map (> 0) trues) in f1FromCounts tp fp fn where step (!tp, !fp, !fn) (!p, !t) = case (p, t) of (True, True) -> (tp + 1, fp, fn) (True, False) -> (tp, fp + 1, fn) (False, True) -> (tp, fp, fn + 1) (False, False) -> (tp, fp, fn) f1FromCounts :: Int -> Int -> Int -> Maybe Double f1FromCounts tp fp fn = let tp' = fromIntegral tp fp' = fromIntegral fp fn' = fromIntegral fn precision = if tp' + fp' == 0 then 0 else tp' / (tp' + fp') recall = if tp' + fn' == 0 then 0 else tp' / (tp' + fn') in if precision + recall == 0 then Nothing else Just (2 * precision * recall / (precision + recall)) fitClassifier :: -- | Target expression T.Text -> -- | Depth of search (Roughly, how many terms in the final expression) Int -> -- | Beam size - the number of candidate expressions to consider at a time. Int -> DataFrame -> Either String (Expr Int) fitClassifier target d b df = let df' = exclude [target] df t = case interpret df (Col target) of Left e -> throw e Right v -> v in case beamSearch df' (BeamConfig d b F1 True) t (percentiles df' ++ [Lit 1, Lit 0, Lit (-1)]) [] [] of Nothing -> Left "No programs found" Just p -> Right (F.ifThenElse (p .> 0) 1 0) percentiles :: DataFrame -> [Expr Double] percentiles df = let doubleColumns = map (either throw id . ((`columnAsDoubleVector` df) . Col)) (D.columnNames df) in concatMap (\c -> map (Lit . roundTo2SigDigits . (`percentile'` c)) [1, 25, 75, 99]) doubleColumns ++ map (Lit . roundTo2SigDigits . variance') doubleColumns ++ map (Lit . roundTo2SigDigits . sqrt . variance') doubleColumns roundToSigDigits :: Int -> Double -> Double roundToSigDigits n x | x == 0 = 0 | otherwise = let magnitude = floor (logBase 10 (abs x)) scale = 10 ** fromIntegral (n - 1 - magnitude) in fromIntegral (round (x * scale)) / scale roundTo2SigDigits :: Double -> Double roundTo2SigDigits = roundToSigDigits 2 fitRegression :: -- | Target expression T.Text -> -- | Depth of search (Roughly, how many terms in the final expression) Int -> -- | Beam size - the number of candidate expressions to consider at a time. Int -> DataFrame -> Either String (Expr Double) fitRegression target d b df = let df' = exclude [target] df targetMean = Stats.mean (Col @Double target) df t = case interpret df (Col target) of Left e -> throw e Right v -> v cfg = BeamConfig d b MeanSquaredError True constants = percentiles df' ++ [Lit targetMean] ++ [ F.pow p i | i <- [1 .. 6] , p <- [Lit 10, Lit 1, Lit 0.1] ] in case beamSearch df' cfg t constants [] [] of Nothing -> Left "No programs found" Just p -> Right p data LossFunction = PearsonCorrelation | MutualInformation | MeanSquaredError | F1 getLossFunction :: LossFunction -> (VU.Vector Double -> VU.Vector Double -> Maybe Double) getLossFunction f = case f of MutualInformation -> ( \l r -> mutualInformationBinned (Prelude.max 10 (ceiling (sqrt (fromIntegral (VU.length l))))) l r ) PearsonCorrelation -> (\l r -> (^ 2) <$> correlation' l r) MeanSquaredError -> (\l r -> fmap negate (meanSquaredError l r)) F1 -> f1FromBinary data BeamConfig = BeamConfig { searchDepth :: Int , beamLength :: Int , lossFunction :: LossFunction , includeConditionals :: Bool } defaultBeamConfig :: BeamConfig defaultBeamConfig = BeamConfig 2 100 PearsonCorrelation False beamSearch :: DataFrame -> -- | Parameters of the beam search. BeamConfig -> -- | Examples TypedColumn Double -> -- | Constants [Expr Double] -> -- | Conditions [Expr Bool] -> -- | Programs [Expr Double] -> Maybe (Expr Double) beamSearch df cfg outputs constants conds programs | searchDepth cfg == 0 = case ps of [] -> Nothing (x : _) -> Just x | otherwise = beamSearch df (cfg{searchDepth = searchDepth cfg - 1}) outputs constants conditions (generatePrograms (includeConditionals cfg) conditions vars constants ps) where vars = map Col names conditions = generateConditions outputs conds (vars ++ constants) df ps = pickTopN df outputs cfg $ deduplicate df programs names = (map fst . L.sortBy (compare `on` snd) . M.toList . columnIndices) df pickTopN :: DataFrame -> TypedColumn Double -> BeamConfig -> [(Expr Double, TypedColumn a)] -> [Expr Double] pickTopN _ _ _ [] = [] pickTopN df (TColumn col) cfg ps = let l = case toVector @Double @VU.Vector col of Left e -> throw e Right v -> v ordered = Prelude.take (beamLength cfg) ( map fst $ L.sortBy ( \(_, c2) (_, c1) -> if maybe False isInfinite c1 || maybe False isInfinite c2 || maybe False isNaN c1 || maybe False isNaN c2 then LT else compare c1 c2 ) ( map (\(e, res) -> (e, getLossFunction (lossFunction cfg) l (asDoubleVector res))) ps ) ) asDoubleVector c = let (TColumn col') = c in case toVector @Double @VU.Vector col' of Left e -> throw e Right v -> VU.convert v interpretDoubleVector e = let (TColumn col') = case interpret df e of Left e -> throw e Right v -> v in case toVector @Double @VU.Vector col' of Left e -> throw e Right v -> VU.convert v in trace ( "Best loss: " ++ show ( getLossFunction (lossFunction cfg) l . interpretDoubleVector <$> listToMaybe ordered ) ++ " " ++ (if null ordered then "empty" else show (listToMaybe ordered)) ) ordered pickTopNBool :: DataFrame -> TypedColumn Double -> [(Expr Bool, TypedColumn Bool)] -> [Expr Bool] pickTopNBool _ _ [] = [] pickTopNBool df (TColumn col) ps = let l = case toVector @Double @VU.Vector col of Left e -> throw e Right v -> v ordered = Prelude.take 10 ( map fst $ L.sortBy ( \(_, c2) (_, c1) -> if maybe False isInfinite c1 || maybe False isInfinite c2 || maybe False isNaN c1 || maybe False isNaN c2 then LT else compare c1 c2 ) ( map (\(e, res) -> (e, getLossFunction MutualInformation l (asDoubleVector res))) ps ) ) asDoubleVector c = let (TColumn col') = c in case toVector @Bool @VU.Vector col' of Left e -> throw e Right v -> VU.map (fromIntegral @Int @Double . fromEnum) v in ordered satisfiesExamples :: DataFrame -> TypedColumn Double -> Expr Double -> Bool satisfiesExamples df col expr = let result = case interpret df expr of Left e -> throw e Right v -> v in result == col