{-# LANGUAGE ExplicitNamespaces #-} {-# LANGUAGE FlexibleContexts #-} {-# LANGUAGE GADTs #-} {-# LANGUAGE OverloadedStrings #-} {-# LANGUAGE RankNTypes #-} {-# LANGUAGE ScopedTypeVariables #-} {-# LANGUAGE TypeApplications #-} module DataFrame.Operations.Core where import qualified Data.List as L import qualified Data.Map as M import qualified Data.Map.Strict as MS import qualified Data.Text as T import qualified Data.Vector as V import qualified Data.Vector.Generic as VG import qualified Data.Vector.Unboxed as VU import Control.Exception (throw) import Data.Either import Data.Function (on, (&)) import Data.Maybe import Data.Type.Equality (TestEquality (..)) import DataFrame.Errors import DataFrame.Internal.Column ( Column (..), Columnable, columnLength, columnTypeString, expandColumn, fromList, fromVector, ) import DataFrame.Internal.DataFrame (DataFrame (..), empty, getColumn) import DataFrame.Internal.Expression import DataFrame.Internal.Parsing (isNullish) import DataFrame.Internal.Row (Any, mkColumnFromRow) import Type.Reflection import Prelude hiding (null) {- | O(1) Get DataFrame dimensions i.e. (rows, columns) ==== __Example__ @ ghci> D.dimensions df (100, 3) @ -} dimensions :: DataFrame -> (Int, Int) dimensions = dataframeDimensions {-# INLINE dimensions #-} {- | O(k) Get column names of the DataFrame in order of insertion. ==== __Example__ @ ghci> D.columnNames df ["col_a", "col_b", "col_c"] @ -} columnNames :: DataFrame -> [T.Text] columnNames = map fst . L.sortBy (compare `on` snd) . M.toList . columnIndices {-# INLINE columnNames #-} {- | Adds a vector to the dataframe. If the vector has less elements than the dataframe and the dataframe is not empty the vector is converted to type `Maybe a` filled with `Nothing` to match the size of the dataframe. Similarly, if the vector has more elements than what's currently in the dataframe, the other columns in the dataframe are change to `Maybe ` and filled with `Nothing`. ==== __Example__ @ ghci> import qualified Data.Vector as V ghci> D.insertVector "numbers" (V.fromList [1..10]) D.empty -------- numbers -------- Int -------- 1 2 3 4 5 6 7 8 9 10 @ -} insertVector :: forall a. (Columnable a) => -- | Column Name T.Text -> -- | Vector to add to column V.Vector a -> -- | DataFrame to add column to DataFrame -> DataFrame insertVector name xs = insertColumn name (fromVector xs) {-# INLINE insertVector #-} {- | /O(k)/ Add a column to the dataframe providing a default. This constructs a new vector and also may convert it to an unboxed vector if necessary. Since columns are usually large the runtime is dominated by the length of the list, k. -} insertVectorWithDefault :: forall a. (Columnable a) => -- | Default Value a -> -- | Column name T.Text -> -- | Data to add to column V.Vector a -> -- | DataFrame to add the column to DataFrame -> DataFrame insertVectorWithDefault defaultValue name xs d = let (rows, _) = dataframeDimensions d values = xs V.++ V.replicate (rows - V.length xs) defaultValue in insertColumn name (fromVector values) d {- | /O(n)/ Adds an unboxed vector to the dataframe. Same as insertVector but takes an unboxed vector. If you insert a vector of numbers through insertVector it will either way be converted into an unboxed vector so this function saves that extra work/conversion. -} insertUnboxedVector :: forall a. (Columnable a, VU.Unbox a) => -- | Column Name T.Text -> -- | Unboxed vector to add to column VU.Vector a -> -- | DataFrame to add the column to DataFrame -> DataFrame insertUnboxedVector name xs = insertColumn name (UnboxedColumn xs) {- | /O(n)/ Add a column to the dataframe. ==== __Example__ @ ghci> D.insertColumn "numbers" (D.fromList [1..10]) D.empty -------- numbers -------- Int -------- 1 2 3 4 5 6 7 8 9 10 @ -} insertColumn :: -- | Column Name T.Text -> -- | Column to add Column -> -- | DataFrame to add the column to DataFrame -> DataFrame insertColumn name column d = let (r, c) = dataframeDimensions d n = max (columnLength column) r in case M.lookup name (columnIndices d) of Just i -> DataFrame (V.map (expandColumn n) (columns d V.// [(i, column)])) (columnIndices d) (n, c) Nothing -> DataFrame (V.map (expandColumn n) (columns d `V.snoc` column)) (M.insert name c (columnIndices d)) (n, c + 1) {- | /O(n)/ Clones a column and places it under a new name in the dataframe. ==== __Example__ @ ghci> import qualified Data.Vector as V ghci> df = insertVector "numbers" (V.fromList [1..10]) D.empty ghci> D.cloneColumn "numbers" "others" df ----------------- numbers | others ---------|------- Int | Int ---------|------- 1 | 1 2 | 2 3 | 3 4 | 4 5 | 5 6 | 6 7 | 7 8 | 8 9 | 9 10 | 10 @ -} cloneColumn :: T.Text -> T.Text -> DataFrame -> DataFrame cloneColumn original new df = fromMaybe ( throw $ ColumnNotFoundException original "cloneColumn" (M.keys $ columnIndices df) ) $ do column <- getColumn original df return $ insertColumn new column df {- | /O(n)/ Renames a single column. ==== __Example__ @ ghci> import qualified Data.Vector as V ghci> df = insertVector "numbers" (V.fromList [1..10]) D.empty ghci> D.rename "numbers" "others" df ------- others ------- Int ------- 1 2 3 4 5 6 7 8 9 10 @ -} rename :: T.Text -> T.Text -> DataFrame -> DataFrame rename orig new df = either throw id (renameSafe orig new df) {- | /O(n)/ Renames many columns. ==== __Example__ @ ghci> import qualified Data.Vector as V ghci> df = D.insertVector "others" (V.fromList [11..20]) (D.insertVector "numbers" (V.fromList [1..10]) D.empty) ghci> df ----------------- numbers | others ---------|------- Int | Int ---------|------- 1 | 11 2 | 12 3 | 13 4 | 14 5 | 15 6 | 16 7 | 17 8 | 18 9 | 19 10 | 20 ghci> D.renameMany [("numbers", "first_10"), ("others", "next_10")] df ------------------- first_10 | next_10 ----------|-------- Int | Int ----------|-------- 1 | 11 2 | 12 3 | 13 4 | 14 5 | 15 6 | 16 7 | 17 8 | 18 9 | 19 10 | 20 @ -} renameMany :: [(T.Text, T.Text)] -> DataFrame -> DataFrame renameMany = fold (uncurry rename) renameSafe :: T.Text -> T.Text -> DataFrame -> Either DataFrameException DataFrame renameSafe orig new df = fromMaybe (Left $ ColumnNotFoundException orig "rename" (M.keys $ columnIndices df)) $ do columnIndex <- M.lookup orig (columnIndices df) let origRemoved = M.delete orig (columnIndices df) let newAdded = M.insert new columnIndex origRemoved return (Right df{columnIndices = newAdded}) data ColumnInfo = ColumnInfo { nameOfColumn :: !T.Text , nonNullValues :: !Int , nullValues :: !Int , typeOfColumn :: !T.Text } {- | O(n * k ^ 2) Returns the number of non-null columns in the dataframe and the type associated with each column. ==== __Example__ @ ghci> import qualified Data.Vector as V ghci> df = D.insertVector "others" (V.fromList [11..20]) (D.insertVector "numbers" (V.fromList [1..10]) D.empty) ghci> D.describeColumns df -------------------------------------------------------- Column Name | # Non-null Values | # Null Values | Type -------------|-------------------|---------------|----- Text | Int | Int | Text -------------|-------------------|---------------|----- others | 10 | 0 | Int numbers | 10 | 0 | Int @ -} describeColumns :: DataFrame -> DataFrame describeColumns df = empty & insertColumn "Column Name" (fromList (map nameOfColumn infos)) & insertColumn "# Non-null Values" (fromList (map nonNullValues infos)) & insertColumn "# Null Values" (fromList (map nullValues infos)) & insertColumn "Type" (fromList (map typeOfColumn infos)) where infos = L.sortBy (compare `on` nonNullValues) (V.ifoldl' go [] (columns df)) :: [ColumnInfo] indexMap = M.fromList (map (\(a, b) -> (b, a)) $ M.toList (columnIndices df)) columnName i = M.lookup i indexMap go acc i col@(OptionalColumn (c :: V.Vector a)) = let cname = columnName i countNulls = nulls col columnType = T.pack $ show $ typeRep @a in if isNothing cname then acc else ColumnInfo (fromMaybe "" cname) (columnLength col - countNulls) countNulls columnType : acc go acc i col@(BoxedColumn (c :: V.Vector a)) = let cname = columnName i columnType = T.pack $ show $ typeRep @a in if isNothing cname then acc else ColumnInfo (fromMaybe "" cname) (columnLength col) 0 columnType : acc go acc i col@(UnboxedColumn c) = let cname = columnName i columnType = T.pack $ columnTypeString col in -- Unboxed columns cannot have nulls since Maybe -- is not an instance of Unbox a if isNothing cname then acc else ColumnInfo (fromMaybe "" cname) (columnLength col) 0 columnType : acc nulls :: Column -> Int nulls (OptionalColumn xs) = VG.length $ VG.filter isNothing xs nulls (BoxedColumn (xs :: V.Vector a)) = case testEquality (typeRep @a) (typeRep @T.Text) of Just Refl -> VG.length $ VG.filter isNullish xs Nothing -> case testEquality (typeRep @a) (typeRep @String) of Just Refl -> VG.length $ VG.filter (isNullish . T.pack) xs Nothing -> case typeRep @a of App t1 t2 -> case eqTypeRep t1 (typeRep @Maybe) of Just HRefl -> VG.length $ VG.filter isNothing xs Nothing -> 0 _ -> 0 nulls _ = 0 partiallyParsed :: Column -> Int partiallyParsed (BoxedColumn (xs :: V.Vector a)) = case typeRep @a of App (App tycon t1) t2 -> case eqTypeRep tycon (typeRep @Either) of Just HRefl -> VG.length $ VG.filter isLeft xs Nothing -> 0 _ -> 0 partiallyParsed _ = 0 {- | Creates a dataframe from a list of tuples with name and column. ==== __Example__ @ ghci> df = D.fromNamedColumns [("numbers", D.fromList [1..10]), ("others", D.fromList [11..20])] ghci> df ----------------- numbers | others ---------|------- Int | Int ---------|------- 1 | 11 2 | 12 3 | 13 4 | 14 5 | 15 6 | 16 7 | 17 8 | 18 9 | 19 10 | 20 @ -} fromNamedColumns :: [(T.Text, Column)] -> DataFrame fromNamedColumns = L.foldl' (\df (name, column) -> insertColumn name column df) empty {- | Create a dataframe from a list of columns. The column names are "0", "1"... etc. Useful for quick exploration but you should probably always rename the columns after or drop the ones you don't want. ==== __Example__ @ ghci> df = D.fromUnnamedColumns [D.fromList [1..10], D.fromList [11..20]] ghci> df ----------------- 0 | 1 -----|---- Int | Int -----|---- 1 | 11 2 | 12 3 | 13 4 | 14 5 | 15 6 | 16 7 | 17 8 | 18 9 | 19 10 | 20 @ -} fromUnnamedColumns :: [Column] -> DataFrame fromUnnamedColumns = fromNamedColumns . zip (map (T.pack . show) [0 ..]) {- | Create a dataframe from a list of column names and rows. ==== __Example__ @ ghci> df = D.fromRows ["A", "B"] [[D.toAny 1, D.toAny 11], [D.toAny 2, D.toAny 12], [D.toAny 3, D.toAny 13]] ghci> df ---------- A | B -----|---- Int | Int -----|---- 1 | 11 2 | 12 3 | 13 @ -} fromRows :: [T.Text] -> [[Any]] -> DataFrame fromRows names rows = L.foldl' (\df i -> insertColumn (names !! i) (mkColumnFromRow i rows) df) empty [0 .. length names - 1] {- | O (k * n) Counts the occurences of each value in a given column. ==== __Example__ @ ghci> df = D.fromUnnamedColumns [D.fromList [1..10], D.fromList [11..20]] ghci> D.valueCounts @Int "0" df [(1,1),(2,1),(3,1),(4,1),(5,1),(6,1),(7,1),(8,1),(9,1),(10,1)] @ -} valueCounts :: forall a. (Columnable a) => Expr a -> DataFrame -> [(a, Int)] valueCounts (Col columnName) df = case getColumn columnName df of Nothing -> throw $ ColumnNotFoundException columnName "valueCounts" (M.keys $ columnIndices df) Just (BoxedColumn (column' :: V.Vector c)) -> let column = V.foldl' (\m v -> MS.insertWith (+) v (1 :: Int) m) M.empty column' in case (typeRep @a) `testEquality` (typeRep @c) of Nothing -> throw $ TypeMismatchException ( MkTypeErrorContext { userType = Right $ typeRep @a , expectedType = Right $ typeRep @c , errorColumnName = Just (T.unpack columnName) , callingFunctionName = Just "valueCounts" } ) Just Refl -> M.toAscList column Just (OptionalColumn (column' :: V.Vector c)) -> let column = V.foldl' (\m v -> MS.insertWith (+) v (1 :: Int) m) M.empty column' in case (typeRep @a) `testEquality` (typeRep @c) of Nothing -> throw $ TypeMismatchException ( MkTypeErrorContext { userType = Right $ typeRep @a , expectedType = Right $ typeRep @c , errorColumnName = Just (T.unpack columnName) , callingFunctionName = Just "valueCounts" } ) Just Refl -> M.toAscList column Just (UnboxedColumn (column' :: VU.Vector c)) -> let column = V.foldl' (\m v -> MS.insertWith (+) v (1 :: Int) m) M.empty (V.convert column') in case (typeRep @a) `testEquality` (typeRep @c) of Nothing -> throw $ TypeMismatchException ( MkTypeErrorContext { userType = Right $ typeRep @a , expectedType = Right $ typeRep @c , errorColumnName = Just (T.unpack columnName) , callingFunctionName = Just "valueCounts" } ) Just Refl -> M.toAscList column valueCounts _ _ = error "Cannot call value counts on non-column reference" {- | A left fold for dataframes that takes the dataframe as the last object. This makes it easier to chain operations. ==== __Example__ @ ghci> D.fold (const id) [1..5] df ---------- 0 | 1 -----|---- Int | Int -----|---- 1 | 11 2 | 12 3 | 13 4 | 14 5 | 15 6 | 16 7 | 17 8 | 18 9 | 19 10 | 20 @ -} fold :: (a -> DataFrame -> DataFrame) -> [a] -> DataFrame -> DataFrame fold f xs acc = L.foldl' (flip f) acc xs