-- Hoogle documentation, generated by Haddock -- See Hoogle, http://www.haskell.org/hoogle/ -- | DBFunctor - Functional Data Management => ETL/ELT Data Processing in Haskell -- -- Please see the README on Github at -- https://github.com/nkarag/haskell-DBFunctor @package DBFunctor @version 0.1.2.0 -- | This is the core module that implements the relational Table concept -- with the RTable data type. It defines all necessary data types -- like RTable and RTuple as well as all the basic -- relational algebra operations (selection -i.e., filter- , projection, -- inner/outer join, aggregation, grouping etc.) on RTables. -- --
-- -- | Return an new RTable after modifying each RTuple of the input RTable. -- myRTableOperation :: RTable -> RTable -- myRTableOperation rtab = do -- rtup <- rtab -- let new_rtup = doStuff rtup -- return new_rtup -- where -- doStuff :: RTuple -> RTuple -- doStuff = ... -- to be defined ---- -- Many different types of data can be turned into an RTable. For -- example, CSV data can be easily turn into an RTable via the -- toRTable function. Many other types of data could be -- represented as "tabular data" via the RTable data type, as long -- as they adhere to the interface posed by the RTabular type -- class. In other words, any data type that we want to convert into an -- RTable and vice-versa, must become an instance of the RTabular -- type class and thus define the basic toRTable and -- fromRTable functions. -- --
-- import RTable.Core
-- import RTable.Data.CSV (CSV, readCSV, toRTable)
-- import Data.Text as T (take, pack)
--
-- -- This is the input source table metadata
-- src_DBTab_MData :: RTableMData
-- src_DBTab_MData =
-- createRTableMData ( "sourceTab" -- table name
-- ,[ ("OWNER", Varchar) -- Owner of the table
-- ,("TABLE_NAME", Varchar) -- Name of the table
-- ,("TABLESPACE_NAME", Varchar) -- Tablespace name
-- ,("STATUS",Varchar) -- Status of the table object (VALID/IVALID)
-- ,("NUM_ROWS", Integer) -- Number of rows in the table
-- ,("BLOCKS", Integer) -- Number of Blocks allocated for this table
-- ,("LAST_ANALYZED", Timestamp "MMDDYYYY HH24:MI:SS") -- Timestamp of the last time the table was analyzed (i.e., gathered statistics)
-- ]
-- )
-- ["OWNER", "TABLE_NAME"] -- primary key
-- [] -- (alternative) unique keys
--
--
-- -- Result RTable metadata
-- result_tab_MData :: RTableMData
-- result_tab_MData =
-- createRTableMData ( "resultTab" -- table name
-- ,[ ("OWNER", Varchar) -- Owner of the table
-- ,("TABLE_NAME", Varchar) -- Name of the table
-- ,("LAST_ANALYZED", Timestamp "MMDDYYYY HH24:MI:SS") -- Timestamp of the last time the table was analyzed (i.e., gathered statistics)
-- ]
-- )
-- ["OWNER", "TABLE_NAME"] -- primary key
-- [] -- (alternative) unique keys
--
--
-- main :: IO()
-- main = do
-- -- read source csv file
-- srcCSV <- readCSV "./app/test-data.csv"
--
-- putStrLn "\nHow many rows you want to print from the source table? :\n"
-- n <- readLn :: IO Int
--
-- -- RTable A
-- printfRTable ( -- define the order by which the columns will appear on screen. Use the default column formatting.
-- genRTupleFormat ["OWNER", "TABLE_NAME", "TABLESPACE_NAME", "STATUS", "NUM_ROWS", "BLOCKS", "LAST_ANALYZED"] genDefaultColFormatMap) $
-- limit n $ toRTable src_DBTab_MData srcCSV
--
-- putStrLn "\nThese are the tables that start with a "B":\n"
--
-- -- RTable B
-- printfRTable ( genRTupleFormat ["OWNER", "TABLE_NAME","LAST_ANALYZED"] genDefaultColFormatMap) $
-- tabs_start_with_B $ toRTable src_DBTab_MData srcCSV
--
-- putStrLn "\nThese are the tables that were analyzed the same day:\n"
--
-- -- RTable C = A InnerJoin B
-- printfRTable ( genRTupleFormat ["OWNER", "TABLE_NAME", "LAST_ANALYZED", "OWNER_1", "TABLE_NAME_1", "LAST_ANALYZED_1"] genDefaultColFormatMap) $
-- ropB myJoin
-- (limit n $ toRTable src_DBTab_MData srcCSV)
-- (tabs_start_with_B $ toRTable src_DBTab_MData srcCSV)
--
-- -- save result of 2nd operation to CSV file
-- writeCSV ".appresult-data.csv" $
-- fromRTable result_tab_MData $
-- tabs_start_with_B $
-- toRTable src_DBTab_MData srcCSV
--
-- where
-- -- Return RTuples with a table_name starting with a B
-- tabs_start_with_B :: RTable -> RTable
-- tabs_start_with_B rtab = (ropU myProjection) . (ropU myFilter) $ rtab
-- where
-- -- Create a Filter Operation to return only RTuples with table_name starting with a B
-- myFilter = RFilter ( t -> let
-- tbname = case toText (t <!> "TABLE_NAME") of
-- Just t -> t
-- Nothing -> pack ""
-- in (T.take 1 tbname) == (pack "B")
-- )
-- -- Create a Projection Operation that projects only two columns
-- myProjection = RPrj ["OWNER", "TABLE_NAME", "LAST_ANALYZED"]
--
-- -- Create an Inner Join for tables analyzed in the same day
-- myJoin :: ROperation
-- myJoin = RInJoin ( t1 t2 ->
-- let
-- RTime {rtime = RTimestampVal {year = y1, month = m1, day = d1, hours24 = hh1, minutes = mm1, seconds = ss1}} = t1<!>"LAST_ANALYZED"
-- RTime {rtime = RTimestampVal {year = y2, month = m2, day = d2, hours24 = hh2, minutes = mm2, seconds = ss2}} = t2<!>"LAST_ANALYZED"
-- in y1 == y2 && m1 == m2 && d1 == d2
-- )
--
--
-- And here is the output:
--
-- -- :l .srcRTable/example.hs -- :set -XOverloadedStrings -- main ---- --
-- How many rows you want to print from the source table? : -- -- 10 -- --------------------------------------------------------------------------------------------------------------------------------- -- OWNER TABLE_NAME TABLESPACE_NAME STATUS NUM_ROWS BLOCKS LAST_ANALYZED -- ~~~~~ ~~~~~~~~~~ ~~~~~~~~~~~~~~~ ~~~~~~ ~~~~~~~~ ~~~~~~ ~~~~~~~~~~~~~ -- APEX_030200 SYS_IOT_OVER_71833 SYSAUX VALID 0 0 06082012 16:22:36 -- APEX_030200 WWV_COLUMN_EXCEPTIONS SYSAUX VALID 3 3 06082012 16:22:33 -- APEX_030200 WWV_FLOWS SYSAUX VALID 10 3 06082012 22:01:21 -- APEX_030200 WWV_FLOWS_RESERVED SYSAUX VALID 0 0 06082012 16:22:33 -- APEX_030200 WWV_FLOW_ACTIVITY_LOG1$ SYSAUX VALID 1 29 07202012 19:07:57 -- APEX_030200 WWV_FLOW_ACTIVITY_LOG2$ SYSAUX VALID 14 29 07202012 19:07:57 -- APEX_030200 WWV_FLOW_ACTIVITY_LOG_NUMBER$ SYSAUX VALID 1 3 07202012 19:08:00 -- APEX_030200 WWV_FLOW_ALTERNATE_CONFIG SYSAUX VALID 0 0 06082012 16:22:33 -- APEX_030200 WWV_FLOW_ALT_CONFIG_DETAIL SYSAUX VALID 0 0 06082012 16:22:33 -- APEX_030200 WWV_FLOW_ALT_CONFIG_PICK SYSAUX VALID 37 3 06082012 16:22:33 -- -- -- 10 rows returned -- --------------------------------------------------------------------------------------------------------------------------------- -- -- These are the tables that start with a B: -- -- ------------------------------------------------------------- -- OWNER TABLE_NAME LAST_ANALYZED -- ~~~~~ ~~~~~~~~~~ ~~~~~~~~~~~~~ -- DBSNMP BSLN_BASELINES 04152018 16:14:51 -- DBSNMP BSLN_METRIC_DEFAULTS 06082012 16:06:41 -- DBSNMP BSLN_STATISTICS 04152018 17:41:33 -- DBSNMP BSLN_THRESHOLD_PARAMS 06082012 16:06:41 -- SYS BOOTSTRAP$ 04142014 13:53:43 -- -- -- 5 rows returned -- ------------------------------------------------------------- -- -- These are the tables that were analyzed the same day: -- -- ------------------------------------------------------------------------------------------------------------------------------------- -- OWNER TABLE_NAME LAST_ANALYZED OWNER_1 TABLE_NAME_1 LAST_ANALYZED_1 -- ~~~~~ ~~~~~~~~~~ ~~~~~~~~~~~~~ ~~~~~~~ ~~~~~~~~~~~~ ~~~~~~~~~~~~~~~ -- APEX_030200 SYS_IOT_OVER_71833 06082012 16:22:36 DBSNMP BSLN_THRESHOLD_PARAMS 06082012 16:06:41 -- APEX_030200 SYS_IOT_OVER_71833 06082012 16:22:36 DBSNMP BSLN_METRIC_DEFAULTS 06082012 16:06:41 -- APEX_030200 WWV_COLUMN_EXCEPTIONS 06082012 16:22:33 DBSNMP BSLN_THRESHOLD_PARAMS 06082012 16:06:41 -- APEX_030200 WWV_COLUMN_EXCEPTIONS 06082012 16:22:33 DBSNMP BSLN_METRIC_DEFAULTS 06082012 16:06:41 -- APEX_030200 WWV_FLOWS 06082012 22:01:21 DBSNMP BSLN_THRESHOLD_PARAMS 06082012 16:06:41 -- APEX_030200 WWV_FLOWS 06082012 22:01:21 DBSNMP BSLN_METRIC_DEFAULTS 06082012 16:06:41 -- APEX_030200 WWV_FLOWS_RESERVED 06082012 16:22:33 DBSNMP BSLN_THRESHOLD_PARAMS 06082012 16:06:41 -- APEX_030200 WWV_FLOWS_RESERVED 06082012 16:22:33 DBSNMP BSLN_METRIC_DEFAULTS 06082012 16:06:41 -- APEX_030200 WWV_FLOW_ALTERNATE_CONFIG 06082012 16:22:33 DBSNMP BSLN_THRESHOLD_PARAMS 06082012 16:06:41 -- APEX_030200 WWV_FLOW_ALTERNATE_CONFIG 06082012 16:22:33 DBSNMP BSLN_METRIC_DEFAULTS 06082012 16:06:41 -- APEX_030200 WWV_FLOW_ALT_CONFIG_DETAIL 06082012 16:22:33 DBSNMP BSLN_THRESHOLD_PARAMS 06082012 16:06:41 -- APEX_030200 WWV_FLOW_ALT_CONFIG_DETAIL 06082012 16:22:33 DBSNMP BSLN_METRIC_DEFAULTS 06082012 16:06:41 -- APEX_030200 WWV_FLOW_ALT_CONFIG_PICK 06082012 16:22:33 DBSNMP BSLN_THRESHOLD_PARAMS 06082012 16:06:41 -- APEX_030200 WWV_FLOW_ALT_CONFIG_PICK 06082012 16:22:33 DBSNMP BSLN_METRIC_DEFAULTS 06082012 16:06:41 -- -- -- 14 rows returned -- ------------------------------------------------------------------------------------------------------------------------------------- ---- -- Check the output CSV file -- --
-- $ head ./app/result-data.csv -- OWNER,TABLE_NAME,LAST_ANALYZED -- DBSNMP,BSLN_BASELINES,04152018 16:14:51 -- DBSNMP,BSLN_METRIC_DEFAULTS,06082012 16:06:41 -- DBSNMP,BSLN_STATISTICS,04152018 17:41:33 -- DBSNMP,BSLN_THRESHOLD_PARAMS,06082012 16:06:41 -- SYS,BOOTSTRAP$,04142014 13:53:43 --module RTable.Core -- | Definition of the Relational Table entity An RTable is a -- "container" of RTuples. type RTable = Vector RTuple -- | Definition of the Relational Tuple. An RTuple is implemented as -- a HashMap of (ColumnName, RDataType) pairs. This -- ensures fast access of the column value by column name. Note that this -- implies that the RTuple CANNOT have more than one columns with -- the same name (i.e. hashmap key) and more importantly that it DOES NOT -- have a fixed order of columns, as it is usual in RDBMS -- implementations. This gives us the freedom to perform column change -- operations very fast. The only place were we need fixed column order -- is when we try to load an RTable from a fixed-column structure -- such as a CSV file. For this reason, we have embedded the notion of a -- fixed column-order in the RTuple metadata. See -- RTupleMData. type RTuple = HashMap ColumnName RDataType -- | Definition of the Relational Data Type. This is the data type of the -- values stored in each RTable. This is a strict data type, -- meaning whenever we evaluate a value of type RDataType, there -- must be also evaluated all the fields it contains. data RDataType RInt :: !Integer -> RDataType [rint] :: RDataType -> !Integer RText :: !Text -> RDataType [rtext] :: RDataType -> !Text RUTCTime :: !UTCTime -> RDataType [rutct] :: RDataType -> !UTCTime RDate :: !Text -> !Text -> RDataType [rdate] :: RDataType -> !Text -- | e.g., "DD/MM/YYYY" [dtformat] :: RDataType -> !Text RTime :: !RTimestamp -> RDataType [rtime] :: RDataType -> !RTimestamp RDouble :: !Double -> RDataType [rdouble] :: RDataType -> !Double Null :: RDataType -- | Basic data type to represent time. This is a strict data type, meaning -- whenever we evaluate a value of type RTimestamp, there must be -- also evaluated all the fields it contains. data RTimestamp RTimestampVal :: !Int -> !Int -> !Int -> !Int -> !Int -> !Int -> RTimestamp [year] :: RTimestamp -> !Int [month] :: RTimestamp -> !Int [day] :: RTimestamp -> !Int [hours24] :: RTimestamp -> !Int [minutes] :: RTimestamp -> !Int [seconds] :: RTimestamp -> !Int -- | Metadata for an RTable data RTableMData RTableMData :: RTableName -> RTupleMData -> [ColumnName] -> [[ColumnName]] -> RTableMData -- | Name of the RTable [rtname] :: RTableMData -> RTableName -- | Tuple-level metadata other metadata [rtuplemdata] :: RTableMData -> RTupleMData -- | Primary Key [pkColumns] :: RTableMData -> [ColumnName] -- | List of unique keys i.e., each sublist is a unique key column -- combination [uniqueKeys] :: RTableMData -> [[ColumnName]] -- | Basic Metadata of an RTuple. The RTuple metadata are -- accessed through a HashMap ColumnName ColumnInfo -- structure. I.e., for each column of the RTuple, we access the -- ColumnInfo structure to get Column-level metadata. This access -- is achieved by ColumnName. However, in order to provide the -- "impression" of a fixed column order per tuple (see RTuple -- definition), we provide another HashMap, the HashMap -- ColumnOrder ColumnName. So in the follwoing example, if -- we want to access the RTupleMData tupmdata ColumnInfo by column -- order, (assuming that we have N columns) we have to do the following: -- --
-- (snd tupmdata)!((fst tupmdata)!0) -- (snd tupmdata)!((fst tupmdata)!1) -- ... -- (snd tupmdata)!((fst tupmdata)!(N-1)) ---- -- In the same manner in order to access the column of an RTuple -- (e.g., tup) by column order, we do the following: -- --
-- tup!((fst tupmdata)!0) -- tup!((fst tupmdata)!1) -- ... -- tup!((fst tupmdata)!(N-1)) --type RTupleMData = (HashMap ColumnOrder ColumnName, HashMap ColumnName ColumnInfo) -- | Basic metadata for a column of an RTuple data ColumnInfo ColumnInfo :: ColumnName -> ColumnDType -> ColumnInfo [name] :: ColumnInfo -> ColumnName [dtype] :: ColumnInfo -> ColumnDType type ColumnOrder = Int -- | Definition of the Name type type Name = Text -- | Definition of the Column Name type ColumnName = Name -- | Definition of the Table Name type RTableName = Name -- | This is used only for metadata purposes (see ColumnInfo). The -- actual data type of a value is an RDataType The Text component of Date -- and Timestamp data constructors is the date format e.g., "DD/MM/YYYY", -- "DD/MM/YYYY HH24:MI:SS" data ColumnDType UknownType :: ColumnDType Integer :: ColumnDType Varchar :: ColumnDType Date :: Text -> ColumnDType Timestamp :: Text -> ColumnDType Double :: ColumnDType type Delimiter = String -- | Basic class to represent a data type that can be turned into an -- RTable. It implements the concept of "tabular data" class RTabular a toRTable :: RTabular a => RTableMData -> a -> RTable fromRTable :: RTabular a => RTableMData -> RTable -> a -- | Definition of Relational Algebra operations. These are the valid -- operations between RTables data ROperation ROperationEmpty :: ROperation -- | Union RUnion :: ROperation -- | Intersection RInter :: ROperation -- | Difference RDiff :: ROperation -- | Projection RPrj :: [ColumnName] -> ROperation [colPrjList] :: ROperation -> [ColumnName] -- | Filter operation (an RPredicate can be any function of the -- signature RTuple -> Bool so it is much more powerful -- than a typical SQL filter expression, which is a boolean expression of -- comparison operators) RFilter :: RPredicate -> ROperation [fpred] :: ROperation -> RPredicate -- | Inner Join (any type of join predicate allowed. Any function with a -- signature of the form: RTuple -> RTuple -> Bool is a -- valid join predicate. I.e., a function which returns True when -- two RTuples must be paired) RInJoin :: RJoinPredicate -> ROperation [jpred] :: ROperation -> RJoinPredicate -- | Left Outer Join RLeftJoin :: RJoinPredicate -> ROperation [jpred] :: ROperation -> RJoinPredicate -- | Right Outer Join RRightJoin :: RJoinPredicate -> ROperation [jpred] :: ROperation -> RJoinPredicate -- | Semi-Join RSemiJoin :: RJoinPredicate -> ROperation [jpred] :: ROperation -> RJoinPredicate -- | Anti-Join RAntiJoin :: RJoinPredicate -> ROperation [jpred] :: ROperation -> RJoinPredicate -- | Performs aggregation operations on specific columns and returns a -- singleton RTable RAggregate :: [RAggOperation] -> ROperation -- | list of aggregates [aggList] :: ROperation -> [RAggOperation] -- | A Group By operation The SQL equivalent is: SELECT colGrByList, -- aggList FROM... GROUP BY colGrByList Note that compared to SQL, -- we can have a more generic grouping predicate (i.e., when two -- RTuples should belong in the same group) than just the equality -- of values on the common columns between two RTuples. Also note, -- that in the case of an aggregation without grouping (equivalent to a -- single-group group by), then the grouping predicate should be: _ -- _ -> True RGroupBy :: RGroupPredicate -> [RAggOperation] -> [ColumnName] -> ROperation -- | the grouping predicate [gpred] :: ROperation -> RGroupPredicate -- | list of aggregates [aggList] :: ROperation -> [RAggOperation] -- | the Group By list of columns [colGrByList] :: ROperation -> [ColumnName] -- | A combination of unary ROperations e.g., (p plist).(f -- pred) (i.e., RPrj . RFilter) , in the form of an RTable -- -> RTable function. In this sense we can also include a -- binary operation (e.g. join), if we partially apply the join to one -- RTable, e.g., -- --
-- (ij jpred rtab) . (p plist) . (f pred) --RCombinedOp :: UnaryRTableOperation -> ROperation [rcombOp] :: ROperation -> UnaryRTableOperation -- | A generic binary ROperation. RBinOp :: BinaryRTableOperation -> ROperation [rbinOp] :: ROperation -> BinaryRTableOperation -- | Order the RTuples of the RTable acocrding to the -- specified list of Columns. First column in the input list has the -- highest priority in the sorting order. ROrderBy :: [(ColumnName, OrderingSpec)] -> ROperation [colOrdList] :: ROperation -> [(ColumnName, OrderingSpec)] -- | A generic unary operation on a RTable type UnaryRTableOperation = RTable -> RTable -- | A generic binary operation on RTable type BinaryRTableOperation = RTable -> RTable -> RTable -- | This data type represents all possible aggregate operations over an -- RTable. Examples are : Sum, Count, Average, Min, Max but it can be any -- other "aggregation". The essential property of an aggregate operation -- is that it acts on an RTable (or on a group of RTuples - in the case -- of the RGroupBy operation) and produces a single RTuple. -- -- An aggregate operation is applied on a specific column (source column) -- and the aggregated result will be stored in the target column. It is -- important to understand that the produced aggregated RTuple is -- different from the input RTuples. It is a totally new RTuple, that -- will consist of the aggregated column(s) (and the grouping columns in -- the case of an RGroupBy). data RAggOperation RAggOperation :: ColumnName -> ColumnName -> (RTable -> RTuple) -> RAggOperation -- | Source column [sourceCol] :: RAggOperation -> ColumnName -- | Target column [targetCol] :: RAggOperation -> ColumnName -- | here we define the aggegate function to be applied on an RTable [aggFunc] :: RAggOperation -> RTable -> RTuple -- | Aggregation Function type. An aggregation function receives as input a -- source column (i.e., a ColumnName) of a source RTable -- and returns an aggregated value, which is the result of the -- aggregation on the values of the source column. type AggFunction = ColumnName -> RTable -> RDataType -- | Returns an RAggOperation with a custom aggregation function -- provided as input raggGenericAgg :: AggFunction -> ColumnName -> ColumnName -> RAggOperation -- | The Sum aggregate operation raggSum :: ColumnName -> ColumnName -> RAggOperation -- | The Count aggregate operation Count aggregation (no distinct) raggCount :: ColumnName -> ColumnName -> RAggOperation -- | The CountDist aggregate operation Count distinct aggregation (i.e., -- count(distinct col) in SQL). Returns the distinct number of -- values for this column. raggCountDist :: ColumnName -> ColumnName -> RAggOperation -- | The CountStar aggregate operation Returns the number of RTuples -- in the RTable (i.e., count(*) in SQL) raggCountStar :: ColumnName -> RAggOperation -- | The Average aggregate operation raggAvg :: ColumnName -> ColumnName -> RAggOperation -- | The Max aggregate operation raggMax :: ColumnName -> ColumnName -> RAggOperation -- | The Min aggregate operation raggMin :: ColumnName -> ColumnName -> RAggOperation -- | The StrAgg aggregate operation This is known as "string_agg"" in -- Postgresql and "listagg" in Oracle. It aggregates the values of a text -- RDataType column with a specified delimiter raggStrAgg :: ColumnName -> ColumnName -> Delimiter -> RAggOperation -- | A Predicate. It defines an arbitrary condition over the columns of an -- RTuple. It is used primarily in the filter RFilter -- operation and used in the filter function f. type RPredicate = RTuple -> Bool -- | The Group By Predicate It defines the condition for two RTuples -- to be included in the same group. type RGroupPredicate = RTuple -> RTuple -> Bool -- | The Join Predicate. It defines when two RTuples should be -- paired. type RJoinPredicate = RTuple -> RTuple -> Bool -- | The Upsert Predicate. It defines when two RTuples should be -- paired in a merge operation. The matching predicate must be applied on -- a specific set of matching columns. The source RTable in the -- Upsert operation must return a unique set of RTuples, if -- grouped by this set of matching columns. Otherwise an exception -- (UniquenessViolationInUpsert) is thrown. data RUpsertPredicate RUpsertPredicate :: [ColumnName] -> (RTuple -> RTuple -> Bool) -> RUpsertPredicate [matchCols] :: RUpsertPredicate -> [ColumnName] [matchPred] :: RUpsertPredicate -> RTuple -> RTuple -> Bool -- | Execute a Unary ROperation runUnaryROperation :: ROperation -> RTable -> RTable -- | ropU operator executes a unary ROperation. A short name for the -- runUnaryROperation function ropU :: ROperation -> RTable -> RTable -- | Execute a Unary ROperation and return an RTabResult runUnaryROperationRes :: ROperation -> RTable -> RTabResult -- | ropUres operator executes a unary ROperation. A short name for the -- runUnaryROperationRes function ropUres :: ROperation -> RTable -> RTabResult -- | Execute a Binary ROperation runBinaryROperation :: ROperation -> RTable -> RTable -> RTable -- | ropB operator executes a binary ROperation. A short name for the -- runBinaryROperation function ropB :: ROperation -> RTable -> RTable -> RTable -- | Execute a Binary ROperation and return an RTabResult runBinaryROperationRes :: ROperation -> RTable -> RTable -> RTabResult -- | ropBres operator executes a binary ROperation. A short name for the -- runBinaryROperationRes function ropBres :: ROperation -> RTable -> RTable -> RTabResult -- | Number of RTuples returned by an RTable operation type RTuplesRet = Sum Int -- | RTabResult is the result of an RTable operation and is a Writer Monad, -- that includes the new RTable, as well as the number of RTuples -- returned by the operation. type RTabResult = Writer RTuplesRet RTable -- | Creates an RTabResult (i.e., a Writer Monad) from a result RTable and -- the number of RTuples that it returned rtabResult :: (RTable, RTuplesRet) -> RTabResult -- | Returns the info "stored" in the RTabResult Writer Monad runRTabResult :: RTabResult -> (RTable, RTuplesRet) -- | Returns the "log message" in the RTabResult Writer Monad, which is the -- number of returned RTuples execRTabResult :: RTabResult -> RTuplesRet -- | Creates an RTuplesRet type rtuplesRet :: Int -> RTuplesRet -- | Return the number embedded in the RTuplesRet data type getRTuplesRet :: RTuplesRet -> Int -- | Function composition. (.) :: (b -> c) -> (a -> b) -> a -> c infixr 9 . -- | Right-to-left composition of Kleisli arrows. -- (>=>), with the arguments flipped. -- -- Note how this operator resembles function composition -- (.): -- --
-- (.) :: (b -> c) -> (a -> b) -> a -> c -- (<=<) :: Monad m => (b -> m c) -> (a -> m b) -> a -> m c --(<=<) :: Monad m => (b -> m c) -> (a -> m b) -> a -> m c infixr 1 <=< -- | Executes an RFilter operation runRfilter :: RPredicate -> RTable -> RTable -- | Filter (i.e. selection operator). A short name for the -- runRFilter function f :: RPredicate -> RTable -> RTable -- | Implements an Inner Join operation between two RTables (any type of -- join predicate is allowed) This Inner Join implementation follows -- Oracle DB's convention for common column names. When we have two -- tuples t1 and t2 with a common column name (lets say "Common"), then -- the resulting tuple after a join will be "Common", "Common_1", so a -- "_1" suffix is appended. The tuple from the left table by convention -- retains the original column name. So "Column_1" is the column from the -- right table. If "Column_1" already exists, then "Column_2" is used. runInnerJoinO :: RJoinPredicate -> RTable -> RTable -> RTable -- | RTable Inner Join Operator. A short name for the -- runInnerJoinO function iJ :: RJoinPredicate -> RTable -> RTable -> RTable -- | Implements a Left Outer Join operation between two RTables (any type -- of join predicate is allowed), i.e., the rows of the left RTable will -- be preserved. Note that when dublicate keys encountered that is, since -- the underlying structure for an RTuple is a Data.HashMap.Strict, only -- one value per key is allowed. So in the context of joining two RTuples -- the value of the left RTuple on the common key will be prefered. -- -- Implements a Left Outer Join operation between two RTables (any type -- of join predicate is allowed), i.e., the rows of the left RTable will -- be preserved. A Left Join : tabLeft LEFT JOIN tabRight ON -- joinPred where tabLeft is the preserving table can be defined -- as: the Union between the following two RTables: -- --
-- An Example:
-- Source RTable: src =
-- Id | Msg | Other
-- ----|---------------|-------
-- 1 | "hello2" |"a"
-- 2 | "world2" |"a"
-- 3 | "new" |"a"
--
-- Target RTable: trg =
-- Id | Msg | Other
-- ----|---------------|-------
-- 1 | "hello1" |"b"
-- 2 | "world1" |"b"
-- 4 | "old" |"b"
-- 5 | "hello" |"b"
--
-- >>> upsertRTab src
-- RUpsertPredicate {matchCols = ["Id"], matchPred = \t1 t2 -> t1 <!> "Id" == t2 <!> "Id" }
-- ["Msg"]
-- (\t -> let
-- msg = case toText (t <!> "Msg") of
-- Just t -> t
-- Nothing -> pack ""
-- in (take 5 msg) == (pack "hello")
-- ) -- Msg like "hello%"
-- trg
--
-- Result RTable: rslt =
-- Id | Msg | Other
-- ----|---------------|-------
-- 1 | "hello2" |"b" (Note that only column "Msg" has been overwritten, as per the 3rd argument)
-- 2 | "world1" |"b"
-- 3 | "new" |"a"
-- 4 | "old" |"b"
-- 5 | "hello" |"b"
--
upsertRTab :: RTable -> RUpsertPredicate -> [ColumnName] -> RPredicate -> RTable -> RTable
-- | Update an RTuple at a specific column specified by name with a value.
-- If the ColumnName exists, then the value is updated with the
-- input value. If the ColumnName does not exist, then a
-- ColumnDoesNotExist exception is thrown.
updateRTuple :: ColumnName -> RDataType -> RTuple -> RTuple
-- | Upsert (update/insert) an RTuple at a specific column specified by
-- name with a value If the cname key is not found then the (columnName,
-- value) pair is inserted. If it exists then the value is updated with
-- the input value.
upsertRTuple :: ColumnName -> RDataType -> RTuple -> RTuple
-- | emptyRTable: Create an empty RTable
emptyRTable :: RTable
-- | Creates an RTable with a single RTuple
createSingletonRTable :: RTuple -> RTable
-- | Creates an RTable from a list of RTuples
rtableFromList :: [RTuple] -> RTable
-- | addColumn: adds a column to an RTable
addColumn :: ColumnName -> RDataType -> RTable -> RTable
-- | removeColumn : removes a column from an RTable. The column is
-- specified by ColumnName. If this ColumnName does not exist in the
-- RTuple of the input RTable then nothing is happened, the RTuple
-- remains intact.
removeColumn :: ColumnName -> RTable -> RTable
-- | Creates an empty RTuple (i.e., one with no column,value mappings)
emptyRTuple :: RTuple
-- | Creates a Null RTuple based on a list of input Column Names. A
-- Null RTuple is an RTuple where all column names
-- correspond to a Null value (Null is a data constructor
-- of RDataType)
createNullRTuple :: [ColumnName] -> RTuple
-- | createRTuple: Create an Rtuple from a list of column names and values
createRTuple :: [(ColumnName, RDataType)] -> RTuple
-- | Create an RTuple from a list
rtupleFromList :: [(ColumnName, RDataType)] -> RTuple
-- | createRDataType: Get a value of type a and return the corresponding
-- RDataType. The input value data type must be an instance of the
-- Typepable typeclass from Data.Typeable
createRDataType :: Typeable a => a -> RDataType
-- | createRTableMData : creates RTableMData from input given in the form
-- of a list We assume that the column order of the input list defines
-- the fixed column order of the RTuple.
createRTableMData :: (RTableName, [(ColumnName, ColumnDType)]) -> [ColumnName] -> [[ColumnName]] -> RTableMData
-- | Get the Column Names of an RTable
getColumnNamesFromRTab :: RTable -> [ColumnName]
-- | Returns the Column Names of an RTuple
getColumnNamesFromRTuple :: RTuple -> [ColumnName]
getColumnInfoFromRTab :: RTable -> [ColumnInfo]
getColumnInfoFromRTuple :: RTuple -> [ColumnInfo]
-- | Take a column value and return its type
getTheType :: RDataType -> ColumnDType
-- | Define equality for two ColumnInfo structures For two column
-- two have "equal structure" they must have the same name and the same
-- type. If one of the two (or both) have an UknownType, then they
-- are still considered of equal structure.
--
-- Creates a list of the form [(ColumnInfo, RDataType)] from a list of
-- ColumnInfo and an RTuple. The returned list respects the order of the
-- [ColumnInfo]. It guarantees that RDataTypes will be in the same column
-- order as [ColumnInfo], i.e., the correct RDataType for the correct
-- column
listOfColInfoRDataType :: [ColumnInfo] -> RTuple -> [(ColumnInfo, RDataType)]
-- | toListColumnName: returns a list of RTuple column names, in the fixed
-- column order of the RTuple.
toListColumnName :: RTupleMData -> [ColumnName]
-- | toListColumnInfo: returns a list of RTuple columnInfo, in the fixed
-- column order of the RTuple
toListColumnInfo :: RTupleMData -> [ColumnInfo]
-- | Compares the structure of the input RTables and returns
-- True if these are the same. By "structure", we mean that the
-- ColumnNames and the corresponding data types must match.
-- Essentially what we record in the ColumnInfo must be the same
-- for the two RTables. Note that in the case of two columns
-- having the same name but one of the two (or both) have a dtype
-- equal to UknownType, then this function assumes that they are
-- the same (i.e., equal ColumnInfos).
rtabsSameStructure :: RTable -> RTable -> Bool
-- | Compares the structure of the input RTuples and returns
-- True if these are the same. By "structure", we mean that the
-- ColumnNames and the corresponding data types must match.
-- Essentially what we record in the ColumnInfo must be the same
-- for the two RTuples
rtuplesSameStructure :: RTuple -> RTuple -> Bool
-- | Receives two lists of ColumnNames and returns the unique list
-- of ColumnNames after concatenating the two and removing the
-- names from the second one that are a prefix of the first one. This
-- function is intended to dedublicate common columns after a join (see
-- ij), where ColA for example, will also appear as
-- ColA_1. This function DOES NOT dedublicate columns ColA
-- and ColAsomeSuffix, only cases like this one ColName_Num
-- (e.g., ColName_1, ColName_2, etc.) Here is an example:
--
-- -- >>> getUniqueColumnNames ["ColA","ColB"] ["ColC","ColA", "ColA_1", "ColA_2", "ColA_A", "ColA_hello", "ColAhello"] -- -- >>> ["ColA","ColB","ColC","ColA_A","ColA_hello","ColAhello"] --getUniqueColumnNamesAfterJoin :: [ColumnName] -> [ColumnName] -> [ColumnName] -- | This exception is thrown whenever we try to access a specific column -- (i.e., ColumnName) of an RTuple and the column does not -- exist. data ColumnDoesNotExist ColumnDoesNotExist :: ColumnName -> ColumnDoesNotExist -- | This exception means that we have tried to do some operation between -- two RTables, which requires that the structure of the two is -- the same. e.g., an Insert Into TAB RTuples, or a -- UNION or toher set operations. By "structure", we mean that -- the ColumnNames and the corresponding data types must match. -- Essentially what we record in the ColumnInfo must be the same -- for the two RTables data ConflictingRTableStructures -- | Error message indicating the operation that failed. ConflictingRTableStructures :: String -> ConflictingRTableStructures -- | Length mismatch between the format String and the input -- String data RTimestampFormatLengthMismatch = -- RTimestampFormatLengthMismatch String String deriving(Eq,Show) -- instance Exception RTimestampFormatLengthMismatch -- -- One (or both) of the input Strings to function -- toRTimestamp are empty data EmptyInputStringsInToRTimestamp EmptyInputStringsInToRTimestamp :: String -> String -> EmptyInputStringsInToRTimestamp -- | This exception is thrown whenever we provide a Timestamp format with -- not even one valid format pattern data UnsupportedTimeStampFormat UnsupportedTimeStampFormat :: String -> UnsupportedTimeStampFormat -- | This exception means that we have tried an Upsert operation where the -- source RTable does not have a unique set of Rtuples if -- grouped by the columns used in the matching condition. This simply -- means that we cannot determine which of the dublicate RTuples -- in the source RTable will overwrite the target RTable, -- when the matching condition is satisfied. data UniquenessViolationInUpsert -- | Error message UniquenessViolationInUpsert :: String -> UniquenessViolationInUpsert -- | printRTable : Print the input RTable on screen printRTable :: RTable -> IO () -- | Safe printRTable alternative that returns an Either, so -- as to give the ability to handle exceptions gracefully, during the -- evaluation of the input RTable. Example: -- --
-- do -- p <- (eitherPrintRTable printRTable myRTab) :: IO (Either SomeException ()) -- case p of -- Left exc -> putStrLn $ "There was an error in the Julius evaluation: " ++ (show exc) -- Right _ -> return () --eitherPrintRTable :: Exception e => (RTable -> IO ()) -> RTable -> IO (Either e ()) -- | prints an RTable with an RTuple format specification. It can be used -- instead of printRTable when one of the following two is -- required: -- --
-- do -- p <- (eitherPrintfRTable printfRTable myFormat myRTab) :: IO (Either SomeException ()) -- case p of -- Left exc -> putStrLn $ "There was an error in the Julius evaluation: " ++ (show exc) -- Right _ -> return () --eitherPrintfRTable :: Exception e => (RTupleFormat -> RTable -> IO ()) -> RTupleFormat -> RTable -> IO (Either e ()) -- | Basic data type for defining the desired formatting of an -- RTuple when printing an RTable (see printfRTable). data RTupleFormat RTupleFormat :: [ColumnName] -> ColFormatMap -> RTupleFormat -- | For defining the column ordering (i.e., the SELECT clause in SQL) [colSelectList] :: RTupleFormat -> [ColumnName] -- | For defining the formating per Column in "printf style" [colFormatMap] :: RTupleFormat -> ColFormatMap -- | A map of ColumnName to Format Specification type ColFormatMap = HashMap ColumnName FormatSpecifier -- | Format specifier of printf style data FormatSpecifier DefaultFormat :: FormatSpecifier Format :: String -> FormatSpecifier -- | A sum type to help the specification of a column ordering (Ascending, -- or Descending) data OrderingSpec Asc :: OrderingSpec Desc :: OrderingSpec -- | Generate an RTupleFormat data type instance genRTupleFormat :: [ColumnName] -> ColFormatMap -> RTupleFormat -- | Generate a default RTupleFormat data type instance. In this case the -- returned column order (Select list), will be unspecified and dependant -- only by the underlying structure of the RTuple (HashMap) genRTupleFormatDefault :: RTupleFormat -- | Generates a Column Format Specification genColFormatMap :: [(ColumnName, FormatSpecifier)] -> ColFormatMap -- | Generates a default Column Format Specification genDefaultColFormatMap :: ColFormatMap instance GHC.Classes.Eq RTable.Core.ColumnDType instance GHC.Show.Show RTable.Core.ColumnDType instance GHC.Show.Show RTable.Core.IgnoreDefault instance GHC.Classes.Eq RTable.Core.IgnoreDefault instance GHC.Generics.Generic RTable.Core.RTimestamp instance GHC.Read.Read RTable.Core.RTimestamp instance GHC.Show.Show RTable.Core.RTimestamp instance GHC.Generics.Generic RTable.Core.RDataType instance GHC.Read.Read RTable.Core.RDataType instance GHC.Show.Show RTable.Core.RDataType instance GHC.Classes.Eq RTable.Core.ColumnInfo instance GHC.Show.Show RTable.Core.ColumnInfo instance GHC.Classes.Eq RTable.Core.RTableMData instance GHC.Show.Show RTable.Core.RTableMData instance GHC.Classes.Eq RTable.Core.OrderingSpec instance GHC.Show.Show RTable.Core.OrderingSpec instance GHC.Show.Show RTable.Core.FormatSpecifier instance GHC.Classes.Eq RTable.Core.FormatSpecifier instance GHC.Show.Show RTable.Core.RTupleFormat instance GHC.Classes.Eq RTable.Core.RTupleFormat instance GHC.Show.Show RTable.Core.ColumnDoesNotExist instance GHC.Classes.Eq RTable.Core.ColumnDoesNotExist instance GHC.Show.Show RTable.Core.UnsupportedTimeStampFormat instance GHC.Classes.Eq RTable.Core.UnsupportedTimeStampFormat instance GHC.Show.Show RTable.Core.EmptyInputStringsInToRTimestamp instance GHC.Classes.Eq RTable.Core.EmptyInputStringsInToRTimestamp instance GHC.Show.Show RTable.Core.ConflictingRTableStructures instance GHC.Classes.Eq RTable.Core.ConflictingRTableStructures instance GHC.Show.Show RTable.Core.UniquenessViolationInUpsert instance GHC.Classes.Eq RTable.Core.UniquenessViolationInUpsert instance GHC.Exception.Type.Exception RTable.Core.UniquenessViolationInUpsert instance GHC.Exception.Type.Exception RTable.Core.ConflictingRTableStructures instance GHC.Exception.Type.Exception RTable.Core.EmptyInputStringsInToRTimestamp instance GHC.Exception.Type.Exception RTable.Core.UnsupportedTimeStampFormat instance GHC.Exception.Type.Exception RTable.Core.ColumnDoesNotExist instance Control.DeepSeq.NFData RTable.Core.RDataType instance GHC.Classes.Eq RTable.Core.RDataType instance GHC.Classes.Ord RTable.Core.RDataType instance GHC.Num.Num RTable.Core.RDataType instance GHC.Real.Fractional RTable.Core.RDataType instance Control.DeepSeq.NFData RTable.Core.RTimestamp instance GHC.Classes.Eq RTable.Core.RTimestamp instance GHC.Classes.Ord RTable.Core.RTimestamp -- | This is an internal module (i.e., not to be imported directly) that -- implements the core ETL functionality that is exposed via the -- Julius EDSL for ETL/ELT found in the Etl.Julius module) module Etl.Internal.Core -- | This is the basic data type to define the column-to-column mapping -- from a source RTable to a target RTable. Essentially, an -- RColMapping represents the column-level transformations of an -- RTuple that will yield a target RTuple. -- -- A mapping is simply a triple of the form ( Source-Column(s), -- Target-Column(s), Transformation, RTuple-Filter), where we define the -- source columns over which a transformation (i.e. a function) will be -- applied in order to yield the target columns. Also, an -- RPredicate (i.e. a filter) might be applied on the source -- RTuple. Remember that an RTuple is essentially a mapping -- between a key (the Column Name) and a value (the RDataType -- value). So the various RColMapping data constructors below -- simply describe the possible modifications of an RTuple -- orginating from its own columns. -- -- So, we can have the following mapping types: a) single-source column -- to single-target column mapping (1 to 1), the source column will be -- removed or not based on the removeSrcCol flag (dublicate column -- names are not allowed in an RTuple) b) multiple-source columns -- to single-target column mapping (N to 1), The N columns will be merged -- to the single target column based on the transformation. The N columns -- will be removed from the RTuple or not based on the -- removeSrcCol flag (dublicate column names are not allowed in an -- RTuple) c) single-source column to multiple-target columns -- mapping (1 to M) the source column will be "expanded" to M target -- columns based ont he transformation. the source column will be removed -- or not based on the removeSrcCol flag (dublicate column names -- are not allowed in an RTuple) d) multiple-source column to -- multiple target columns mapping (N to M) The N source columns will be -- mapped to M target columns based on the transformation. The N columns -- will be removed from the RTuple or not based on the -- removeSrcCol flag (dublicate column names are not allow in an -- RTuple) -- -- Some examples of mapping are the following: -- --
-- (Start_Date, No, StartDate, t -> True) -- copy the source value to target and dont remove the source column, so the target RTuple will have both columns Start_Date and StartDate -- -- with the exactly the same value) -- -- ([Amount, Discount], Yes, FinalAmount, ([a, d] -> a * d) ) -- FinalAmount is a derived column based on a function applied to the two source columns. -- -- In the final RTuple we remove the two source columns. -- ---- -- An RColMapping can be applied with the runCM -- (runColMapping) operator data RColMapping ColMapEmpty :: RColMapping -- | single-source column to single-target column mapping (1 to 1). RMap1x1 :: ColumnName -> YesNo -> ColumnName -> (RDataType -> RDataType) -> RPredicate -> RColMapping [srcCol] :: RColMapping -> ColumnName [removeSrcCol] :: RColMapping -> YesNo [trgCol] :: RColMapping -> ColumnName [transform1x1] :: RColMapping -> RDataType -> RDataType [srcRTupleFilter] :: RColMapping -> RPredicate -- | multiple-source columns to single-target column mapping (N to 1) RMapNx1 :: [ColumnName] -> YesNo -> ColumnName -> ([RDataType] -> RDataType) -> RPredicate -> RColMapping [srcColGrp] :: RColMapping -> [ColumnName] [removeSrcCol] :: RColMapping -> YesNo [trgCol] :: RColMapping -> ColumnName [transformNx1] :: RColMapping -> [RDataType] -> RDataType [srcRTupleFilter] :: RColMapping -> RPredicate -- | single-source column to multiple-target columns mapping (1 to N) RMap1xN :: ColumnName -> YesNo -> [ColumnName] -> (RDataType -> [RDataType]) -> RPredicate -> RColMapping [srcCol] :: RColMapping -> ColumnName [removeSrcCol] :: RColMapping -> YesNo [trgColGrp] :: RColMapping -> [ColumnName] [transform1xN] :: RColMapping -> RDataType -> [RDataType] [srcRTupleFilter] :: RColMapping -> RPredicate -- | multiple-source column to multiple target columns mapping (N to M) RMapNxM :: [ColumnName] -> YesNo -> [ColumnName] -> ([RDataType] -> [RDataType]) -> RPredicate -> RColMapping [srcColGrp] :: RColMapping -> [ColumnName] [removeSrcCol] :: RColMapping -> YesNo [trgColGrp] :: RColMapping -> [ColumnName] [transformNxM] :: RColMapping -> [RDataType] -> [RDataType] [srcRTupleFilter] :: RColMapping -> RPredicate -- | A Column Transformation function data type. It is used in order to -- define an arbitrary column-level transformation (i.e., from a list of -- N input Column-Values we produce a list of M derived (output) -- Column-Values). A Column value is represented with the -- RDataType. type ColXForm = [RDataType] -> [RDataType] -- | Constructs an RColMapping. This is the suggested method for creating a -- column mapping and not by calling the data constructors directly. createColMapping :: [ColumnName] -> [ColumnName] -> ColXForm -> YesNo -> RPredicate -> RColMapping -- | An ETL operation applied to an RTable can be either an -- ROperation (a relational agebra operation like join, filter -- etc.) defined in RTable.Core module, or an RColMapping -- applied to an RTable data ETLOperation ETLrOp :: ROperation -> ETLOperation [rop] :: ETLOperation -> ROperation ETLcOp :: RColMapping -> ETLOperation [cmap] :: ETLOperation -> RColMapping -- | ETLmapping : it is the equivalent of a mapping in an ETL tool and -- consists of a series of ETLOperations that are applied, one-by-one, to -- some initial input RTable, but if binary ETLOperations are included in -- the ETLMapping, then there will be more than one input RTables that -- the ETLOperations of the ETLMapping will be applied to. When we apply -- (i.e., run) an ETLOperation of the ETLMapping we get a new RTable, -- which is then inputed to the next ETLOperation, until we finally run -- all ETLOperations. The purpose of the execution of an ETLMapping is to -- produce a single new RTable as the result of the execution of all the -- ETLOperations of the ETLMapping. In terms of database operations an -- ETLMapping is the equivalent of an CREATE AS SELECT (CTAS) operation -- in an RDBMS. This means that anything that can be done in the SELECT -- part (i.e., column projection, row filtering, grouping and join -- operations, etc.) in order to produce a new table, can be included in -- an ETLMapping. -- -- An ETLMapping is executed with the etl (runETLmapping) operator -- -- Implementation: An ETLMapping is implemented as a binary tree where -- the node represents the ETLOperation to be executed and the left -- branch is another ETLMapping, while the right branch is an RTable -- (that might be empty in the case of a Unary ETLOperation). Execution -- proceeds from bottom-left to top-right. This is similar in concept to -- a left-deep join tree. In a Left-Deep ETLOperation tree the "pipe" of -- ETLOperations comes from the left branches always. The leaf node is -- always an ETLMapping with an ETLMapEmpty in the left branch and an -- RTable in the right branch (the initial RTable inputed to the -- ETLMapping). In this way, the result of the execution of each -- ETLOperation (which is an RTable) is passed on to the next -- ETLOperation. Here is an example: -- --
-- A Left-Deep ETLOperation Tree -- -- final RTable result -- / -- etlOp3 -- / -- etlOp2 rtab2 -- / -- A leaf-node --> etlOp1 emptyRTab -- / -- ETLMapEmpty rtab1 ---- -- You see that always on the left branch we have an ETLMapping data type -- (i.e., a left-deep ETLOperation tree). So how do we implement the -- following case? -- --
-- final RTable result -- / -- A leaf-node --> etlOp1 -- / -- rtab1 rtab2 ---- -- The answer is that we "model" the left RTable (rtab1 in our example) -- as an ETLMapping of the form: -- --
-- ETLMapLD { etlOp = ETLcOp{cmap = ColMapEmpty}, tabL = ETLMapEmpty, tabR = rtab1 }
--
--
-- So we embed the rtab1 in a ETLMapping, which is a leaf (i.e., it has
-- an empty prevMap), the rtab1 is in the right branch (tabR) and the
-- ETLOperation is the EmptyColMapping, which returns its input RTable
-- when executed. We can use function rtabToETLMapping for this
-- job. So it becomes A leaf-node --> etlOp1 / rtabToETLMapping
-- rtab1 rtab2
--
-- In this manner, a leaf-node can also be implemented like this:
--
-- -- final RTable result -- / -- etlOp3 -- / -- etlOp2 rtab2 -- / -- A leaf-node --> etlOp1 emptyRTab -- / -- rtabToETLMapping rtab1 emptyRTable --data ETLMapping -- | an empty node ETLMapEmpty :: ETLMapping -- | a Left-Deep node ETLMapLD :: ETLOperation -> ETLMapping -> RTable -> ETLMapping -- | the ETLOperation to be executed [etlOp] :: ETLMapping -> ETLOperation -- | the left-branch corresponding to the previous ETLOperation, which is -- input to this one. [tabL] :: ETLMapping -> ETLMapping -- | the right branch corresponds to another RTable (for binary ETL -- operations). If this is a Unary ETLOperation then this field must be -- an empty RTable. [tabR] :: ETLMapping -> RTable -- | a Right-Deep node ETLMapRD :: ETLOperation -> RTable -> ETLMapping -> ETLMapping -- | the ETLOperation to be executed [etlOp] :: ETLMapping -> ETLOperation -- | the left-branch corresponds to another RTable (for binary ETL -- operations). If this is a Unary ETLOperation then this field must be -- an empty RTable. [tabLrd] :: ETLMapping -> RTable -- | the right branch corresponding to the previous ETLOperation, which is -- input to this one. [tabRrd] :: ETLMapping -> ETLMapping -- | a Balanced node ETLMapBal :: ETLOperation -> ETLMapping -> ETLMapping -> ETLMapping -- | the ETLOperation to be executed [etlOp] :: ETLMapping -> ETLOperation -- | the left-branch corresponding to the previous ETLOperation, which is -- input to this one. If this is a Unary ETLOperation then this field -- might be an empty ETLMapping. [tabLbal] :: ETLMapping -> ETLMapping -- | the right branch corresponding corresponding to the previous -- ETLOperation, which is input to this one. -- If this is a Unary -- ETLOperation then this field might be an empty ETLMapping. [tabRbal] :: ETLMapping -> ETLMapping data YesNo Yes :: YesNo No :: YesNo -- | runCM operator executes an RColMapping If a target-column has the same -- name with a source-column and a DontRemoveSrc (i.e., removeSrcCol == -- No) has been specified, then the (target-column, target-value) -- key-value pair, overwrites the corresponding (source-column, -- source-value) key-value pair runCM :: RColMapping -> RTable -> RTable -- | executes a Unary ETL Operation etlOpU :: ETLOperation -> RTable -> RTable -- | executes a Binary ETL Operation etlOpB :: ETLOperation -> RTable -> RTable -> RTable -- | This operator executes an ETLMapping etl :: ETLMapping -> RTable -- | This operator executes an ETLMapping and returns the -- RTabResult Writer Monad that embedds apart from the -- resulting RTable, also the number of RTuples returned etlRes :: ETLMapping -> RTabResult -- | Model an RTable as an ETLMapping which when executed -- will return the input RTable rtabToETLMapping :: RTable -> ETLMapping -- | Creates a left-deep leaf ETL Mapping, of the following form: -- --
-- A Left-Deep ETLOperation Tree -- -- final RTable result -- / -- etlOp3 -- / -- etlOp2 rtab2 -- / -- A leaf-node --> etlOp1 emptyRTab -- / -- ETLMapEmpty rtab1 --createLeafETLMapLD :: ETLOperation -> RTable -> ETLMapping -- | creates a Binary operation leaf node of the form: -- --
-- A leaf-node --> etlOp1 -- / -- rtabToETLMapping rtab1 rtab2 --createLeafBinETLMapLD :: ETLOperation -> RTable -> RTable -> ETLMapping -- | Connects an ETL Mapping to a left-deep ETL Mapping tree, of the form -- --
-- A Left-Deep ETLOperation Tree -- -- final RTable result -- / -- etlOp3 -- / -- etlOp2 rtab2 -- / -- A leaf-node --> etlOp1 emptyRTab -- / -- ETLMapEmpty rtab1 ---- -- Example: -- --
-- -- connect a Unary ETL mapping (etlOp2) -- -- etlOp2 -- / -- etlOp1 emptyRTab -- -- => connectETLMapLD etlOp2 emptyRTable prevMap -- -- -- connect a Binary ETL Mapping (etlOp3) -- -- etlOp3 -- / -- etlOp2 rtab2 -- -- => connectETLMapLD etlOp3 rtab2 prevMap ---- -- Note that the right branch (RTable) appears first in the list of input -- arguments of this function and the left branch (ETLMapping) appears -- second. This is strange, and one could thought that it is a mistake -- (i.e., the left branch should appear first and the right branch -- second) since we are reading from left to right. However this was a -- deliberate choice, so that we leave the left branch (which is the -- connection point with the previous ETLMapping) as the last argument, -- and thus we can partially apply the argumenets and get a new function -- with input parameter only the previous mapping. This is very helpfull -- in function composition connectETLMapLD :: ETLOperation -> RTable -> ETLMapping -> ETLMapping instance GHC.Show.Show Etl.Internal.Core.YesNo instance GHC.Classes.Eq Etl.Internal.Core.YesNo instance GHC.Classes.Eq Etl.Internal.Core.ETLMapping -- | Julius is a type-level Embedded Domain Specific Language -- (EDSL) for ETL/ELT data processing in Haskell. Julius enables us -- to express complex data transformation flows (i.e., an arbitrary -- combination of ETL operations) in a more friendly manner (a Julius -- Expression), with plain Haskell code (no special language for ETL -- scripting required). For more information read this Julius -- Tutorial. -- --
-- EtlMapStart :-> <ETLOpExpr> :-> <ETLOpExpr> :-> ... :-> <ETLOpExpr> -- -- equivalently -- EtlMapStart :-> <ETLOpExpr> -- :-> <ETLOpExpr> -- :-> ... -- :-> <ETLOpExpr> ---- -- A Named ETL Operation Expression (NamedMap) is just an ETL -- Operation with a name, so as to be able to reference this specific -- step in the chain of ETL Operations. It is actually a named -- intermediate result, which can reference and use in other parts of -- our Julius expression. It is similar in notion to a subquery, known as -- an INLINE VIEW, or better, it is equivalent to the WITH -- clause in SQL (i.e., also called subquery factoring in SQL parlance) -- For example: -- --
-- EtlMapStart :-> <ETLOpExpr> -- :=> NamedResult "my_intermdt_result" <ETLOpExpr> -- :-> ... -- :-> <ETLOpExpr> ---- -- An ETL Operation Expression (ETLOpExpr) - a.k.a. a Julius -- Expression - is either a Column Mapping Expression -- (ColMappingExpr), or a Relational Operation Expression -- (ROpExpr). The former is used in order to express a Column -- Mapping (i.e., an arbitrary transformation at the column level, -- with which we can create any derived column based on existing columns, -- see RColMapping) and the latter is Relational Operation -- (Selection, Projection, Join, Outer Join, Group By, Order By, -- Aggregate, or a generic Unary or Binary RTable operation, see -- ROperation) -- --
-- import Etl.Julius
-- import RTable.Data.CSV (CSV, readCSV, writeCSV, toRTable)
--
-- -- 1. Define table metadata
-- -- E.g.,
-- src_DBTab_MData :: RTableMData
-- src_DBTab_MData =
-- createRTableMData ( "sourceTab" -- table name
-- ,[ ("OWNER", Varchar) -- Owner of the table
-- ,("TABLE_NAME", Varchar) -- Name of the table
-- ,("TABLESPACE_NAME", Varchar) -- Tablespace name
-- ,("STATUS",Varchar) -- Status of the table object (VALID/IVALID)
-- ,("NUM_ROWS", Integer) -- Number of rows in the table
-- ,("BLOCKS", Integer) -- Number of Blocks allocated for this table
-- ,("LAST_ANALYZED", Timestamp "MMDDYYYY HH24:MI:SS") -- Timestamp of the last time the table was analyzed (i.e., gathered statistics)
-- ]
-- )
-- ["OWNER", "TABLE_NAME"] -- primary key
-- [] -- (alternative) unique keys
--
-- -- Result RTable metadata
-- result_tab_MData :: RTableMData
-- result_tab_MData = ...
--
-- -- 2. Define your ETL code
-- -- E.g.,
-- myEtl :: [RTable] -> [RTable]
-- myEtl [rtab] =
-- -- 3. Define your Julius Expression(s)
-- let jul =
-- EtlMapStart
-- :-> (EtlR $
-- ROpStart
-- :. (...)
-- ...
-- -- 4. Evaluate Julius to the Result RTable
-- in [juliusToRTable jul]
--
-- main :: IO ()
-- main = do
--
-- -- 5. read source csv files
-- -- E.g.,
-- srcCSV <- readCSV "./app/test-data.csv"
--
-- -- 6. Convert CSV to an RTable and do your ETL
-- [resultRTab] <- runETL myETL $ [toRTable src_DBTab_MData srcCSV]
--
-- -- 7. Print your results on screen
-- -- E.g.,
-- printfRTable (genRTupleFormat ["OWNER", "TABLE_NAME","LAST_ANALYZED"] genDefaultColFormatMap) $ resultRTab
--
-- -- 8. Save your result to a CSV file
-- -- E.g.,
-- writeCSV "./app/result-data.csv" $
-- fromRTable result_tab_MData resultRTab
--
--
-- -- SELECT * -- FROM expenses exp -- WHERE exp.category = 'FOOD:SUPER_MARKET' -- AND exp.amount > 50.00 ---- --
-- juliusToRTable $ -- EtlMapStart -- :-> (EtlR $ -- ROpStart -- :. (Filter (From $ Tab expenses) $ FilterBy myFpred)) -- -- myFpred :: RPredicate -- myFpred = \t -> t <!> "category" == "FOOD:SUPER_MARKET" -- && -- t <!> "amount" > 50.00 ---- --
-- SELECT "TxTimeStamp", "Category", "Description", "Amount" -- FROM expenses exp -- WHERE exp.category = 'FOOD:SUPER_MARKET' -- AND exp.amount > 50.00 ---- --
-- juliusToRTable $ -- EtlMapStart -- :-> (EtlR $ -- ROpStart -- :. (Filter (From $ Tab expenses) $ FilterBy myFpred) -- :. (Select ["TxTimeStamp", "Category","Description", "Amount"] $ From Previous)) -- -- myFpred :: RPredicate -- myFpred = \t -> t <!> "category" == "FOOD:SUPER_MARKET" -- && -- t <!> "amount" > 50.00 ---- --
-- SELECT "TxTimeStamp", "Category", "Description", "Amount" -- FROM expenses exp -- WHERE exp.category = 'FOOD:SUPER_MARKET' -- AND exp.amount > 50.00 -- ORDER BY "TxTimeStamp" DESC ---- --
-- juliusToRTable $
-- EtlMapStart
-- :-> (EtlR $
-- ROpStart
-- :. (Filter (From $ Tab expenses) $ FilterBy myFpred)
-- :. (Select ["TxTimeStamp", "Category","Description", "Amount"] $ From Previous)
-- :. (OrderBy [("TxTimeStamp", Desc)] $ From Previous))
--
-- myFpred :: RPredicate
-- myFpred = \t -> t <!> "category" == "FOOD:SUPER_MARKET"
-- &&
-- t <!> "amount" > 50.00
--
--
--
-- SELECT "Category", sum("Amount") AS "TotalAmount"
-- FROM expenses exp
-- GROUP BY "Category"
-- ORDER BY "TotalAmount" DESC
--
--
--
-- juliusToRTable $
-- EtlMapStart
-- :-> (EtlR $
-- ROpStart
-- :. (GroupBy ["Category"]
-- (AggOn [Sum "Amount" $ As "TotalAmount"] (From $ Tab expenses)) $
-- GroupOn (t1 t2 -> t1 <!> "Category" == t2 <!> "Category")
-- )
-- :. (OrderBy [ ("TotalAmount", Desc)] $ From Previous))
--
--
--
-- WITH exp
-- as (
-- SELECT "Category", sum("Amount") AS "TotalAmount"
-- FROM expenses
-- GROUP BY "Category"
-- )
-- SELECT exp."Category", exp."TotalAmount", bdg."YearlyBudget"
-- FROM budget bdg RIGHT JOIN exp ON (bdg."Category" = exp."Category")
-- ORDER BY exp."TotalAmount" DESC
--
--
--
-- juliusToRTable $
-- EtlMapStart
-- :-> (EtlR $
-- ROpStart
-- :. (GroupBy ["Category"]
-- (AggOn [Sum "Amount" $ As "TotalAmount"] (From $ Tab expenses)) $
-- GroupOn (t1 t2 -> t1 <!> "Category" == t2 <!> "Category")
-- )
-- -- >>> A Right Outer Join that preserves the Previous result RTuples and joins with the budget table
-- :. (RJoin (TabL budget) Previous $
-- JoinOn (tl tr ->
-- tl <!> "Category" == tr <!> "Category")
-- )
-- )
-- :. (OrderBy [ ("TotalAmount", Desc)] $ From Previous))
--
--
--
-- WITH exp
-- as (
-- SELECT "Category", sum("Amount") AS "TotalAmount"
-- FROM expenses
-- GROUP BY "Category"
-- )
-- SELECT exp."Category", exp."TotalAmount", bdg."YearlyBudget", bdg."YearlyBudget" - exp."TotalAmount" AS "ResidualAmount"
-- FROM budget bdg RIGHT JOIN exp ON (bdg."Category" = exp."Category")
-- ORDER BY exp."TotalAmount" DESC
--
--
--
-- juliusToRTable $
-- EtlMapStart
-- :-> (EtlR $
-- ROpStart
-- :. (GroupBy ["Category"]
-- (AggOn [Sum "Amount" $ As "TotalAmount"] (From $ Tab expenses)) $
-- GroupOn (t1 t2 -> t1 <!> "Category" == t2 <!> "Category")
-- )
-- :. (RJoin (TabL budget) Previous $
-- JoinOn (tl tr ->
-- tl <!> "Category" == tr <!> "Category")
-- )
-- )
-- :. (Select ["Category", "TotalAmount", "YearlyBudget"] $ From Previous)
-- :. (OrderBy [ ("TotalAmount", Desc)] $ From Previous)
-- )
-- -- >>> A Column Mapping to create a derived column ("ResidualAmount"")
-- :-> (EtlC $
-- Source ["TotalAmount", "YearlyBudget"] $
-- Target ["ResidualAmount"] $
-- By ([totAmount, yearlyBudget] -> [yearlyBudget - totAmount])
-- (On Previous) DontRemoveSrc $ FilterBy (t -> True)
-- )
--
--
--
-- WITH detailYearTXTab
-- as (
-- SELECT "TxTimeStamp", "Category", "Description", "Amount","DebitCredit"
-- FROM txTab
-- WHERE to_number(to_char("TxTimeStamp", YYYY)) >= yearInput
-- AND
-- to_number(to_char("TxTimeStamp", YYYY)) < yearInput + 1
-- ),
-- expGroupbyCategory
-- as (
-- SELECT "Category", sum ("Amount") AS "AmountSpent"
-- FROM detailYearTXTab
-- WHERE
-- "DebitCredit" = "D"
-- GROUP BY "Category"
-- ORDER BY 2 DESC
-- ),
-- revGroupbyCategory
-- as (
-- SELECT "Category", sum("Amount") AS "AmountReceived"
-- FROM detailYearTXTab
-- WHERE
-- "DebitCredit" = "C"
-- GROUP BY "Category"
-- ORDER BY 2 DESC
-- ),
-- ojoinedWithBudget
-- as(
-- SELECT "Category", "AmountSpent", YearlyBudget"
-- FROM budget bdg RIGHT JOIN expGroupbyCategory exp ON (bdg."Category" = exp."Category")
-- ),
-- calculatedFields
-- as(
-- SELECT "Category", "AmountSpent", "YearlyBudget", "YearlyBudget" - "AmountSpent" AS "ResidualAmount"
-- FROM ojoinedWithBudget
-- )
-- SELECT *
-- FROM calculatedFields
--
--
--
-- let
-- julExpr =
-- -- 1. get detailed transactions of the year
-- :=> NamedResult "detailYearTXtab" (EtlR $
-- ROpStart
-- -- keep RTuples only of the specified year
-- :. (Filter (From $ Tab txTab) $
-- FilterBy (t -> rtime (t <!> "TxTimeStamp") >=
-- RTimestampVal {year = yearInput, month = 1, day = 1, hours24 = 0, minutes = 0, seconds = 0}
-- &&
-- rtime (t <!> "TxTimeStamp") <
-- RTimestampVal { year = yearInput + 1,
-- month = 1,
-- day = 1, hours24 = 0, minutes = 0, seconds = 0})
-- )
-- -- keep only columns of interest
-- :. (Select ["TxTimeStamp", "Category", "Description", "Amount","DebitCredit"] $ From Previous)
-- :. (OrderBy [("TxTimeStamp", Asc)] $ From Previous)
-- )
-- -- 2. expenses group by category
-- :=> NamedResult "expGroupbyCategory" (EtlR $
-- ROpStart
-- -- keep only the "debit" transactions
-- :. (FilterBy (From Previous) $
-- FilterBy (t -> t <!> "DebitCredit" == "D")
-- )
-- :. (GroupBy ["Category"]
-- (AggOn [Sum "Amount" $ As "AmountSpent" ] $ From Previous) $
-- GroupOn (t1 t2 -> t1 <!> "Category" == t2 <!> "Category")
-- )
-- :. (OrderBy [("AmountSpent", Desc)] $ From Previous)
-- )
-- -- 3. revenues group by category
-- :=> NamedResult "revGroupbyCategory" (EtlR $
-- ROpStart
-- -- keep only the "credit" transactions
-- :. (FilterBy (From $ juliusToRTable $ takeNamedResult "detailYearTXtab" julExpr) $
-- FilterBy (t -> t <!> "DebitCredit" == "C")
-- )
-- :. (GroupBy ["Category"]
-- (AggOn [Sum "Amount" $ As "AmountReceived" ] $ From Previous) $
-- GroupOn (t1 t2 -> t1 <!> "Category" == t2 <!> "Category")
-- )
-- :. (OrderBy [("AmountReceived", Desc)] $ From Previous)
-- )
-- -- 3. Expenses Group By Category Outer joined with budget info
-- :=> NamedResult "ojoinedWithBudget" (EtlR $
-- ROpStart
-- :. (RJoin (TabL budget) (Tab $ juliusToRTable $ takeNamedResult "expGroupbyCategory" julExpr) $
-- JoinOn (tl tr ->
-- tl <!> "Category" == tr <!> "Category")
-- )
-- )
-- :. (Select ["Category", "AmountSpent", "YearlyBudget"] $ From Previous)
-- :. (OrderBy [ ("TotalAmount", Desc)] $ From Previous)
-- )
-- -- 4. A Column Mapping to create a derived column ("ResidualAmount")
-- :=> NamedResult "calculatedFields" (EtlC $
-- Source ["AmountSpent", "YearlyBudget"] $
-- Target ["ResidualAmount"] $
-- By ([amountSpent, yearlyBudget] -> [yearlyBudget - amountSpent])
-- (On Previous) DontRemoveSrc $ FilterBy (t -> True)
-- )
--
-- -- 5. Print detail transactions
-- printRTable $ juliusToRTable $ takeNamedResult "detailYearTXtab" julExpr
--
-- -- 6. Print Expenses by Category
-- printRTable $ juliusToRTable $ takeNamedResult "expGroupbyCategory" julExpr
--
-- -- 7. Print Expenses with Budgeting Info and Residual Amount
-- printRTable $ juliusToRTable $ takeNamedResult "calculatedFields" julExpr
--
-- -- equivalently
-- printRTable $ juliusToRTable julExpr
--
-- -- 8. Print Revenues by Category
-- printRTable $ juliusToRTable $ takeNamedResult "revGroupbyCategory" julExpr
--
--
-- Explanation of each named result in the above example:
--
-- -- FilterBy (\_ -> True) --data ByFunction By :: ColXForm -> OnRTable -> RemoveSrcCol -> ByPred -> ByFunction -- | Defines the RTable that the current operation will be applied -- to. data OnRTable On :: TabExpr -> OnRTable -- | A Table Expression defines the RTable on which the current ETL -- Operation will be applied. If the Previous constructor is used, -- then this RTable is the result of the previous ETL Operations -- in the current Julius Expression (ETLMappingExpr) data TabExpr Tab :: RTable -> TabExpr Previous :: TabExpr -- | Indicator of whether the source column(s) in a Column Mapping will be -- removed or not (used in ColMappingExpr) If a target-column has -- the same name with a source-column and a DontRemoveSrc has been -- specified, then the (target-column, target-value) key-value pair, -- overwrites the corresponding (source-column, source-value) key-value -- pair. data RemoveSrcCol RemoveSrc :: RemoveSrcCol DontRemoveSrc :: RemoveSrcCol -- | An RTuple predicate clause. data ByPred FilterBy :: RPredicate -> ByPred -- | Predicate for Deletion Operation data ByDelPred Where :: RPredicate -> ByDelPred -- | The Set sub-clause of an Update RTable clause. It -- specifies each column to be updated along with the new value. data SetColumns Set :: [(ColumnName, RDataType)] -> SetColumns -- | A Relational Operation Expression (ROpExpr) is a sequence of -- one or more Relational Algebra Operations applied on a input -- RTable. It is a sub-expression within a Julius Expression -- (ETLMappingExpr) and we use it whenever we want to apply -- relational algebra operations on an RTable (which might be the result -- of previous operations in a Julius Expression). A Julius Expression -- (ETLMappingExpr) can contain an arbitrary number of -- ROpExprs. The relational operation connector :. is left -- associative because in a ROpExpr operations are evaluated from -- left to right (or top to bottom). data ROpExpr ROpStart :: ROpExpr (:.) :: ROpExpr -> RelationalOp -> ROpExpr infixl 6 :. -- | The Relational Operation (RelationalOp) is a Julius clause that -- represents a Relational Algebra Operation. data RelationalOp -- | RTuple filtering clause (selection operation), based on an -- arbitrary predicate function (RPredicate) Filter :: FromRTable -> ByPred -> RelationalOp -- | Column projection clause Select :: [ColumnName] -> FromRTable -> RelationalOp -- | Aggregate Operation clause Agg :: Aggregate -> RelationalOp -- | Group By clause, based on an arbitrary Grouping predicate function -- (RGroupPredicate) GroupBy :: [ColumnName] -> Aggregate -> GroupOnPred -> RelationalOp -- | Inner Join clause, based on an arbitrary join predicate function - not -- just equi-join - (RJoinPredicate) Join :: TabLiteral -> TabExpr -> TabExprJoin -> RelationalOp -- | Left Join clause, based on an arbitrary join predicate function - not -- just equi-join - (RJoinPredicate) LJoin :: TabLiteral -> TabExpr -> TabExprJoin -> RelationalOp -- | Right Join clause, based on an arbitrary join predicate function - not -- just equi-join - (RJoinPredicate) RJoin :: TabLiteral -> TabExpr -> TabExprJoin -> RelationalOp -- | Full Outer Join clause, based on an arbitrary join predicate function -- - not just equi-join - (RJoinPredicate) FOJoin :: TabLiteral -> TabExpr -> TabExprJoin -> RelationalOp -- | Implements the semi-Join operation between two RTables (any type of -- join predicate is allowed) It returns the RTuples from the left -- RTable that match with the right RTable. Note that if an -- RTuple from the left RTable matches more than one -- RTuples from the right RTable the semi join operation -- will return only a single RTuple. SemiJoin :: TabLiteral -> TabExpr -> TabExprJoin -> RelationalOp SemiJoinP :: TabExpr -> TabLiteral -> TabExprJoin -> RelationalOp -- | Implements the anti-Join operation between two RTables (any type of -- join predicate is allowed) It returns the RTuples from the left -- RTable that DONT match with the right RTable. AntiJoin :: TabLiteral -> TabExpr -> TabExprJoin -> RelationalOp AntiJoinP :: TabExpr -> TabLiteral -> TabExprJoin -> RelationalOp -- | Intersection clause Intersect :: TabLiteral -> TabExpr -> RelationalOp -- | Union clause. Note this operation eliminates dublicate RTuples Union :: TabLiteral -> TabExpr -> RelationalOp -- | Union All clause. It is a Union operation without dublicate -- RTuple elimination. UnionAll :: TabLiteral -> TabExpr -> RelationalOp -- | Minus clause (set Difference operation) Minus :: TabLiteral -> TabExpr -> RelationalOp MinusP :: TabExpr -> TabLiteral -> RelationalOp -- | This is a generic unary operation on a RTable -- (UnaryRTableOperation). It is used to define an arbitrary unary -- operation on an RTable GenUnaryOp :: OnRTable -> ByGenUnaryOperation -> RelationalOp -- | This is a generic binary operation on a RTable -- (BinaryRTableOperation). It is used to define an arbitrary -- binary operation on an RTable GenBinaryOp :: TabLiteral -> TabExpr -> ByGenBinaryOperation -> RelationalOp -- | Order By clause. OrderBy :: [(ColumnName, OrderingSpec)] -> FromRTable -> RelationalOp -- | Delete operation. Deletes the RTuples from an RTable -- based on an RPredicate. Please note that this is an -- immutable implementation of an RTable update. This -- simply means that the delete operation returns a new RTable. -- So, the original RTable remains unchanged and no deletion -- in-place takes place whatsoever. Moreover, if we have multiple threads -- deleting an RTable, due to immutability, each thread "sees" its -- own copy of the RTable and thus there is no need for locking -- the deleted RTuples, as happens in a common RDBMS. Delete :: FromRTable -> ByDelPred -> RelationalOp -- | Update an RTable. Please note that this is an immutable -- implementation of an RTable update. This simply means that the -- update operation returns a new RTable that includes all the -- RTuples of the original RTable, both the ones that have -- been updated and the others that have not. So, the original -- RTable remains unchanged and no update in-place takes place -- whatsoever. Moreover, if we have multiple threads updating an -- RTable, due to immutability, each thread "sees" its own copy of -- the RTable and thus there is no need for locking the updated -- RTuples, as happens in a common RDBMS. Update :: TabExpr -> SetColumns -> ByPred -> RelationalOp -- | Insert Operation. It can insert into an RTable a single -- RTuple or a whole RTable. The latter is the equivalent -- of an INSERT INTO SELECT clause in SQL. Since, an -- RTable can be the result of a Julius expression (playing the -- role of a subquery within the Insert clause, in this case). Please -- note that this is an immutable implementation of an -- RTable insert. This simply means that the insert operation -- returns a new RTable and does not affect the original -- RTable. Also note that the source and target RTables -- should have the same structure. By "structure", we mean that the -- ColumnNames and the corresponding data types must match. -- Essentially what we record in the ColumnInfo must be the same -- for the two RTables. Otherwise a -- ConflictingRTableStructures exception will be thrown. Insert :: IntoClause -> RelationalOp -- | Upsert (Update+Insert, aka Merge) Operation. We provide a source -- RTable and a matching condition (RUpsertPredicate) to -- the RTuples of the target RTable. An RTuple from -- the target RTable might match to a single only RTuple in -- the source RTable, or not match at all. If it is matched to -- more than one RTuples then an exception -- (UniquenessViolationInUpsert)is thrown. When an RTuple -- from the target RTable is matched to a source RTuple, -- then the corresponding columns of the target RTuple are updated -- with the new values provided in the source RTuple. This takes -- place for the target RTuples that match but also that satisfy -- the input RPredicate. Thus we can restrict further with a -- filter the RTuples of the target RTable where the update -- will take place. Finally, the source RTuples that did not match -- to the target RTable, are inserted (appended) to the target -- RTable -- -- Please note that this is an immutable implementation of an -- RTable upsert. This simply means that the upsert operation -- returns a new RTable and does not affect the original -- RTable. Also note that the source and target RTables -- should have the same structure. By "structure", we mean that the -- ColumnNames and the corresponding data types must match. -- Essentially what we record in the ColumnInfo must be the same -- for the two RTables. Otherwise a -- ConflictingRTableStructures exception will be thrown. -- --
-- An Example: -- Source RTable: srcTab = -- Id | Msg | Other -- ----|---------------|------- -- 1 | "updated" |"a" -- 2 | "world2" |"a" -- 3 | "inserted" |"a" -- -- Target RTable: trgTab = -- Id | Msg | Other -- ----|---------------|------- -- 1 | "hello1" |"b" -- 2 | "world1" |"b" -- 4 | "old" |"b" -- 5 | "hello" |"b" -- -- juliusToRTable $ -- EtlMapStart -- :-> (EtlR $ -- ROpStart -- :.(Upsert $ -- MergeInto (Tab trgTab) $ -- Using (TabSrc srcTab) $ -- MergeOn (RUpsertPredicate ["Id"] (\t1 t2 -> t1 <!> "Id" == t2 <!> "Id")) $ -- merge condition: srcTab.Id == trgTab.Id -- WhenMatchedThen $ -- UpdateCols ["Msg"] $ -- FilterBy (\t -> let -- msg = case toText (t <!> "Msg") of -- Just t -> t -- Nothing -> pack "" -- in (take 5 msg) == (pack "hello") -- ) -- Msg like "hello%" -- ) -- ) -- -- Result RTable: -- Id | Msg | Other -- ----|---------------|------- -- 1 | "updated" |"b" -- Updated RTuple. Note that only column "Msg" has been overwritten, as per the UpdateCols subclause -- 2 | "world1" |"b" -- Not affected due to FilterBy predicate -- 3 | "inserted" |"a" -- Inserted RTuple -- 4 | "old" |"b" -- Not affected due to MergeOn condition -- 5 | "hello" |"b" -- Not affected due to MergeOn condition --Upsert :: MergeInto -> RelationalOp -- | Resembles the "FROM" clause in SQL. It defines the RTable on -- which the Relational Operation will be applied data FromRTable From :: TabExpr -> FromRTable -- | An Aggregate Operation Clause data Aggregate AggOn :: [AggOp] -> FromRTable -> Aggregate -- | These are the available aggregate operation clauses data AggOp Sum :: ColumnName -> AsColumn -> AggOp -- | Count aggregation (no distinct) Count :: ColumnName -> AsColumn -> AggOp -- | Count distinct aggregation (i.e., count(distinct col) in -- SQL). Returns the distinct number of values for this column. CountDist :: ColumnName -> AsColumn -> AggOp -- | Returns the number of RTuples in the RTable (i.e., -- count(*) in SQL) CountStar :: AsColumn -> AggOp Min :: ColumnName -> AsColumn -> AggOp Max :: ColumnName -> AsColumn -> AggOp -- | Average aggregation Avg :: ColumnName -> AsColumn -> AggOp -- | String aggregation StrAgg :: ColumnName -> AsColumn -> Delimiter -> AggOp -- | A custom aggregate operation GenAgg :: ColumnName -> AsColumn -> AggBy -> AggOp -- | Defines the name of the column that will hold the aggregate operation -- result. It resembles the "AS" clause in SQL. data AsColumn As :: ColumnName -> AsColumn -- | Julius Clause to provide a custom aggregation function data AggBy AggBy :: AggFunction -> AggBy -- | A grouping predicate clause. It defines an arbitrary function -- (RGroupPRedicate), which drives when two RTuples -- should belong in the same group. data GroupOnPred GroupOn :: RGroupPredicate -> GroupOnPred -- | This clause is used for expressions where we do not allow the use of -- the Previous value data TabLiteral TabL :: RTable -> TabLiteral -- | Join Predicate Clause. It defines when two RTuples should be -- paired. data TabExprJoin JoinOn :: RJoinPredicate -> TabExprJoin -- | It is used to define an arbitrary unary operation on an RTable data ByGenUnaryOperation ByUnaryOp :: UnaryRTableOperation -> ByGenUnaryOperation -- | It is used to define an arbitrary binary operation on an RTable data ByGenBinaryOperation ByBinaryOp :: BinaryRTableOperation -> ByGenBinaryOperation -- | Insert Into subclause data IntoClause Into :: TabExpr -> InsertSource -> IntoClause -- | Subclause on Insert clause. Defines the source of the insert -- operation. The Values branch is used for inserting a singl -- RTuple, while the RTuples branch is used for inserting -- a whole RTable, typically derived as the result of a Julius -- expression. The former is similar in concept with an INSERT INTO -- VALUES SQL clause, and the latter is similar in concept with an -- INSERT INTO SELECT SQL clause. data InsertSource Values :: ValuesClause -> InsertSource RTuples :: TabSource -> InsertSource -- | Subclause on Insert clause. Defines the source RTuple of -- the insert operation. type ValuesClause = [(ColumnName, RDataType)] -- | This subclause refers to the source RTable that will feed an -- Insert operation data TabSource TabSrc :: RTable -> TabSource -- | Merge Into subclause data MergeInto MergeInto :: TabExpr -> MergeSource -> MergeInto -- | Upsert source subclause (Using clause in SQL) data MergeSource Using :: TabSource -> MergeMatchCondition -> MergeSource -- | Upsert matching condition subclause data MergeMatchCondition MergeOn :: RUpsertPredicate -> WhenMatched -> MergeMatchCondition -- | When Matched subclause of Upsert data WhenMatched WhenMatchedThen :: UpdateColumns -> WhenMatched -- | Update columns subclause of Upsert data UpdateColumns UpdateCols :: [ColumnName] -> ByPred -> UpdateColumns -- | Evaluates (parses) the Julius exrpession and produces an -- ETLMapping. The ETLMapping is an internal representation -- of the Julius expression and one needs to combine it with the -- etl function, in order to evaluate the Julius expression into -- an RTable. This can be achieved directly with function -- juliusToRTable evalJulius :: ETLMappingExpr -> ETLMapping -- | Pure code to evaluate the "ETL-logic" of a Julius expression and -- generate the corresponding target RTable. -- -- The evaluation of a Julius expression (i.e., a ETLMappingExpr) -- to an RTable is strict. It evaluates fully to Normal Form (NF) as -- opposed to a lazy evaluation (i.e., only during IO), or evaluation to -- a WHNF. This is for efficiency reasons (e.g., avoid space leaks and -- excessive memory usage). It also has the impact that exceptions will -- be thrown at the same line of code that juliusToRTable is -- called. Thus one should wrap this call with a catch handler, or -- use eitherPrintRTable, or eitherPrintfRTable, if one -- wants to handle the exception gracefully. -- -- Example: -- --
-- do -- catch (printRTable $ juliusToRTable $ <a Julius expression> ) -- (\e -> putStrLn $ "There was an error in the Julius evaluation: " ++ (show (e::SomeException)) ) ---- -- Or, similarly -- --
-- do -- p <- (eitherPrintRTable printRTable $ -- juliusToRTable $ <a Julius expression> -- ) :: IO (Either SomeException ()) -- case p of -- Left exc -> putStrLn $ "There was an error in the Julius evaluation: " ++ (show exc) -- Right _ -> return () --juliusToRTable :: ETLMappingExpr -> RTable -- | Evaluate a Julius expression within the IO Monad. I.e., Effectful code -- to evaluate the "ETL-logic" of a Julius expression and generate the -- corresponding target RTable. -- -- The evaluation of a Julius expression (i.e., a ETLMappingExpr) -- to an RTable is strict. It evaluates fully to Normal Form (NF) as -- opposed to a lazy evaluation (i.e., only during IO), or evaluation to -- a WHNF. This is for efficiency reasons (e.g., avoid space leaks and -- excessive memory usage). It also has the impact that exceptions will -- be thrown at the same line of code that runJulius is called. -- Thus one should wrap this call with a catch handler, or use -- eitherRunJulius, if he wants to handle the exception -- gracefully. -- -- Example: -- --
-- do -- result <- catch (runJulius $ <a Julius expression>) -- (e -> do -- putStrLn $ "there was an error in Julius evaluation: " ++ (show (e::SomeException)) -- return emptyRTable -- ) --runJulius :: ETLMappingExpr -> IO RTable -- | Evaluate a Julius expression and return the corresponding target -- RTable or an exception. One can define custom exceptions to be -- thrown within a Julius expression. This function will catch any -- exceptions that are instances of the Exception type class. -- -- The evaluation of a Julius expression (i.e., a ETLMappingExpr) -- to an RTable is strict. It evaluates fully to Normal Form (NF) -- as opposed to a lazy evaluation (i.e., only during IO), or evaluation -- to a WHNF. This is for efficiency reasons (e.g., avoid space leaks and -- excessive memory usage). -- -- Example: -- --
-- do -- res <- (eitherRunJulius $ <a Julius expression>) :: IO (Either SomeException RTable) -- resultRTab <- case res of -- Right t -> return t -- Left exc -> do -- putStrLn $ "there was an error in Julius evaluation: " ++ (show exc) -- return emptyRTable --eitherRunJulius :: Exception e => ETLMappingExpr -> IO (Either e RTable) -- | Receives an input Julius expression, evaluates it to an ETL Mapping -- (ETLMapping) and executes it, in order to return an -- RTabResult containing an RTable storing the result of -- the ETL Mapping, as well as the number of RTuples returned juliusToResult :: ETLMappingExpr -> RTabResult -- | Evaluate a Julius expression within the IO Monad and return an -- RTabResult. runJuliusToResult :: ETLMappingExpr -> IO RTabResult -- | Evaluate a Julius expression within the IO Monad and return either an -- RTabResult, or an exception, in case of an error during -- evaluation. eitherRunJuliusToResult :: Exception e => ETLMappingExpr -> IO (Either e RTabResult) -- | Generic ETL execution function. It receives a list of input (aka -- "source") RTables and an ETL function that produces a list of -- output (aka "target") RTables. The ETL function should embed -- all the "transformation-logic" from the source RTables to the -- target RTables. runETL :: ([RTable] -> [RTable]) -> [RTable] -> IO [RTable] -- | Generic ETL execution function that returns either the target list of -- RTables, or an exception in case of a problem during the ETL -- code execution. It receives a list of input (aka "source") -- RTables and an ETL function that produces a list of output (aka -- "target") RTables. The ETL function should embed all the -- "transformation-logic" from the source RTables to the target -- RTables. eitherRunETL :: Exception e => ([RTable] -> [RTable]) -> [RTable] -> IO (Either e [RTable]) -- | Returns a prefix of an ETLMappingExpr that matches a named -- intermediate result. For example, below we show a Julius expression -- where we define an intermediate named result called "myResult". This -- result, is used at a later stage in this Julius expression, with the -- use of the function takeNamedResult. -- --
-- etlXpression = -- EtlMapStart -- :-> (EtlC $ ...) -- :=> NamedResult "myResult" (EtlR $ ...) -- :-> (EtlR $ ... ) -- :-> (EtlR $ -- ROpStart -- :. (Minus -- (TabL $ -- juliusToRTable $ takeNamedResult "myResult" etlXpression -- THIS IS THE POINT WHERE WE USE THE NAMED RESULT! -- ) -- (Previous)) -- ) ---- -- In the above Julius expression (etlXpresion) the "myResult" named -- result equals to the prefix of the etlXpresion, up to the operation -- (included) with the named result "myResult". -- --
-- -- takeNamedResult "myResult" etlXpression == EtlMapStart -- :-> (EtlC $ ...) -- :=> NamedResult "myResult" (EtlR $ ...) ---- -- Note that the julius expression is scanned from right to left and thus -- it will return the longest prefix expression that matches the input -- name takeNamedResult :: NamedResultName -> ETLMappingExpr -> ETLMappingExpr -- | Returns an UnaryRTableOperation (RTable -> -- RTable) that adds a surrogate key (SK) column to an -- RTable and fills each row with a SK value. It primarily is -- intended to be used within a Julius expression. For example: -- --
-- GenUnaryOp (On Tab rtab1) $ ByUnaryOp (addSurrogateKeyJ TxSK 0) --addSurrogateKeyJ :: Integral a => ColumnName -> a -> RTable -> RTable -- | Returns a BinaryRTableOperation (RTable -> -- RTable -> RTable) that Appends an RTable to a -- target RTable. It is primarily intended to be used within a -- Julius expression. For example: -- --
-- GenBinaryOp (TabL rtab1) (Tab $ rtab2) $ ByBinaryOp appendRTableJ --appendRTableJ :: RTable -> RTable -> RTable -- | Returns an ETLOperation that adds a surrogate key (SK) column -- to an RTable and fills each row with a SK value. This function -- is only exposed for backward compatibility reasons. The recommended -- function to use instead is addSurrogateKeyJ, which can be -- embedded directly into a Julius expression as a -- UnaryRTableOperation. addSurrogateKey :: Integral a => ColumnName -> a -> ETLOperation -- | Returns an ETLOperation that Appends an RTable to a -- target RTable This function is only exposed for backward -- compatibility reasons. The recommended function to use instead is -- appendRTableJ, which can be embedded directly into a Julius -- expression as a BinaryRTableOperation. appendRTable :: ETLOperation -- | This module implements the RTabular instance of the CSV -- data type, i.e., implements the interface by which a CSV file can be -- transformed to/from an RTable. It is required when we want to -- do ETL/ELT over CSV files with the DBFunctor package (i.e., -- with the Julius EDSL for ETL/ELT found in the Etl.Julius -- module). -- -- The minimum requirement for implementing an RTabular instance -- for a data type is to implement the toRTable and -- fromRTable functions. Apart from these two functions, this -- module also exports functions for reading and writing CSV data -- from/to CSV files. Also it supports all types of delimiters (not only -- commas) and CSVs with or without headers. (see CSVOptions) -- -- For the CSV data type this module uses the Cassava library -- (Data.Csv) module RTable.Data.CSV -- | Definition of a CSV file. Treating CSV data as opaque byte strings newtype CSV CSV :: Vector Row -> CSV [csv] :: CSV -> Vector Row -- | Definition of a CSV Row. Essentially a Row is just a Vector of -- ByteString type Row = Vector Column -- | Definition of a CSV column. type Column = Field -- | Options for a CSV file (e.g., delimiter specification, header -- specification etc.) data CSVOptions CSVOptions :: Char -> YesNo -> CSVOptions [delimiter] :: CSVOptions -> Char [hasHeader] :: CSVOptions -> YesNo -- | Yes or No sum type data YesNo Yes :: YesNo No :: YesNo -- | reads a CSV file and returns a CSV data type (Treating CSV data -- as opaque byte strings) readCSV :: FilePath -> IO CSV -- | reads a CSV file based on input options (delimiter and header option) -- and returns a CSV data type (Treating CSV data as opaque byte -- strings) readCSVwithOptions :: CSVOptions -> FilePath -> IO CSV -- | reads a CSV file and returns a lazy bytestring readCSVFile :: FilePath -> IO ByteString -- | write a CSV to a newly created csv file writeCSV :: FilePath -> CSV -> IO () -- | write a CSV (bytestring) to a newly created csv file writeCSVFile :: FilePath -> ByteString -> IO () toRTable :: RTabular a => RTableMData -> a -> RTable fromRTable :: RTabular a => RTableMData -> RTable -> a -- | print input CSV on screen printCSV :: CSV -> IO () -- | print input CSV on screen printCSVFile :: ByteString -> IO () -- | copy input csv file to specified output csv file copyCSV :: FilePath -> FilePath -> IO () -- | selectNrows: Returns the first N rows from a CSV file selectNrows :: Int -> CSV -> CSV -- | Column projection on an input CSV file where desired columns are -- defined by position (index) in the CSV. projectByIndex :: [Int] -> CSV -> CSV -- | O(1) First row headCSV :: CSV -> Row -- | O(1) Yield all but the first row without copying. The CSV may not be -- empty. tailCSV :: CSV -> CSV -- | creates a Header (as defined in Data.Csv) from an -- RTable csvHeaderFromRtable :: RTable -> Header -- | Exception to signify an error in decoding a CSV file into a CSV -- data type data CsvFileDecodingError CsvFileDecodingError :: FilePath -> Text -> CsvFileDecodingError -- | This exception signifies an error in parsing a CSV -- Column to an RDataType value data CSVColumnToRDataTypeError CSVColumnToRDataTypeError :: ColumnName -> Text -> CSVColumnToRDataTypeError instance GHC.Show.Show RTable.Data.CSV.CsvFileDecodingError instance GHC.Classes.Eq RTable.Data.CSV.CsvFileDecodingError instance GHC.Show.Show RTable.Data.CSV.CSVColumnToRDataTypeError instance GHC.Classes.Eq RTable.Data.CSV.CSVColumnToRDataTypeError instance GHC.Exception.Type.Exception RTable.Data.CSV.CSVColumnToRDataTypeError instance GHC.Exception.Type.Exception RTable.Data.CSV.CsvFileDecodingError instance RTable.Core.RTabular RTable.Data.CSV.CSV