{-| Module : CSVdb.Base Description : Implements 'RTable' over CSV (TSV, or any other delimiter) files logic Copyright : (c) Nikos Karagiannidis, 2018 License : BSD3 Maintainer : nkarag@gmail.com Stability : stable Portability : POSIX This module implements the 'RTabular' instance of the 'CSV' data type, i.e., implements the interface by which a CSV file can be transformed to/from an 'RTable'. It is required when we want to do ETL\/ELT over CSV files with the "DBFunctor" package (i.e., with the __Julius__ EDSL for ETL/ELT found in the "Etl.Julius" module). The minimum requirement for implementing an 'RTabular' instance for a data type is to implement the 'toRTable' and 'fromRTable' functions. Apart from these two functions, this module also exports functions for reading and writing 'CSV' data from/to CSV files. Also it supports all types of delimiters (not only commas) and CSVs with or without headers. (see 'CSVOptions') For the 'CSV' data type this module uses the Cassava library ("Data.Csv") -} {-# LANGUAGE OverloadedStrings #-} -- :set -XOverloadedStrings {-# LANGUAGE RecordWildCards #-} {-# LANGUAGE TypeSynonymInstances #-} {-# LANGUAGE FlexibleInstances #-} {-# LANGUAGE NoMonomorphismRestriction #-} module RTable.Data.CSV ( -- * The CSV data type CSV (..) ,Row ,Column ,CSVOptions(..) ,YesNo (..) -- * Read/Write CSV ,readCSV ,readCSVwithOptions ,readCSVFile ,writeCSV ,writeCSVFile -- * CSV as Tabular data ,toRTable ,fromRTable -- * CSV I/O ,printCSV ,printCSVFile -- * Basic CSV processing ,copyCSV ,selectNrows ,projectByIndex ,headCSV ,tailCSV -- * Misc ,csvHeaderFromRtable -- * Exceptions ,CsvFileDecodingError (..) ,CSVColumnToRDataTypeError (..) 
) where import Debug.Trace import RTable.Core -- CSV-conduit --import qualified Data.CSV.Conduit as CC -- http://hackage.haskell.org/package/csv-conduit , https://www.stackage.org/haddock/lts-6.27/csv-conduit-0.6.6/Data-CSV-Conduit.html -- Cassava (CSV parsing Library) -- https://github.com/hvr/cassava -- https://www.stackbuilders.com/tutorials/haskell/csv-encoding-decoding/ -- https://www.stackage.org/lts-7.15/package/cassava-0.4.5.1 -- https://hackage.haskell.org/package/cassava-0.4.5.1/docs/Data-Csv.html import qualified Data.Csv as CV -- HashMap -- https://hackage.haskell.org/package/unordered-containers-0.2.7.2/docs/Data-HashMap-Strict.html import qualified Data.HashMap.Strict as HM -- Data.List import Data.List (map) -- ByteString import qualified Data.ByteString.Lazy as BL import qualified Data.ByteString as BS import Data.ByteString.Char8 (pack,unpack) -- as BSW --(pack) import Prelude hiding (putStr) import Data.ByteString.Lazy.Char8 (putStr)--as BLW -- Text import Data.Text as T import Data.Text.Encoding (decodeUtf8, encodeUtf8, decodeUtf8', decodeUtf16LE) -- Vector import qualified Data.Vector as V -- Data.Maybe import Data.Maybe (fromJust) -- Data.Serialize (Cereal package) -- https://hackage.haskell.org/package/cereal -- https://hackage.haskell.org/package/cereal-0.5.4.0/docs/Data-Serialize.html -- http://stackoverflow.com/questions/2283119/how-to-convert-a-integer-to-a-bytestring-in-haskell import Data.Serialize (decode, encode) -- Typepable -- https://hackage.haskell.org/package/base-4.9.1.0/docs/Data-Typeable.html -- http://stackoverflow.com/questions/6600380/what-is-haskells-data-typeable -- http://alvinalexander.com/source-code/haskell/how-determine-type-object-haskell-program import qualified Data.Typeable as TB --(typeOf, Typeable) import Data.Either.Combinators (fromRight') import Data.Char (ord) import Text.Printf (printf) import Control.Exception {-- -- Example code from: https://github.com/hvr/cassava {-- Sample CSV: name,salary John 
Doe,50000
Jane Doe,60000
--}

data Person = Person
    { name   :: !String
    , salary :: !Int
    }

instance FromNamedRecord Person where
    parseNamedRecord r = Person <$> r .: "name" <*> r .: "salary"

main :: IO ()
main = do
    csvData <- BL.readFile "salaries.csv"
    case decodeByName csvData of
        Left err -> putStrLn err
        Right (_, v) -> V.forM_ v $ \ p ->
            putStrLn $ name p ++ " earns " ++ show (salary p) ++ " dollars"
--}

-- ##################################################
-- *  Data Types
-- ##################################################

{-data MyType = MyType Int
instance RTabular MyType where
    toRTable md t = emptyRTable
    fromRTable mf rt = MyType (5::Int)-}

-- | Definition of a CSV file.
-- Treating CSV data as opaque byte strings.
-- A 'CSV' is a newtype wrapper around a vector of rows; the 'csv' field accessor
-- unwraps it back to the underlying @V.Vector Row@.
newtype CSV = CSV {csv :: V.Vector Row}
-- type CSV = V.Vector Row -- i.e., CV.Csv

-- | CSV data are \"Tabular\" data thus implement the 'RTabular' interface.
-- Both class methods simply delegate to the conversion functions of this module.
instance RTabular CSV where
    toRTable = csvToRTable
    fromRTable = rtableToCSV

-- | Definition of a CSV Row.
-- Essentially a Row is just a Vector of ByteString
type Row = V.Vector Column -- i.e., CV.Record

-- | Definition of a CSV column.
type Column = CV.Field

-- This typeclass instance is required by CV.decodeByName
--instance CV.FromNamedRecord (V.Vector BS.ByteString)

-- ##################################################
-- *  IO operations
-- ##################################################

-- | Reads a CSV file and returns its contents as a lazy bytestring
-- (no decoding takes place).
readCSVFile ::
       FilePath          -- ^ the CSV file
    -> IO BL.ByteString  -- ^ the output CSV
readCSVFile = BL.readFile

-- Turns the outcome of a Cassava decoding step into a 'CSV', raising
-- 'CsvFileDecodingError' (tagged with the file path) on a decoding failure.
-- 'throwIO' is used instead of 'throw' so that the exception is raised while the
-- reading action runs, and not later when the (lazy) result happens to be forced.
decodeResultOrThrow ::
       FilePath
    -> Either String (V.Vector Row)
    -> IO CSV
decodeResultOrThrow f =
    either (throwIO . CsvFileDecodingError f . T.pack) (return . CSV)

-- | Reads a CSV file and returns a 'CSV' data type (treating CSV data as opaque byte strings).
-- The file is decoded with Cassava's default decoding options and 'CV.HasHeader'.
--
-- Throws 'CsvFileDecodingError' if the contents of the file cannot be decoded.
readCSV ::
       FilePath  -- ^ the CSV file
    -> IO CSV    -- ^ the output CSV type
readCSV f = do
    csvData <- BL.readFile f
    decodeResultOrThrow f (CV.decode CV.HasHeader csvData)

-- | Yes or No sum type
data YesNo = Yes | No

-- | Options for a CSV file (e.g., delimiter specification, header specification etc.)
data CSVOptions = CSVOptions {
        delimiter  :: Char     -- ^ the field delimiter character
        ,hasHeader :: YesNo    -- ^ whether the first line of the file is a header
    }

-- | Reads a CSV file based on input options (delimiter and header option) and returns
-- a 'CSV' data type (treating CSV data as opaque byte strings).
--
-- Throws 'CsvFileDecodingError' if the contents of the file cannot be decoded.
readCSVwithOptions ::
       CSVOptions
    -> FilePath  -- ^ the CSV file
    -> IO CSV    -- ^ the output CSV type
readCSVwithOptions opt f = do
    csvData <- BL.readFile f
    decodeResultOrThrow f (CV.decodeWith decodeOpts headerMode csvData)
  where
    -- Cassava expects the delimiter as a byte, hence the conversion of the Char code
    decodeOpts = CV.defaultDecodeOptions {
                    CV.decDelimiter = fromIntegral $ ord (delimiter opt)
                 }
    headerMode = case hasHeader opt of
                    Yes -> CV.HasHeader
                    No  -> CV.NoHeader

-- | Write a CSV (bytestring) to a newly created csv file
writeCSVFile ::
       FilePath       -- ^ the csv file to be created
    -> BL.ByteString  -- ^ input CSV
    -> IO()
writeCSVFile f bytes = BL.writeFile f bytes

-- | Write a 'CSV' to a newly created csv file
writeCSV ::
       FilePath  -- ^ the csv file to be created
    -> CSV       -- ^ input 'CSV'
    -> IO()
writeCSV f (CSV rows) = BL.writeFile f (CV.encode (V.toList rows))

-- | Print input CSV on screen
printCSVFile ::
       BL.ByteString  -- ^ input CSV to be printed on screen
    -> IO()
printCSVFile = putStr

-- | Print input 'CSV' on screen
printCSV ::
       CSV  -- ^ input 'CSV' to be printed on screen
    -> IO()
printCSV (CSV rows) = putStr (CV.encode (V.toList rows))

-- | Copy input csv file to specified output csv file.
-- The copy is performed by 'readCSV' followed by 'writeCSV'.
-- NOTE(review): 'readCSV' decodes with 'CV.HasHeader', so the header line of the
-- input file is presumably not part of the output file - confirm.
copyCSV ::
       FilePath  -- ^ input csv file
    -> FilePath  -- ^ output csv file
    -> IO()
copyCSV fi fo = readCSV fi >>= writeCSV fo

--
-- ##################################################
-- *  CSV to RTable integration
-- ##################################################

-- | csvToRTable: Creates an RTable from a CSV and a set of RTable Metadata.
-- The RTable metadata essentially defines the data type of each column so as to
-- call the appropriate data constructor of RDataType and turn the ByteString values of CSV to RDataTypes values of RTable
-- We assume that the order of the columns in the CSV is identical with the order of the columns in the RTable metadata
--
-- An empty CSV field becomes 'Null'. A field that cannot be parsed into the data type
-- dictated by the metadata causes a 'CSVColumnToRDataTypeError' (raised lazily, i.e., when
-- the corresponding value is actually evaluated).
csvToRTable ::
       RTableMData
    -> CSV
    -> RTable
csvToRTable m (CSV c) = V.map row2RTuple c
  where
    -- The column descriptions depend only on the metadata, so they are computed
    -- once here and shared by all rows.
    -- The order of both lists corresponds to the fixed column order of the metadata,
    -- which is assumed to be identical to the CSV column order.
    colInfos = toListColumnInfo (rtuplemdata m)
    colNames = toListColumnName (rtuplemdata m)

    -- Pairs each field of the row with its column description (by position),
    -- converts it, and then keys the resulting values by column name.
    row2RTuple :: Row -> RTuple
    row2RTuple row =
        HM.fromList $
            Prelude.zip colNames (Prelude.zipWith column2RDataType colInfos (V.toList row))

    -- Converts a single CSV field into the RDataType dictated by its column description.
    column2RDataType :: ColumnInfo -> Column -> RDataType
    column2RDataType ci col
        | BS.null col = Null  -- this is an empty ByteString
        | otherwise =
            case dtype ci of
                Integer       -> RInt (parsed :: Integer)
                Varchar       -> RText (parsed :: T.Text)
                Date fmt      -> RDate { rdate = (parsed :: T.Text), dtformat = fmt }
                -- timestamps are not parsed by Cassava but by createRTimestamp, from the raw characters
                Timestamp fmt -> RTime $ createRTimestamp (T.unpack fmt) (Data.ByteString.Char8.unpack col)
                Double        -> RDouble (parsed :: Double)
      where
        -- Use Data.Csv parsing capabilities in order to turn a Column (i.e. a Field, i.e., a ByteString)
        -- into a known data type (CV.parseField :: Field -> Parser a).
        -- The explicit polymorphic signature lets each branch above pick its own result type.
        parsed :: CV.FromField a => a
        parsed =
            case CV.runParser (CV.parseField col) of
                Left str -> throw $ CSVColumnToRDataTypeError (name ci) $ T.pack str
                Right v  -> v

-- | rtableToCSV : Returns a CSV from an RTable
-- The first line of the CSV will be the header line, taken from the RTable metadata.
-- Note that the driver for creating the output CSV file is the input RTableMData describing the columns and RDataTypes of each RTuple.
-- This means, that if the RTableMData include a subset of the actual columns of the input RTable, then no error will occur and the
-- output CSV will include only this subset.
-- In the same token, if in the RTableMData there is a column name that is not present in the input RTable, then an error will occur.
-- An empty RTuple is turned into an empty 'Row'.
-- Double values are written with two decimal digits and timestamps as @DD/MM/YYYY HH:MI:SS@.
rtableToCSV ::
       RTableMData  -- ^ input RTable metadata describing the RTable
    -> RTable       -- ^ input RTable
    -> CSV          -- ^ output CSV
rtableToCSV m t = CSV $ V.cons headerRow (V.map rtuple2row t)
  where
    -- The column descriptions in the fixed column order defined by the metadata
    colInfos = toListColumnInfo (rtuplemdata m)

    -- The header line: the column names of the metadata, in metadata order
    headerRow :: Row
    headerRow = V.fromList $ Prelude.map CV.toField (toListColumnName (rtuplemdata m))

    -- Converts an RTuple into a CSV row.
    -- The values are looked up by column name, following the metadata order, so the
    -- correct value always ends up in the correct column (HM.toList would not guarantee this).
    -- Mapping over the column descriptions is total, i.e., it also covers metadata without columns.
    rtuple2row :: RTuple -> Row
    rtuple2row rt
        -- check that the RTuple is not empty. Otherwise the HM.! operator will cause an exception
        | isRTupEmpty rt = V.empty
        | otherwise =
            V.fromList $ Prelude.map (\ci -> rDataType2Column (rt HM.! name ci)) colInfos

    -- Converts a single value into a CSV field via CV.toField (toField :: a -> Field)
    rDataType2Column :: RDataType -> Column
    rDataType2Column rdt =
        case rdt of
            RInt i            -> CV.toField i
            RText txt         -> CV.toField txt
            RDate {rdate = d} -> CV.toField d
            RDouble db        -> CV.toField ((printf "%.2f" db)::String)
            RTime { rtime = RTimestampVal {year = y, month = mo, day = d, hours24 = h, minutes = mi, seconds = s} } ->
                CV.toField $ T.concat
                    [ twoDigits d, T.pack "/", twoDigits mo, T.pack "/", twoDigits y
                    , T.pack " "
                    , twoDigits h, T.pack ":", twoDigits mi, T.pack ":", twoDigits s
                    ]
            Null              -> CV.toField (""::T.Text)

    -- Renders a number as text, left-padding with a "0" when it is not greater than 9
    twoDigits :: Int -> T.Text
    twoDigits n
        | n > 9     = shown
        | otherwise = T.pack "0" `mappend` shown
      where
        shown = T.pack (show n)

-- In order to be able to decode a CSV bytestring into an RTuple,
-- we need to make Rtuple an instance of the FromNamedRecord typeclass and
-- implement the parseNamesRecord function. But this is not necessary, since there is already an instance for CV.FromNamedRecord (HM.HashMap a b), which is the same,
-- since an RTuple is a HashMap.
-- -- type RTuple = HM.HashMap ColumnName RDataType -- type ColumnName = String -- data RDataType = -- RInt { rint :: Int } -- | RChar { rchar :: Char } -- | RText { rtext :: T.Text } -- | RString {rstring :: [Char]} -- | RDate { -- rdate :: String -- ,dtformat :: String -- ^ e.g., "DD/MM/YYYY" -- } -- | RDouble { rdouble :: Double } -- | RFloat { rfloat :: Float } -- | Null -- deriving (Show, Eq) -- -- -- parseNamedRecord :: NamedRecord -> Parser a -- type NamedRecord = HashMap ByteString ByteString -- -- Instance of class FromNamedRecord: -- (Eq a, FromField a, FromField b, Hashable a) => FromNamedRecord (HashMap a b) -- -- From this we understand that we need to make RDataType (which is "b" in HashMap a b) an instance of FormField ((CV.FromField RDataType)) by implementing parseField -- where: -- @ -- parseField :: Field -> Parser a -- type Field = ByteString -- @ {--instance CV.FromNamedRecord RTuple where parseNamedRecord r = do let listOfcolNames = map (fst) $ HM.toList r -- get the first element of each pair which is the name of the column (list of ByteStrings) listOfParserValues = map (\c -> r CV..: c) listOfcolNames -- this retuns a list of the form [Parser RDataType] listOfValues = map (\v -> right (CV.runParser v)) listOfParserValues -- this returns a list of the form [RDataType] rtup = createRtuple $ zip listOfcolNames listOfValues return rtup --} -- Necessary instance in order to convert a CSV file column value to an 'RDataType' value. {-instance CV.FromField RDataType where parseField dt = do -- dt is a ByteString (i.e., a Field) representing some value that we have read from the CSV file (we dont know its type) -- we need to construct an RDataType from this value and then wrap it into a Parser Monad and return it -- -- ### Note: the following line does not work ### -- 1. parse the input ByteString using Cassavas' parsing capabilities for known data types -- val <- CV.parseField dt -- 1. We dont know the type of dt. 
OK lets wrap it into a generic type, that of Data.Typeable.TypeRep let valTypeRep = TB.typeOf dt -- 2. wrap this value into a RDataType let rdata = createRDataType valTypeRep --val -- wrap the RDataType into a Parser Monad and return it pure rdata -} {-- -- #### NOTE ### -- -- if the following does not work (val is always a String, then try to use Data.Serialize.decode instead in order to get the original value from a bytestring) -- get the value inside the Parser Monad (FromField has instances from all common haskell data types) let val = case CV.runParser (CV.parseField dt) of -- runParser :: Parser a -> Either String a Left e -> e Right v -> v --} -- Lets try to use Data.Serialize.decode to get the vlaue from the bytestring (decode :: Serialize a => ByteString -> Either String a) {-- let val = case decode dt of Left e -> e Right v -> v -- wrap this value into a RDataType let rdata = createRDataType val -- wrap the RDataType into a Parser Monad pure rdata --} -- In order to encode an input RTable into a CSV bytestring -- we need to make Rtuple an instance of the ToNamedRecord typeclass and -- implement the toNamedRecord function. -- Where: -- -- @ -- toNamedRecord :: a -> NamedRecord -- type NamedRecord = HashMap ByteString ByteString -- -- namedRecord :: [(ByteString, ByteString)] -> NamedRecord -- Construct a named record from a list of name-value ByteString pairs. Use .= to construct such a pair from a name and a value. -- -- (.=) :: ToField a => ByteString -> a -> (ByteString, ByteString) -- @ -- -- In our case, we dont need to do this because an RTuple is just a synonym for HM.HashMap ColumnName RDataType and the data type HashMap a b is -- already an instance of ToNamedRecord. 
-- -- Also we need to make RDataType an instance of ToField ((CV.ToField RDataType)) by implementing toField, so as to be able -- to convert an RDataType into a ByteString -- where: -- -- @ -- toField :: a -> Field -- type Field = ByteString -- @ -- {-instance CV.ToField RDataType where toField rdata = case rdata of RInt i -> encode (i::Integer) --RChar c -> encode (c::Char) -- RText t -> encode (t::String) RText t -> encodeUtf8 t -- encodeUtf8 :: Text -> ByteString --RString s -> encode (s::String) --RFloat f -> encode (f::Float) RDouble d -> encode (d::Double) Null -> encode (""::String) RDate d f -> encodeUtf8 d -- encode (d::String) -} -- csv2rtable : turn a input CSV to an RTable. -- The input CSV will be a ByteString. We assume that the first line is the CSV header, -- including the Column Names. The RTable that will be created will have as column names the headers appearing -- in the first line of the CSV. -- Internally we use CV.decodeByName to achieve this decoding -- where: -- @ -- decodeByName -- :: FromNamedRecord a -- => ByteString -- -> Either String (Header, Vector a) -- @ -- Essentially, decodeByName will return a @Vector RTuples@ -- -- In order to be able to decode a CSV bytestring into an RTuple, -- we need to make Rtuple an instance of the FromNamesRecrd typeclass and -- implement the parseNamesRecord function. But this is not necessary, since there is already an instance for CV.FromNamedRecord (HM.HashMap a b), which is the same, -- since an RTuple is a HashMap. 
-- Also we need to make RDataType an instance of FormField ((CV.FromField RDataType)) by implementing parseField -- where: -- @ -- parseField :: Field -> Parser a -- type Field = ByteString -- @ -- See RTable module for these instance {-csv2rtable :: BL.ByteString -- ^ input CSV (we asume that this CSV has a header in the 1st line) -> RTable -- ^ output RTable csv2rtable csv = case CV.decodeByName csv of Left e -> emptyRTable Right (h, v) -> v -} -- rtable2csv: encode an RTable into a CSV bytestring -- The first line of the CSV will be the header, which compirses of the column names. -- -- Internally we use CV.encodeByName to achieve this decoding -- where: -- @ -- encodeByName :: ToNamedRecord a => Header -> [a] -> ByteString -- Efficiently serialize CSV records as a lazy ByteString. The header is written before any records and dictates the field order. -- -- type Header = Vector Name -- type Name = ByteString -- @ -- -- In order to encode an input RTable into a CSV bytestring -- we need to make Rtuple an instance of the ToNamedRecord typeclass and -- implement the toNamedRecord function. -- Where: -- @ -- toNamedRecord :: a -> NamedRecord -- type NamedRecord = HashMap ByteString ByteString -- -- namedRecord :: [(ByteString, ByteString)] -> NamedRecord -- Construct a named record from a list of name-value ByteString pairs. Use .= to construct such a pair from a name and a value. -- -- (.=) :: ToField a => ByteString -> a -> (ByteString, ByteString) -- @ -- In our case, we dont need to do this because an RTuple is just a synonym for HM.HashMap ColumnName RDataType and the data type HashMap a b is -- already an instance of ToNamedRecord. 
--
-- Also we need to make RDataType an instance of ToField ((CV.ToField RDataType)) by implementing toField, so as to be able
-- to convert an RDataType into a ByteString
-- where:
-- @
--      toField :: a -> Field
--      type Field = ByteString
-- @
-- See 'RTable' module for these instance
{-rtable2csv ::
       RTable         -- ^ input RTable
    -> BL.ByteString  -- ^ Output ByteString
rtable2csv rtab =
    CV.encodeByName (csvHeaderFromRtable rtab) (V.toList rtab)
-}

-- | Creates a 'Data.Csv.Header' (as defined in "Data.Csv") from an 'RTable'.
-- The column names are taken from the first tuple of the table and are UTF-8 encoded,
-- so that each header name consists of exactly the bytes of the column name.
-- The order of the names is the one returned by @HM.keys@.
-- An empty 'RTable' yields an empty header.
csvHeaderFromRtable ::
       RTable
    -> CV.Header
csvHeaderFromRtable rtab
    | V.null rtab = V.empty
    | otherwise   =
        -- just get any tuple, e.g., the 1st one, and turn its column names into ByteStrings
        V.fromList $ Prelude.map encodeUtf8 (HM.keys (V.head rtab))

-- ##################################################
-- *  Vector operations on CSV
-- ##################################################

-- | O(1) First row. The CSV may not be empty.
headCSV :: CSV -> Row
headCSV = V.head . csv

-- | O(1) Yield all but the first row without copying. The CSV may not be empty.
tailCSV :: CSV -> CSV
tailCSV = CSV . V.tail . csv

-- ##################################################
-- *  DDL on CSV
-- ##################################################

-- ##################################################
-- *  DML on CSV
-- ##################################################

-- ##################################################
-- *  Filter, Join, Projection
-- ##################################################

-- | selectNrows: Returns the first N rows from a CSV file
selectNrows ::
       Int  -- ^ Number of rows to select
    -> CSV  -- ^ Input csv
    -> CSV  -- ^ Output csv
selectNrows n icsv = CSV $ V.take n (csv icsv)

{-selectNrows::
       Int            -- ^ Number of rows to select
    -> BL.ByteString  -- ^ Input csv
    -> BL.ByteString  -- ^ Output csv
selectNrows n csvi =
    let rtabi = csv2rtable csvi
        rtabo = limit n rtabi -- restrictNrows n rtabi
    in rtable2csv rtabo
-}

-- | Column projection on an input CSV file where
-- desired columns are defined by position (index)
-- in the CSV.
-- The rows of the output appear in the same order as in the input, and the columns
-- of each row follow the order of the input index list.
-- An index that does not exist in a row results in an error (see @V.!@).
projectByIndex ::
       [Int]  -- ^ input list of column indexes
    -> CSV    -- ^ input csv
    -> CSV    -- ^ output CSV
projectByIndex inds (CSV icsv) = CSV $ V.map projectRow icsv
  where
    -- construct new row including only projected columns
    projectRow :: Row -> Row
    projectRow row = V.fromList $ Prelude.map (\i -> row V.! i) inds

-- ##### Exceptions Definitions

-- | Exception to signify an error in decoding a CSV file into a 'CSV' data type.
-- It carries the path of the file and the decoding error message.
data CsvFileDecodingError = CsvFileDecodingError FilePath Text deriving(Eq,Show)

instance Exception CsvFileDecodingError

-- | This exception signifies an error in parsing a 'CSV' 'Column' to an 'RDataType' value.
-- It carries the name of the column and the parsing error message.
data CSVColumnToRDataTypeError = CSVColumnToRDataTypeError ColumnName Text deriving(Eq,Show)

instance Exception CSVColumnToRDataTypeError