src/Data/Cassava/Records.hs

{-# LANGUAGE TemplateHaskell #-}
{-# LANGUAGE OverloadedStrings #-}

{-|
Module      : Data.Cassava.Records
Description : Uses Template Haskell to automatically create Record
              types by inferring the field types from the provided csv or tab separated file.
Copyright   : (c) Guru Devanla 2018
License     : MIT
Maintainer  : grdvnl@gmail.com
Stability   : experimental


This module provides an easy way to explore input files that may have numerous columns
by helping to create a Record type whose field types are guessed from the data. That
information can be used as is, or persisted to a file so that other customizations can be performed.
-}

module Data.Cassava.Records
  (
    -- ** Creating Record types
    -- $makeCsvRecord
    makeCsvRecord
    -- $commaOptions
  , commaOptions
    -- $tabOptions
  , tabOptions
  -- $makeInstance
  , makeInstance
  -- $loadData
  , loadData
  )
where

import Control.Monad
import Language.Haskell.TH
import Language.Haskell.TH.Syntax
import qualified Data.ByteString as BL
import qualified Data.ByteString.Lazy as BLZ
import qualified Data.ByteString.Char8 as BC
import Data.Csv.Parser as CP
import qualified Data.Csv as Csv
import qualified Data.Vector as V
import Data.List as L
import Data.HashMap.Strict as H
import Data.Csv hiding(Name)
import Data.Attoparsec.ByteString as P
import Data.Attoparsec.Text as AT
import Data.String
import Text.Read
import qualified Data.Char as DC
import GHC.Generics (Generic)
import Data.Text as DT
import qualified Data.Text.Encoding as DTE
import Data.Data

import Data.Cassava.Internal.RecordBuilder


-- | Cassava generic 'Options' used by the instances that 'makeInstance'
-- generates: a single leading underscore is dropped from each record field
-- name, and every other name is passed through unchanged.
defaultFieldNameOptions :: Options
defaultFieldNameOptions = defaultOptions { fieldLabelModifier = dropLeadingUnderscore }
  where
    -- The label is already a 'String', so it is matched directly instead of
    -- being round-tripped through 'DT.Text'; only the first character matters.
    dropLeadingUnderscore :: String -> String
    dropLeadingUnderscore ('_' : rest) = rest
    dropLeadingUnderscore label = label

{-| Convenience function that creates the default instances required by
Cassava. The generated instances assume that field names are prefixed with "_".

For example, if the column headers in the input file are upper case or mixed case,
the names will not directly match the field names in the record. In that case,
explicit instances have to be provided manually, with the field modifiers adjusted accordingly.

For example, if all the column headers in the input file are upper case while
the record field names are all lower case, the @defaultFieldNameOptions@ definition
would look like this:

@

defaultFieldNameOptions :: Options
defaultFieldNameOptions = defaultOptions { fieldLabelModifier = rmUnderscore }
  where
    rmUnderscore ('_':str) = DT.unpack . DT.toUpper . DT.pack $ str
    rmUnderscore str = str
@

Note the @DT.toUpper@ call, which converts the field names to upper case before they
are compared with the names in a 'NamedRecord'.

-}
makeInstance :: String -- ^ name of the record type that should receive the instances
             -> DecsQ
makeInstance recordName =
  [d|
    -- All three instances use the same generic options, so encoding,
    -- decoding and header ordering see the same field names.
    instance ToNamedRecord $recordType where
      toNamedRecord = genericToNamedRecord defaultFieldNameOptions

    instance FromNamedRecord $recordType where
      parseNamedRecord = genericParseNamedRecord defaultFieldNameOptions

    instance DefaultOrdered $recordType where
      headerOrder = genericHeaderOrder defaultFieldNameOptions
  |]
  where
    -- Type constructor named by the caller; spliced into every instance head.
    recordType :: TypeQ
    recordType = conT (mkName recordName)


-- $tabOptions
{-| 'DecodeOptions' for tab separated input files: Cassava's
'defaultDecodeOptions' with the field delimiter replaced by the tab character.
-}
tabOptions :: DecodeOptions
tabOptions = defaultDecodeOptions { decDelimiter = tabDelimiter }
  where
    -- The delimiter is supplied as the numeric code of the tab character.
    tabDelimiter = fromIntegral . DC.ord $ '\t'

-- $commaOptions
{-| Provides the 'DecodeOptions' for comma separated input files. This is
Cassava's 'defaultDecodeOptions', used unchanged.
-}
commaOptions :: DecodeOptions
commaOptions = defaultDecodeOptions

-- $makeCsvRecord
{-|
Builds the declarations of a Record type whose fields reflect the types
inferred from the contents of the input file.

The file is read in full, via 'runIO', when the resulting 'DecsQ' action is run.
-}
makeCsvRecord :: String -- ^ Name of the Record type to create
              -> FilePath  -- ^ Path of the input file to inspect
              -> String -- ^ Prefix added to field names. "_" is recommended so the fields work well with Lens
              -> DecodeOptions -- ^ 'DecodeOptions' Cassava should use to read the input file
              -> DecsQ
makeCsvRecord recordName fileName prefix decodeOptions =
  -- NOTE(review): the input file is not registered with 'addDependentFile', so
  -- editing it presumably does not trigger recompilation on its own - confirm
  -- whether that is intended.
  runIO (BL.readFile fileName) >>= buildRecord
  where
    -- Split the raw contents into headers and records, infer a type for
    -- each column, and emit the record declaration.
    buildRecord contents =
      let (columnNames, rows) = createRecords contents decodeOptions
      in makeRecord recordName (inferTypes columnNames rows prefix)

-- $loadData
{-|
Helper function to load the data from the provided file path.

The contents are decoded with 'decodeByName', so the file is expected to start
with a header row. If decoding fails, the action fails (via 'fail') with a
message that names the file and includes the error reported by Cassava.
-}
loadData :: (FromNamedRecord a)
         => FilePath  -- ^ Path of the file to be loaded
         -> IO (V.Vector a) -- ^ a will be of a Record type
loadData filePath = do
  csvData <- BLZ.readFile filePath
  case decodeByName csvData of
    -- Name the file and separate the parser message so the failure is readable.
    Left err -> fail ("Failed to load " Prelude.++ filePath Prelude.++ ": " Prelude.++ err)
    -- The decoded header is not needed; only the records are returned.
    Right (_, records) -> return records