{-# LANGUAGE ScopedTypeVariables #-} {-# LANGUAGE OverloadedStrings #-} {-# LANGUAGE RankNTypes #-} {-| Module : Data.Sv.Decode.Core Copyright : (C) CSIRO 2017-2018 License : BSD3 Maintainer : George Wilson Stability : experimental Portability : non-portable This module contains data structures, combinators, and primitives for decoding a CSV into a list of your Haskell datatype. A 'Decode' can be built using the primitives in this file. 'Decode' is an 'Applicative' and an 'Data.Functor.Alt.Alt', allowing for composition of these values with '<*>' and '' The primitive 'Decode's in this file which use 'ByteString' expect UTF-8 encoding. The Decode type has an instance of 'Data.Profunctor.Profunctor', so you can 'lmap' or 'alterInput' to reencode on the way in. This module is intended to be imported qualified like so @ import qualified Data.Sv.Decode.Core as D @ -} module Data.Sv.Decode.Core ( -- * The types Decode (..) , Decode' , DecodeValidation , DecodeError (..) , DecodeErrors (..) -- * Running Decodes , decode -- * Convenience constructors and functions , decodeMay , decodeEither , decodeEither' , mapErrors , alterInput -- * Primitive Decodes -- ** Name-based , column , (.:) -- ** Field-based , contents , char , byteString , utf8 , lazyUtf8 , lazyByteString , string , int , integer , float , double , rational , boolean , boolean' , ignore , replace , exactly , emptyField -- ** Row-based , row -- * Combinators , choice , element , optionalField , ignoreFailure , orEmpty , either , orElse , orElseE , categorical , categorical' , (>>==) , (==<<) , bindDecode -- * Building Decodes from Readable , decodeRead , decodeRead' , decodeReadWithMsg -- * Building Decodes from parsers , withTrifecta , withAttoparsec , withParsec , withTextReader -- * Working with errors , onError , decodeError , unexpectedEndOfRow , expectedEndOfRow , unknownCategoricalValue , badParse , badDecode , validateEither , validateEitherWith , validateMaybe -- * Implementation details , runDecode , buildDecode , mkDecode , promote , promote' , runNamed , anonymous , makePositional ) where import Prelude hiding (either) import qualified Prelude import Control.Lens (alaf) import Control.Monad (unless) import Control.Monad.Reader (ReaderT (ReaderT, runReaderT)) import Control.Monad.State (state) import Control.Monad.Writer.Strict (runWriter) import qualified Data.Attoparsec.ByteString as A import Data.Bifunctor (first, second) import Data.ByteString (ByteString) import qualified Data.ByteString.UTF8 as UTF8 import qualified Data.ByteString.Lazy as LBS import Data.Char (toUpper) import Data.Functor.Alt (Alt (())) import Data.Functor.Compose (Compose (Compose, getCompose)) import Data.List.NonEmpty (NonEmpty ((:|))) import Data.Map.Strict (Map) import qualified Data.Map.Strict as M import Data.Monoid (First (First), Last) import Data.Profunctor (lmap) import Data.Readable (Readable (fromBS)) import Data.Semigroup (Semigroup ((<>)), sconcat) import Data.Semigroup.Foldable (asum1) import Data.Semigroupoid (Semigroupoid (o)) import Data.Set (Set, fromList, member) import Data.String (IsString (fromString)) import Data.Text (Text) import qualified Data.Text as T import Data.Text.Encoding (decodeUtf8', encodeUtf8) import qualified Data.Text.Read as TR (Reader, rational) import qualified Data.Text.Lazy as LT import Data.Vector (Vector, (!)) import qualified Data.Vector as V import Text.Parsec (Parsec) import qualified Text.Parsec as P (parse) import qualified Text.Trifecta as Tri import Data.Sv.Decode.Error import Data.Sv.Decode.Type -- | Decodes a sv into a list of its values using the provided 'Decode' decode :: Traversable f => Decode' ByteString a -> f (Vector ByteString) -> DecodeValidation ByteString (f a) decode d = traverse (promote d) -- | Build a 'Decode', given a function that returns 'Maybe'. -- -- Return the given error if the function returns 'Nothing'. decodeMay :: DecodeError e -> (s -> Maybe a) -> Decode e s a decodeMay e f = mkDecode (validateMaybe e . f) -- | Build a 'Decode', given a function that returns 'Either'. decodeEither :: (s -> Either (DecodeError e) a) -> Decode e s a decodeEither f = mkDecode (validateEither . f) -- | Build a 'Decode', given a function that returns 'Either', and a function to -- build the error. decodeEither' :: (e -> DecodeError e') -> (s -> Either e a) -> Decode e' s a decodeEither' e f = mkDecode (validateEitherWith e . f) -- | Get the contents of a field without doing any decoding. This never fails. contents :: Decode e s s contents = mkDecode pure -- | Grab the whole row as a 'Vector' row :: Decode e s (Vector s) row = Decode . Compose . DecodeState . ReaderT $ \v -> state (const (pure v, Ind (V.length v))) -- | Get a field that's a single char. This will fail if there are mulitple -- characters in the field. char :: Decode' ByteString Char char = string >>== \cs -> case cs of [] -> badDecode "Expected single char but got empty string" (c:[]) -> pure c (_:_:_) -> badDecode ("Expected single char but got " <> UTF8.fromString cs) -- | Get the contents of a field as a bytestring. -- -- Alias for 'contents' byteString :: Decode' ByteString ByteString byteString = contents -- | Get the contents of a UTF-8 encoded field as 'Text' -- -- This will also work for ASCII text, as ASCII is a subset of UTF-8 utf8 :: Decode' ByteString Text utf8 = contents >>== Prelude.either (badDecode . UTF8.fromString . show) pure . decodeUtf8' -- | Get the contents of a field as a lazy 'Data.Text.Lazy.Text' lazyUtf8 :: Decode' ByteString LT.Text lazyUtf8 = LT.fromStrict <$> utf8 -- | Get the contents of a field as a lazy 'Data.ByteString.Lazy.ByteString' lazyByteString :: Decode' ByteString LBS.ByteString lazyByteString = LBS.fromStrict <$> contents -- | Get the contents of a field as a 'String' string :: Decode' ByteString String string = UTF8.toString <$> contents -- | Throw away the contents of a field. This is useful for skipping unneeded fields. ignore :: Decode e s () ignore = replace () -- | Throw away the contents of a field, and return the given value. replace :: a -> Decode e s a replace a = a <$ contents -- | Decode exactly the given string, or else fail. exactly :: (Semigroup s, Eq s, IsString s) => s -> Decode' s s exactly s = contents >>== \z -> if s == z then pure s else badDecode (sconcat ("'":|[z,"' was not equal to '",s,"'"])) -- | Decode a UTF-8 'ByteString' field as an 'Int' int :: Decode' ByteString Int int = named "int" -- | Decode a UTF-8 'ByteString' field as an 'Integer' integer :: Decode' ByteString Integer integer = named "integer" -- | Decode a UTF-8 'ByteString' field as a 'Float' float :: Decode' ByteString Float float = named "float" -- | Decode a UTF-8 'ByteString' field as a 'Double' -- -- This is much faster than 'rational' but can be less precise. If you're -- not sure which to use, use 'rational'. -- -- See the documentation for 'Data.Text.Read.double' for more information. double :: Decode' ByteString Double double = named "double" -- | Decode a UTF-8 'ByteString' as any 'Floating' type (usually 'Double') -- -- This is slower than 'double ' but more precise. If you're -- not sure which to use, use 'rational'. -- -- See the documentation for 'Data.Text.Read.double' for more information. rational :: Floating a => Decode' ByteString a rational = rat `o` utf8 where rat = mapErrors encodeUtf8 (withTextReader TR.rational) -- | Decode a field as a 'Bool' -- -- This aims to be tolerant to different forms a boolean might take. boolean :: (IsString s, Ord s) => Decode' s Bool boolean = boolean' fromString -- | Decode a field as a 'Bool'. This version lets you provide the fromString -- function that's right for you, since 'Data.String.IsString' on a -- 'Data.ByteString.ByteString' will do the wrong thing in the case of many -- encodings such as UTF-16 or UTF-32. -- -- This aims to be tolerant to different forms a boolean might take. boolean' :: Ord s => (String -> s) -> Decode' s Bool boolean' s = categorical' [ (False, fmap s ["false", "False", "FALSE", "f", "F", "0", "n", "N", "no", "No", "NO", "off", "Off", "OFF"]) , (True, fmap s ["true", "True", "TRUE", "t", "T", "1", "y", "Y", "yes", "Yes", "YES", "on", "On", "ON"]) ] -- | Succeed only when the given field is the empty string. -- -- The empty string surrounded in quotes or spaces is still the empty string. emptyField :: (Eq s, IsString s, Semigroup s) => Decode' s () emptyField = contents >>== \c -> unless (c == fromString "") (badDecode ("Expected emptiness but got: " <> c)) -- | Choose the leftmost 'Decode' that succeeds. Alias for '' choice :: Decode e s a -> Decode e s a -> Decode e s a choice = () -- | Choose the leftmost 'Decode' that succeeds. Alias for 'asum1' element :: NonEmpty (Decode e s a) -> Decode e s a element = asum1 -- | Try the given 'Decode'. If it fails, instead succeed with 'Nothing'. ignoreFailure :: Decode e s a -> Decode e s (Maybe a) ignoreFailure a = Just <$> a Nothing <$ ignore -- | If the field is the empty string, succeed with 'Nothing'. -- Otherwise try the given 'Decode'. orEmpty :: (Eq s, IsString s, Semigroup s) => Decode' s a -> Decode' s (Maybe a) orEmpty a = Nothing <$ emptyField Just <$> a -- | Try the given 'Decode'. If it fails, succeed without consuming anything. -- -- This usually isn't what you want. 'ignoreFailure' and 'orEmpty' are more -- likely what you are after. optionalField :: Decode e s a -> Decode e s (Maybe a) optionalField a = Just <$> a pure Nothing -- | Try the first, then try the second, and wrap the winner in an 'Either'. -- -- This is left-biased, meaning if they both succeed, left wins. either :: Decode e s a -> Decode e s b -> Decode e s (Either a b) either a b = fmap Left a fmap Right b -- | Try the given decoder, otherwise succeed with the given value. orElse :: Decode e s a -> a -> Decode e s a orElse f a = f replace a -- | Try the given decoder, or if it fails succeed with the given value, in an 'Either'. orElseE :: Decode e s b -> a -> Decode e s (Either a b) orElseE b a = fmap Right b replace (Left a) -- | Decode categorical data, given a list of the values and the strings which match them. -- -- Usually this is used with sum types with nullary constructors. -- -- > data TrafficLight = Red | Amber | Green -- > categorical [(Red, "red"), (Amber, "amber"), (Green, "green")] categorical :: (Ord s, Show a) => [(a, s)] -> Decode' s a categorical = categorical' . fmap (fmap pure) -- | Decode categorical data, given a list of the values and lists of strings -- which match them. -- -- This version allows for multiple strings to match each value, which is -- useful for when the categories are inconsistently labelled. -- -- > data TrafficLight = Red | Amber | Green -- > categorical' [(Red, ["red", "R"]), (Amber, ["amber", "orange", "A"]), (Green, ["green", "G"])] -- -- For another example of its usage, see the source for 'boolean'. categorical' :: forall s a . (Ord s, Show a) => [(a, [s])] -> Decode' s a categorical' as = let as' :: [(a, Set s)] as' = fmap (second fromList) as go :: s -> (a, Set s) -> Maybe a go s (a, set) = if s `member` set then Just a else Nothing in contents >>== \s -> validateMaybe (UnknownCategoricalValue s (fmap snd as)) $ alaf First foldMap (go s) as' -- | Use the 'Readable' instance to try to decode the given value. decodeRead :: Readable a => Decode' ByteString a decodeRead = decodeReadWithMsg (mappend "Couldn't decode ") -- | Use the 'Readable' instance to try to decode the given value, -- or fail with the given error message. decodeRead' :: Readable a => ByteString -> Decode' ByteString a decodeRead' e = decodeReadWithMsg (const e) -- | Use the 'Readable' instance to try to decode the given value, -- or use the value to build an error message. decodeReadWithMsg :: Readable a => (ByteString -> e) -> Decode e ByteString a decodeReadWithMsg e = contents >>== \c -> maybe (badDecode (e c)) pure . fromBS $ c -- | Given the name of a type, try to decode it using 'Readable', named :: Readable a => ByteString -> Decode' ByteString a named name = let vs' = ['a','e','i','o','u'] vs = fmap toUpper vs' ++ vs' n c = if c `elem` vs then "n" else "" n' = foldMap (n . fst) . UTF8.uncons n'' = n' name space = " " in decodeReadWithMsg $ \bs -> mconcat ["Couldn't decode \"", bs, "\" as a", n'', space, name] -- | Map over the errors of a 'Decode' -- -- To map over the other two parameters, use the 'Data.Profunctor.Profunctor' instance. mapErrors :: (e -> x) -> Decode e s a -> Decode x s a mapErrors f (Decode (Compose r)) = Decode (Compose (fmap (rnat (first (fmap f))) r)) -- | This transforms a @Decode' s a@ into a @Decode' t a@. It needs -- functions in both directions because the errors can include fragments of the -- input. -- -- @alterInput :: (s -> t) -> (t -> s) -> Decode' s a -> Decode' t a@ alterInput :: (e -> x) -> (t -> s) -> Decode e s a -> Decode x t a alterInput f g = mapErrors f . lmap g ---- Promoting parsers to 'Decode's -- | Build a 'Decode' from a Trifecta parser withTrifecta :: Tri.Parser a -> Decode' ByteString a withTrifecta = mkParserFunction (validateTrifectaResult (BadDecode . UTF8.fromString)) (flip Tri.parseByteString mempty) -- | Build a 'Decode' from an Attoparsec parser withAttoparsec :: A.Parser a -> Decode' ByteString a withAttoparsec = mkParserFunction (validateEitherWith (BadDecode . fromString)) A.parseOnly -- | Build a 'Decode' from a Parsec parser withParsec :: Parsec ByteString () a -> Decode' ByteString a withParsec = -- Parsec will include a position, but it will only confuse the user -- since it won't correspond obviously to a position in their source file. let dropPos = drop 1 . dropWhile (/= ':') in mkParserFunction (validateEitherWith (BadDecode . UTF8.fromString . dropPos . show)) (\p s -> P.parse p mempty s) -- | Build a 'Decode' from a @Data.Text@ 'Data.Text.Read.Reader' withTextReader :: TR.Reader a -> Decode' Text a withTextReader ir = decodeEither $ \t -> case ir t of Left s -> let msg = "Couldn't decode \"" <> t <> "\": " <> T.pack s in Left (BadDecode msg) Right (a,leftover) -> if T.null leftover then pure a else Left (BadDecode ( "Leftover input during decoding: " <> leftover )) mkParserFunction :: Tri.CharParsing p => (f a -> DecodeValidation ByteString a) -> (p a -> ByteString -> f a) -> p a -> Decode' ByteString a mkParserFunction err run p = let p' = p <* Tri.eof in byteString >>== (err . run p') {-# INLINE mkParserFunction #-} -- | This can be used to build a 'Decode' whose value depends on the -- result of another 'Decode'. This is especially useful since 'Decode' is not -- a 'Monad'. -- -- If you need something like this but with more power, look at 'bindDecode' (>>==) :: Decode e s a -> (a -> DecodeValidation e b) -> Decode e s b (>>==) = flip (==<<) infixl 1 >>== {-# INLINE (>>==) #-} -- | flipped '>>==' (==<<) :: (a -> DecodeValidation e b) -> Decode e s a -> Decode e s b (==<<) f d = buildDecode $ \vec i -> case runDecode d vec i of (v, l, i') -> (bindValidation v f, l, i') infixr 1 ==<< -- | Bind through a 'Decode'. -- -- This bind does not agree with the 'Applicative' instance because it does -- not accumulate multiple error values. This is a violation of the 'Monad' -- laws, meaning 'Decode' is not a 'Monad'. -- -- That is not to say that there is anything wrong with using this function. -- It can be quite useful. bindDecode :: Decode e s a -> (a -> Decode e s b) -> Decode e s b bindDecode d f = buildDecode $ \v i -> case runDecode d v i of (Failure e, l, i') -> (Failure e, l, i') (Success a, l, i') -> case runDecode (f a) v i' of (v', l', i'') -> (v', l <> l', i'') -- | Run a 'Decode', and based on its errors build a new 'Decode'. onError :: Decode e s a -> (DecodeErrors e -> Decode e s a) -> Decode e s a onError d f = buildDecode $ \v i -> case runDecode d v i of (Success a, l, i') -> (Success a, l, i') (Failure e, l, i') -> case runDecode (f e) v i' of (v',l',i'') -> (v',l <> l',i'') -- | Build a 'Decode' from a function. mkDecode :: (s -> DecodeValidation e a) -> Decode e s a mkDecode f = Decode . Compose . DecodeState . ReaderT $ \v -> state $ \(Ind i) -> if i >= length v then (Compose (pure unexpectedEndOfRow), Ind i) else (Compose (pure (f (v ! i))), Ind (i+1)) -- | Promotes a 'Decode' to work on a whole 'Record' at once. -- This does not need to be called by the user. Instead use 'decode'. promote :: Decode' s a -> Vector s -> DecodeValidation s a promote = promote' id {-# INLINE promote #-} -- | Promotes a 'Decode' to work on a whole 'Record' at once. -- This does not need to be called by the user. Instead use 'decode'. -- -- This version lets the error string and input string type pararms -- differ, but needs a function to convert between them. promote' :: (s -> e) -> Decode e s a -> Vector s -> DecodeValidation e a promote' se dec vecField = let len = length vecField in case runDecode dec vecField (Ind 0) of (d, l, Ind i) -> if i < len && and l then d *> expectedEndOfRow (V.force (fmap se (V.drop i vecField))) else d -- | Convenience to get the underlying function out of a 'Decode' in a useful form runDecode :: Decode e s a -> Vector s -> Ind -> (DecodeValidation e a, Last Bool, Ind) runDecode = fmap (fmap z) . runDecodeState . getCompose . unwrapDecode where z (Compose wv, i) = case runWriter wv of (v,l) ->(v,l,i) {-# INLINE runDecode #-} -- | Convenience to get the underlying function out of a 'NameDecode' in a useful form runNamed :: NameDecode e s a -> Map s Ind -> DecodeValidation e (Decode e s a) runNamed = fmap getCompose . runReaderT . unNamed -- | Promote a 'Decode' to a 'NameDecode' that doesn't look for any names anonymous :: Decode e s a -> NameDecode e s a anonymous = Named . ReaderT . pure . Compose . pure -- | Given a header and a 'NameDecode', resolve header names to positions and -- return a 'Decode' makePositional :: Ord s => Vector s -> NameDecode e s a -> DecodeValidation e (Decode e s a) makePositional names d = runNamed d . M.fromList $ zip (V.toList names) (Ind <$> [0..]) -- | This is the primitive for building decoders that work with columns -- -- Look for the column with the given name and run the given decoder on it column :: Ord s => s -> Decode' s a -> NameDecode' s a column s d = Named . ReaderT $ \m -> case M.lookup s m of Nothing -> Compose (missingColumn s) Just i -> Compose . pure . buildDecode $ \vec _ -> case runDecode d vec i of (v, l, i') -> (v, l <> pure False, i') -- | Infix alias for 'column' -- -- Mnemonic: __D__ot colon names __D__ecoders, __E__qual colon names __E__ncoders. (.:) :: Ord s => s -> Decode' s a -> NameDecode' s a (.:) = column {-# INLINE (.:) #-} infixl 5 .: rnat :: Functor f => (g a -> h a) -> Compose f g a -> Compose f h a rnat gh (Compose fga) = Compose (fmap gh fga)