{-# language OverloadedStrings, TemplateHaskell #-}
{-# OPTIONS_GHC -fno-warn-unused-imports #-}
module Numeric.Datasets.Netflix (
parseTrainingSet, parseTestSet, parseMovies,
RD(..),
UserId, MovieId,
Train(..), Test(..), Movie(..),
RatingDate(..),
trainingSet, testSet, movies
) where
import Prelude hiding (takeWhile)
import Numeric.Datasets
import Data.FileEmbed
import Data.ByteString hiding (map, head, takeWhile)
import Data.Time (Day, fromGregorian)
import Data.Monoid (mconcat)
import Data.Traversable (traverse)
import qualified Data.Attoparsec.Internal.Types as PT (Parser)
import Data.Attoparsec.ByteString
import Data.Attoparsec.ByteString.Char8 hiding (takeWhile, inClass)
-- | Files found under @datafiles/netflix/training/@, embedded into the
-- binary at compile time by 'embedDir'. Each entry pairs a file path with
-- that file's raw contents.
trainingSet :: [(FilePath, ByteString)]
trainingSet = $(embedDir "datafiles/netflix/training/")
-- | Files found under @datafiles/netflix/test/@, embedded into the binary
-- at compile time by 'embedDir'. Each entry pairs a file path with that
-- file's raw contents.
testSet :: [(FilePath, ByteString)]
testSet = $(embedDir "datafiles/netflix/test/")
-- | Files found under @datafiles/netflix/movies/@, embedded into the binary
-- at compile time by 'embedDir'. Each entry pairs a file path with that
-- file's raw contents.
movies :: [(FilePath, ByteString)]
movies = $(embedDir "datafiles/netflix/movies/")
-- | A user id paired with a calendar day; the part shared by training
-- and test rows.
data RatingDate = RatingDate
  { userId     :: UserId
    -- ^ the user the row belongs to
  , ratingDate :: Day
    -- ^ the day recorded on the row
  } deriving (Eq, Show)
-- | Numeric user identifier, wrapped so it cannot be confused with other
-- 'Int' values.
newtype UserId = UserId { unUserId :: Int }
  deriving Eq

-- | Renders only the wrapped number, without the constructor name.
instance Show UserId where
  show (UserId n) = show n
-- | One training-set row: who/when information plus the integer rating.
data Train = Train
  { trainRating :: RatingDate
    -- ^ user and day of this row
  , rating      :: Int
    -- ^ the rating value as parsed from the row
  } deriving (Eq, Show)
-- | Numeric movie identifier, wrapped so it cannot be confused with other
-- 'Int' values.
newtype MovieId = MovieId { unMovieId :: Int }
  deriving Eq

-- | Renders only the wrapped number, without the constructor name.
instance Show MovieId where
  show (MovieId n) = show n
-- | One row of the movies table.
data Movie = Movie
  { movieId     :: MovieId
    -- ^ identifier of the movie
  , releaseYear :: Day
    -- ^ release year, stored as a 'Day' (the row parser in this module
    -- fills it with 1 January of the parsed year)
  , movieTitle  :: ByteString
    -- ^ title bytes exactly as read from the row
  } deriving (Eq, Show)
-- | One test-set row: a user and a day, with no rating value attached.
newtype Test = Test
  { testRating :: RatingDate
    -- ^ user and day of this row
  } deriving (Eq, Show)
-- | All rows that were listed under a single movie id.
data Col a = Col
  { cMovieId :: MovieId
    -- ^ the movie the rows belong to
  , cSet     :: [a]
    -- ^ the rows themselves, in the order they were supplied
  } deriving (Eq, Show)
-- | A 'Col' specialised to training rows.
newtype TrainCol = TrainC
  { unTrC :: Col Train
  } deriving (Eq, Show)

-- | Bundle a movie id and its training rows into a 'TrainCol'.
mkTrainCol :: MovieId -> [Train] -> TrainCol
mkTrainCol mid = TrainC . Col mid
-- | A 'Col' specialised to test rows.
newtype TestCol = TestC
  { unTeC :: Col Test
  } deriving (Eq, Show)

-- | Bundle a movie id and its test rows into a 'TestCol'.
mkTestCol :: MovieId -> [Test] -> TestCol
mkTestCol mid = TestC . Col mid
-- | A rating value (of caller-chosen numeric type) together with its day.
data RD a = RD
  { rdRating :: a
    -- ^ the rating value
  , rdDate   :: Day
    -- ^ the day attached to the rating
  } deriving (Eq, Show)
-- | Flatten a 'TrainCol' into @(user, movie, rating-with-date)@ triples,
-- one per training row and in the same order. The integer rating is
-- converted to the requested numeric type with 'fromIntegral'.
toCoordsTrainCol :: Num a => TrainCol -> [(UserId, MovieId, RD a)]
toCoordsTrainCol (TrainC (Col movie rows)) = [ coord row | row <- rows ]
  where
    -- Every triple carries the column's movie id; the rest comes from the row.
    coord row =
      let stamp = trainRating row
       in (userId stamp, movie, RD (fromIntegral (rating row)) (ratingDate stamp))
-- | Flatten a 'TestCol' into @(user, movie, day)@ triples, one per test
-- row and in the same order.
toCoordsTestCol :: TestCol -> [(UserId, MovieId, Day)]
toCoordsTestCol (TestC (Col movie rows)) = [ coord row | row <- rows ]
  where
    -- Every triple carries the column's movie id; the rest comes from the row.
    coord row =
      let stamp = testRating row
       in (userId stamp, movie, ratingDate stamp)
-- | Parse every embedded training file and return all
-- @(user, movie, rating-with-date)@ triples as one flat list.
-- Yields 'Left' with the parser's message if any file fails to parse.
parseTrainingSet :: Num a => Either String [(UserId, MovieId, RD a)]
parseTrainingSet = fmap mconcat parseTrainingSet'
-- | Like 'parseTrainingSet' but keeps one inner list per embedded file.
-- The first file that fails to parse aborts the whole result with 'Left'.
parseTrainingSet' :: Num a => Either String [[(UserId, MovieId, RD a)]]
parseTrainingSet' =
  map toCoordsTrainCol
    <$> traverse (parseOnly trainingSetParser . snd) trainingSet
-- | Parse every embedded test file and return all @(user, movie, day)@
-- triples as one flat list. Yields 'Left' with the parser's message if any
-- file fails to parse.
parseTestSet :: Either String [(UserId, MovieId, Day)]
parseTestSet = fmap mconcat parseTestSet'
-- | Like 'parseTestSet' but keeps one inner list per parsed stanza
-- (stanzas from all embedded files are concatenated first).
-- The first file that fails to parse aborts the whole result with 'Left'.
parseTestSet' :: Either String [[(UserId, MovieId, Day)]]
parseTestSet' =
  map toCoordsTestCol . mconcat
    <$> traverse (parseOnly testSetParser . snd) testSet
-- | Parse every embedded movies file and concatenate the resulting rows.
-- Yields 'Left' with the parser's message if any file fails to parse.
parseMovies :: Either String [Movie]
parseMovies = mconcat <$> traverse (parseOnly moviesParser . snd) movies
-- | Parse a single stanza of training rows (a movie-id header line followed
-- by one or more rows) into a 'TrainCol'.
trainingSetParser :: PT.Parser ByteString TrainCol
trainingSetParser = uncurry mkTrainCol <$> stanza trainRow
-- | Parse one or more consecutive stanzas of test rows, producing one
-- 'TestCol' per stanza.
testSetParser :: PT.Parser ByteString [TestCol]
testSetParser = map (uncurry mkTestCol) <$> many1 (stanza testRow)
-- | Parse one or more movie rows, each terminated by an end-of-line.
moviesParser :: PT.Parser ByteString [Movie]
moviesParser = many1 (moviesRow <* endOfLine)
-- | Parse a training row of the form @user,rating,yyyy-mm-dd@
-- (without the line terminator).
trainRow :: PT.Parser ByteString Train
trainRow = assemble <$> decc <*> decc <*> date
  where
    -- Fields arrive in file order: user id, rating, then date.
    assemble uid stars day = Train (RatingDate (UserId uid) day) stars
-- | Parse a test row of the form @user,yyyy-mm-dd@
-- (without the line terminator).
testRow :: PT.Parser ByteString Test
testRow = assemble <$> decc <*> date
  where
    assemble uid day = Test (RatingDate (UserId uid) day)
-- | Parse a movie row of the form @id,year,title@ (without the line
-- terminator).
--
-- The title is the longest, possibly empty, run of bytes drawn from
-- letters, digits, space and the punctuation @- : , & .@; any other byte
-- ends the title. The year is stored as 1 January of that year.
moviesRow :: PT.Parser ByteString Movie
moviesRow = assemble <$> decc <*> decc <*> takeWhile titleByte
  where
    titleByte = inClass "-a-zA-Z0-9 :,&."
    assemble num year name =
      Movie (MovieId num) (fromGregorian (fromIntegral year) 1 1) name
-- | Run a row parser one or more times, requiring an end-of-line after
-- every row (including the last one).
parseRows :: PT.Parser ByteString a -> PT.Parser ByteString [a]
parseRows row = many1 line
  where
    line = row <* endOfLine
-- | Parse a stanza: a header line @\<movie id\>:@ followed by one or more
-- rows, each terminated by an end-of-line. Returns the movie id together
-- with the parsed rows.
stanza :: PT.Parser ByteString a -> PT.Parser ByteString (MovieId, [a])
stanza p = (,) <$> movieHeader <*> many1 (p <* endOfLine)
  where
    -- 'ident' yields an 'Integer'; narrow it to the 'Int' inside 'MovieId'.
    movieHeader = MovieId . fromIntegral <$> ident <* endOfLine
-- | Parse a date written as exactly three dash-separated decimal fields,
-- @year-month-day@ (e.g. @2005-09-06@).
--
-- The fields are parsed positionally rather than by pattern-matching on the
-- result of 'sepBy'. This way a date with fewer than three fields fails with
-- an ordinary parse error instead of an opaque pattern-match failure, and
-- any further @-N@ components are left unconsumed instead of being silently
-- swallowed as part of the date.
--
-- Out-of-range month or day values are not rejected here: 'fromGregorian'
-- clips them to the nearest valid value.
date :: PT.Parser ByteString Day
date = fromGregorian <$> (decimal <* dash) <*> (decimal <* dash) <*> decimal
-- | Match a single @,@ field separator.
comma :: Parser Char
comma = char ','

-- | Match a single @-@ date separator.
dash :: Parser Char
dash = char '-'
-- | Parse an unsigned decimal number followed by a comma, returning the
-- number and consuming the comma.
decc :: PT.Parser ByteString Int
decc = decimal <* comma
-- | Parse an unsigned decimal number followed by a colon, returning the
-- number and consuming the colon.
ident :: PT.Parser ByteString Integer
ident = decimal <* char ':'