{-# language DeriveAnyClass #-}
{-# LANGUAGE DeriveGeneric #-}
{-# LANGUAGE OverloadedStrings #-}
{-# options_ghc -Wno-unused-imports -Wno-unused-top-binds #-}
module Algebra.Graph.IO.Datasets.LINQS.Citeseer (citeseerGraph, stash, ContentRow(..), DocClass(..)) where

import Control.Applicative (Alternative(..))
import Control.Monad (when, foldM)
import Control.Monad.IO.Class (MonadIO(..))
import GHC.Generics (Generic(..))
import GHC.Int (Int16)
import Data.Functor (($>))

-- algebraic-graphs
import qualified Algebra.Graph as G (Graph, empty, overlay, edge)
-- binary
import Data.Binary (Binary(..), encode, decode, encodeFile, decodeFileOrFail)
-- binary-conduit
import qualified Data.Conduit.Serialization.Binary as CB (conduitDecode, conduitEncode, ParseError(..))
-- bytestring
import Data.ByteString (ByteString)
import Data.ByteString.Char8 (unpack)
-- conduit
import Conduit (MonadUnliftIO(..), MonadResource, runResourceT)
import Data.Conduit (runConduit, ConduitT, (.|), yield, await)
import qualified Data.Conduit.Combinators as C (print, sourceFile, sinkFile, map, mapM, foldM, foldMap, foldl, foldMapM, mapWhile)
-- containers
import Data.Sequence (Seq, (|>))
import qualified Data.Map as M (Map, singleton, lookup)
-- exceptions
import Control.Monad.Catch (MonadThrow(..))
-- filepath
import System.FilePath ((</>), takeFileName, takeExtension)
-- http-conduit
import Network.HTTP.Simple (httpSource, getResponseBody, Response, Request, parseRequest, setRequestMethod)
-- megaparsec
import Text.Megaparsec (parse, parseTest, (<?>))
import Text.Megaparsec.Char (char)
import Text.Megaparsec.Char.Lexer (decimal)
import Text.Megaparsec.Error (errorBundlePretty)
-- parser.combinators
import Control.Monad.Combinators (count)
-- primitive
import Control.Monad.Primitive (PrimMonad(..))
-- tar-conduit
import Data.Conduit.Tar (Header(..), untarChunks, TarChunk, withEntries, FileInfo, filePath, withFileInfo, headerFileType, FileType(..), headerFilePath)
-- text
import qualified Data.Text as T (Text, unwords)
import qualified Data.Text.IO as T (readFile)

import Algebra.Graph.IO.Internal.Conduit (fetch, unTarGz)
import Algebra.Graph.IO.Internal.Megaparsec (Parser, ParseE, symbol, lexeme, alphaNum)
import Algebra.Graph.IO.SV (parseTSV)

{-
CiteSeer: The CiteSeer dataset consists of 3312 scientific publications classified into one of six classes. The citation network consists of 4732 links. Each publication in the dataset is described by a 0/1-valued word vector indicating the absence/presence of the corresponding word from the dictionary. The dictionary consists of 3703 unique words. The README file in the dataset provides more details.
http://www.cs.umd.edu/~sen/lbc-proj/data/citeseer.tgz
-}

-- | Download, parse, serialize and save the dataset to local storage.
--
-- Two binary files will be created under @.\/assets\/citeseer/@
stash :: IO ()
stash :: IO ()
stash = do
  let path :: String
path = String
"http://www.cs.umd.edu/~sen/lbc-proj/data/citeseer.tgz"
  Request
rq <- String -> IO Request
forall (m :: * -> *). MonadThrow m => String -> m Request
parseRequest String
path
  ResourceT IO () -> IO ()
forall (m :: * -> *) a. MonadUnliftIO m => ResourceT m a -> m a
runResourceT (ResourceT IO () -> IO ()) -> ResourceT IO () -> IO ()
forall a b. (a -> b) -> a -> b
$ ConduitT () Void (ResourceT IO) () -> ResourceT IO ()
forall (m :: * -> *) r. Monad m => ConduitT () Void m r -> m r
runConduit (ConduitT () Void (ResourceT IO) () -> ResourceT IO ())
-> ConduitT () Void (ResourceT IO) () -> ResourceT IO ()
forall a b. (a -> b) -> a -> b
$
    Request -> ConduitT () ByteString (ResourceT IO) ()
forall (m :: * -> *) i.
MonadResource m =>
Request -> ConduitT i ByteString m ()
fetch Request
rq ConduitT () ByteString (ResourceT IO) ()
-> ConduitM ByteString Void (ResourceT IO) ()
-> ConduitT () Void (ResourceT IO) ()
forall (m :: * -> *) a b c r.
Monad m =>
ConduitM a b m () -> ConduitM b c m r -> ConduitM a c m r
.|
    ConduitT ByteString TarChunk (ResourceT IO) ()
forall (m :: * -> *).
(PrimMonad m, MonadThrow m) =>
ConduitT ByteString TarChunk m ()
unTarGz ConduitT ByteString TarChunk (ResourceT IO) ()
-> ConduitM TarChunk Void (ResourceT IO) ()
-> ConduitM ByteString Void (ResourceT IO) ()
forall (m :: * -> *) a b c r.
Monad m =>
ConduitM a b m () -> ConduitM b c m r -> ConduitM a c m r
.|
    (FileInfo -> ConduitM ByteString Void (ResourceT IO) ())
-> ConduitM TarChunk Void (ResourceT IO) ()
forall (m :: * -> *) o.
MonadThrow m =>
(FileInfo -> ConduitM ByteString o m ())
-> ConduitM TarChunk o m ()
withFileInfo ( \FileInfo
fi -> do
     FileInfo -> ConduitM ByteString Void (ResourceT IO) ()
forall (m :: * -> *) c.
(MonadThrow m, MonadResource m) =>
FileInfo -> ConduitT ByteString c m ()
contentToFile FileInfo
fi
     FileInfo -> ConduitM ByteString Void (ResourceT IO) ()
forall (m :: * -> *) c.
(MonadThrow m, MonadIO m, MonadResource m) =>
FileInfo -> ConduitT ByteString c m ()
citesToFile FileInfo
fi )

restoreContent :: IO (M.Map String (Seq Int16, DocClass))
restoreContent :: IO (Map String (Seq Int16, DocClass))
restoreContent = ResourceT IO (Map String (Seq Int16, DocClass))
-> IO (Map String (Seq Int16, DocClass))
forall (m :: * -> *) a. MonadUnliftIO m => ResourceT m a -> m a
runResourceT (ResourceT IO (Map String (Seq Int16, DocClass))
 -> IO (Map String (Seq Int16, DocClass)))
-> ResourceT IO (Map String (Seq Int16, DocClass))
-> IO (Map String (Seq Int16, DocClass))
forall a b. (a -> b) -> a -> b
$ ConduitT () Void (ResourceT IO) (Map String (Seq Int16, DocClass))
-> ResourceT IO (Map String (Seq Int16, DocClass))
forall (m :: * -> *) r. Monad m => ConduitT () Void m r -> m r
runConduit (ConduitT () Void (ResourceT IO) (Map String (Seq Int16, DocClass))
 -> ResourceT IO (Map String (Seq Int16, DocClass)))
-> ConduitT
     () Void (ResourceT IO) (Map String (Seq Int16, DocClass))
-> ResourceT IO (Map String (Seq Int16, DocClass))
forall a b. (a -> b) -> a -> b
$
  ConduitT () ContentRow (ResourceT IO) ()
forall (m :: * -> *) i.
(MonadResource m, MonadThrow m) =>
ConduitT i ContentRow m ()
contentFromFile ConduitT () ContentRow (ResourceT IO) ()
-> ConduitM
     ContentRow Void (ResourceT IO) (Map String (Seq Int16, DocClass))
-> ConduitT
     () Void (ResourceT IO) (Map String (Seq Int16, DocClass))
forall (m :: * -> *) a b c r.
Monad m =>
ConduitM a b m () -> ConduitM b c m r -> ConduitM a c m r
.|
  (ContentRow -> Map String (Seq Int16, DocClass))
-> ConduitM
     ContentRow Void (ResourceT IO) (Map String (Seq Int16, DocClass))
forall (m :: * -> *) b a o.
(Monad m, Monoid b) =>
(a -> b) -> ConduitT a o m b
C.foldMap ( \(CRow String
k Seq Int16
fs DocClass
c) -> String -> (Seq Int16, DocClass) -> Map String (Seq Int16, DocClass)
forall k a. k -> a -> Map k a
M.singleton String
k (Seq Int16
fs, DocClass
c) )


-- | document classes of the Citeseer dataset
data DocClass = Agents | AI | DB | IR | ML | HCI deriving (DocClass -> DocClass -> Bool
(DocClass -> DocClass -> Bool)
-> (DocClass -> DocClass -> Bool) -> Eq DocClass
forall a. (a -> a -> Bool) -> (a -> a -> Bool) -> Eq a
/= :: DocClass -> DocClass -> Bool
$c/= :: DocClass -> DocClass -> Bool
== :: DocClass -> DocClass -> Bool
$c== :: DocClass -> DocClass -> Bool
Eq, Int -> DocClass -> ShowS
[DocClass] -> ShowS
DocClass -> String
(Int -> DocClass -> ShowS)
-> (DocClass -> String) -> ([DocClass] -> ShowS) -> Show DocClass
forall a.
(Int -> a -> ShowS) -> (a -> String) -> ([a] -> ShowS) -> Show a
showList :: [DocClass] -> ShowS
$cshowList :: [DocClass] -> ShowS
show :: DocClass -> String
$cshow :: DocClass -> String
showsPrec :: Int -> DocClass -> ShowS
$cshowsPrec :: Int -> DocClass -> ShowS
Show, (forall x. DocClass -> Rep DocClass x)
-> (forall x. Rep DocClass x -> DocClass) -> Generic DocClass
forall x. Rep DocClass x -> DocClass
forall x. DocClass -> Rep DocClass x
forall a.
(forall x. a -> Rep a x) -> (forall x. Rep a x -> a) -> Generic a
$cto :: forall x. Rep DocClass x -> DocClass
$cfrom :: forall x. DocClass -> Rep DocClass x
Generic, Get DocClass
[DocClass] -> Put
DocClass -> Put
(DocClass -> Put)
-> Get DocClass -> ([DocClass] -> Put) -> Binary DocClass
forall t. (t -> Put) -> Get t -> ([t] -> Put) -> Binary t
putList :: [DocClass] -> Put
$cputList :: [DocClass] -> Put
get :: Get DocClass
$cget :: Get DocClass
put :: DocClass -> Put
$cput :: DocClass -> Put
Binary)

docClassP :: Parser DocClass
docClassP :: Parser DocClass
docClassP =
  (Text -> Parser Text
symbol Text
"Agents" Parser Text -> DocClass -> Parser DocClass
forall (f :: * -> *) a b. Functor f => f a -> b -> f b
$> DocClass
Agents) Parser DocClass -> Parser DocClass -> Parser DocClass
forall (f :: * -> *) a. Alternative f => f a -> f a -> f a
<|>
  (Text -> Parser Text
symbol Text
"AI" Parser Text -> DocClass -> Parser DocClass
forall (f :: * -> *) a b. Functor f => f a -> b -> f b
$> DocClass
AI) Parser DocClass -> Parser DocClass -> Parser DocClass
forall (f :: * -> *) a. Alternative f => f a -> f a -> f a
<|>
  (Text -> Parser Text
symbol Text
"DB" Parser Text -> DocClass -> Parser DocClass
forall (f :: * -> *) a b. Functor f => f a -> b -> f b
$> DocClass
DB) Parser DocClass -> Parser DocClass -> Parser DocClass
forall (f :: * -> *) a. Alternative f => f a -> f a -> f a
<|>
  (Text -> Parser Text
symbol Text
"IR" Parser Text -> DocClass -> Parser DocClass
forall (f :: * -> *) a b. Functor f => f a -> b -> f b
$> DocClass
IR) Parser DocClass -> Parser DocClass -> Parser DocClass
forall (f :: * -> *) a. Alternative f => f a -> f a -> f a
<|>
  (Text -> Parser Text
symbol Text
"ML" Parser Text -> DocClass -> Parser DocClass
forall (f :: * -> *) a b. Functor f => f a -> b -> f b
$> DocClass
ML) Parser DocClass -> Parser DocClass -> Parser DocClass
forall (f :: * -> *) a. Alternative f => f a -> f a -> f a
<|>
  (Text -> Parser Text
symbol Text
"HCI" Parser Text -> DocClass -> Parser DocClass
forall (f :: * -> *) a b. Functor f => f a -> b -> f b
$> DocClass
HCI)

{-
The .content file contains descriptions of the papers in the following format:

		<paper_id> <word_attributes>+ <class_label>

The first entry in each line contains the unique string ID of the paper followed by binary values indicating whether each word in the vocabulary is present (indicated by 1) or absent (indicated by 0) in the paper (vocabulary : 3703 unique words). Finally, the last entry in the line contains the class label of the paper.
-}

-- | only process the .content file within the archive
content :: (MonadThrow io, MonadIO io) => ConduitT TarChunk o io ()
content :: ConduitT TarChunk o io ()
content = (FileInfo -> ConduitM ByteString o io ())
-> ConduitT TarChunk o io ()
forall (m :: * -> *) o.
MonadThrow m =>
(FileInfo -> ConduitM ByteString o m ())
-> ConduitM TarChunk o m ()
withFileInfo ((FileInfo -> ConduitM ByteString o io ())
 -> ConduitT TarChunk o io ())
-> (FileInfo -> ConduitM ByteString o io ())
-> ConduitT TarChunk o io ()
forall a b. (a -> b) -> a -> b
$ \FileInfo
fi ->
  Bool -> ConduitM ByteString o io () -> ConduitM ByteString o io ()
forall (f :: * -> *). Applicative f => Bool -> f () -> f ()
when ((ShowS
takeExtension ShowS -> (ByteString -> String) -> ByteString -> String
forall b c a. (b -> c) -> (a -> b) -> a -> c
. ByteString -> String
unpack (ByteString -> String) -> ByteString -> String
forall a b. (a -> b) -> a -> b
$ FileInfo -> ByteString
filePath FileInfo
fi) String -> String -> Bool
forall a. Eq a => a -> a -> Bool
== String
".content") (ConduitM ByteString o io () -> ConduitM ByteString o io ())
-> ConduitM ByteString o io () -> ConduitM ByteString o io ()
forall a b. (a -> b) -> a -> b
$
    ConduitT ByteString (Row Text) io ()
forall (m :: * -> *).
MonadThrow m =>
ConduitT ByteString (Row Text) m ()
parseTSV ConduitT ByteString (Row Text) io ()
-> ConduitM (Row Text) o io () -> ConduitM ByteString o io ()
forall (m :: * -> *) a b c r.
Monad m =>
ConduitM a b m () -> ConduitM b c m r -> ConduitM a c m r
.|
    (Row Text -> Text) -> ConduitT (Row Text) Text io ()
forall (m :: * -> *) a b. Monad m => (a -> b) -> ConduitT a b m ()
C.map Row Text -> Text
T.unwords ConduitT (Row Text) Text io ()
-> ConduitM Text o io () -> ConduitM (Row Text) o io ()
forall (m :: * -> *) a b c r.
Monad m =>
ConduitM a b m () -> ConduitM b c m r -> ConduitM a c m r
.|
    (Text -> Either (ParseErrorBundle Text Void) ContentRow)
-> ConduitT
     Text (Either (ParseErrorBundle Text Void) ContentRow) io ()
forall (m :: * -> *) a b. Monad m => (a -> b) -> ConduitT a b m ()
C.map (Parsec Void Text ContentRow
-> String -> Text -> Either (ParseErrorBundle Text Void) ContentRow
forall e s a.
Parsec e s a -> String -> s -> Either (ParseErrorBundle s e) a
parse Parsec Void Text ContentRow
contentRowP String
"") ConduitT
  Text (Either (ParseErrorBundle Text Void) ContentRow) io ()
-> ConduitM
     (Either (ParseErrorBundle Text Void) ContentRow) o io ()
-> ConduitM Text o io ()
forall (m :: * -> *) a b c r.
Monad m =>
ConduitM a b m () -> ConduitM b c m r -> ConduitM a c m r
.|
    ConduitM (Either (ParseErrorBundle Text Void) ContentRow) o io ()
forall a (m :: * -> *) o. (Show a, MonadIO m) => ConduitT a o m ()
C.print

contentToFile :: (MonadThrow m, MonadResource m) =>
                 FileInfo -> ConduitT ByteString c m ()
contentToFile :: FileInfo -> ConduitT ByteString c m ()
contentToFile FileInfo
fi = Bool -> ConduitT ByteString c m () -> ConduitT ByteString c m ()
forall (f :: * -> *). Applicative f => Bool -> f () -> f ()
when ((ShowS
takeExtension ShowS -> (ByteString -> String) -> ByteString -> String
forall b c a. (b -> c) -> (a -> b) -> a -> c
. ByteString -> String
unpack (ByteString -> String) -> ByteString -> String
forall a b. (a -> b) -> a -> b
$ FileInfo -> ByteString
filePath FileInfo
fi) String -> String -> Bool
forall a. Eq a => a -> a -> Bool
== String
".content") (ConduitT ByteString c m () -> ConduitT ByteString c m ())
-> ConduitT ByteString c m () -> ConduitT ByteString c m ()
forall a b. (a -> b) -> a -> b
$ do
  ConduitT ByteString (Row Text) m ()
forall (m :: * -> *).
MonadThrow m =>
ConduitT ByteString (Row Text) m ()
parseTSV ConduitT ByteString (Row Text) m ()
-> ConduitM (Row Text) c m () -> ConduitT ByteString c m ()
forall (m :: * -> *) a b c r.
Monad m =>
ConduitM a b m () -> ConduitM b c m r -> ConduitM a c m r
.|
    (Row Text -> Text) -> ConduitT (Row Text) Text m ()
forall (m :: * -> *) a b. Monad m => (a -> b) -> ConduitT a b m ()
C.map Row Text -> Text
T.unwords ConduitT (Row Text) Text m ()
-> ConduitM Text c m () -> ConduitM (Row Text) c m ()
forall (m :: * -> *) a b c r.
Monad m =>
ConduitM a b m () -> ConduitM b c m r -> ConduitM a c m r
.|
    (Text -> ContentRow) -> ConduitT Text ContentRow m ()
forall (m :: * -> *) a b. Monad m => (a -> b) -> ConduitT a b m ()
C.map ( \Text
r -> case Parsec Void Text ContentRow
-> String -> Text -> Either (ParseErrorBundle Text Void) ContentRow
forall e s a.
Parsec e s a -> String -> s -> Either (ParseErrorBundle s e) a
parse Parsec Void Text ContentRow
contentRowP String
"" Text
r of
              Left ParseErrorBundle Text Void
e -> String -> ContentRow
forall a. HasCallStack => String -> a
error (String -> ContentRow) -> String -> ContentRow
forall a b. (a -> b) -> a -> b
$ ParseErrorBundle Text Void -> String
forall s e.
(VisualStream s, TraversableStream s, ShowErrorComponent e) =>
ParseErrorBundle s e -> String
errorBundlePretty ParseErrorBundle Text Void
e
              Right ContentRow
x -> ContentRow
x ) ConduitT Text ContentRow m ()
-> ConduitM ContentRow c m () -> ConduitM Text c m ()
forall (m :: * -> *) a b c r.
Monad m =>
ConduitM a b m () -> ConduitM b c m r -> ConduitM a c m r
.|
    ConduitT ContentRow ByteString m ()
forall b (m :: * -> *).
(Binary b, MonadThrow m) =>
ConduitT b ByteString m ()
CB.conduitEncode ConduitT ContentRow ByteString m ()
-> ConduitT ByteString c m () -> ConduitM ContentRow c m ()
forall (m :: * -> *) a b c r.
Monad m =>
ConduitM a b m () -> ConduitM b c m r -> ConduitM a c m r
.|
    String -> ConduitT ByteString c m ()
forall (m :: * -> *) o.
MonadResource m =>
String -> ConduitT ByteString o m ()
C.sinkFile String
"assets/citeseer/content-z"

contentFromFile :: (MonadResource m, MonadThrow m) => ConduitT i ContentRow m ()
contentFromFile :: ConduitT i ContentRow m ()
contentFromFile =
  String -> ConduitT i ByteString m ()
forall (m :: * -> *) i.
MonadResource m =>
String -> ConduitT i ByteString m ()
C.sourceFile String
"assets/citeseer/content-z" ConduitT i ByteString m ()
-> ConduitM ByteString ContentRow m ()
-> ConduitT i ContentRow m ()
forall (m :: * -> *) a b c r.
Monad m =>
ConduitM a b m () -> ConduitM b c m r -> ConduitM a c m r
.|
  ConduitM ByteString ContentRow m ()
forall b (m :: * -> *).
(Binary b, MonadThrow m) =>
ConduitT ByteString b m ()
CB.conduitDecode

-- | Dataset row of the .content file
data ContentRow = CRow {
  ContentRow -> String
crId :: String -- ^ identifier
  , ContentRow -> Seq Int16
crFeatures :: Seq Int16 -- ^ features, in sparse format (without the zeros)
  , ContentRow -> DocClass
crClass :: DocClass
                   } deriving (ContentRow -> ContentRow -> Bool
(ContentRow -> ContentRow -> Bool)
-> (ContentRow -> ContentRow -> Bool) -> Eq ContentRow
forall a. (a -> a -> Bool) -> (a -> a -> Bool) -> Eq a
/= :: ContentRow -> ContentRow -> Bool
$c/= :: ContentRow -> ContentRow -> Bool
== :: ContentRow -> ContentRow -> Bool
$c== :: ContentRow -> ContentRow -> Bool
Eq, Int -> ContentRow -> ShowS
[ContentRow] -> ShowS
ContentRow -> String
(Int -> ContentRow -> ShowS)
-> (ContentRow -> String)
-> ([ContentRow] -> ShowS)
-> Show ContentRow
forall a.
(Int -> a -> ShowS) -> (a -> String) -> ([a] -> ShowS) -> Show a
showList :: [ContentRow] -> ShowS
$cshowList :: [ContentRow] -> ShowS
show :: ContentRow -> String
$cshow :: ContentRow -> String
showsPrec :: Int -> ContentRow -> ShowS
$cshowsPrec :: Int -> ContentRow -> ShowS
Show, (forall x. ContentRow -> Rep ContentRow x)
-> (forall x. Rep ContentRow x -> ContentRow) -> Generic ContentRow
forall x. Rep ContentRow x -> ContentRow
forall x. ContentRow -> Rep ContentRow x
forall a.
(forall x. a -> Rep a x) -> (forall x. Rep a x -> a) -> Generic a
$cto :: forall x. Rep ContentRow x -> ContentRow
$cfrom :: forall x. ContentRow -> Rep ContentRow x
Generic, Get ContentRow
[ContentRow] -> Put
ContentRow -> Put
(ContentRow -> Put)
-> Get ContentRow -> ([ContentRow] -> Put) -> Binary ContentRow
forall t. (t -> Put) -> Get t -> ([t] -> Put) -> Binary t
putList :: [ContentRow] -> Put
$cputList :: [ContentRow] -> Put
get :: Get ContentRow
$cget :: Get ContentRow
put :: ContentRow -> Put
$cput :: ContentRow -> Put
Binary)

bit :: Parser Bool
bit :: Parser Bool
bit = (Token Text -> ParsecT Void Text Identity (Token Text)
forall e s (m :: * -> *).
(MonadParsec e s m, Token s ~ Char) =>
Token s -> m (Token s)
char Char
Token Text
'0' ParsecT Void Text Identity Char -> Bool -> Parser Bool
forall (f :: * -> *) a b. Functor f => f a -> b -> f b
$> Bool
False) Parser Bool -> Parser Bool -> Parser Bool
forall (f :: * -> *) a. Alternative f => f a -> f a -> f a
<|> (Token Text -> ParsecT Void Text Identity (Token Text)
forall e s (m :: * -> *).
(MonadParsec e s m, Token s ~ Char) =>
Token s -> m (Token s)
char Char
Token Text
'1' ParsecT Void Text Identity Char -> Bool -> Parser Bool
forall (f :: * -> *) a b. Functor f => f a -> b -> f b
$> Bool
True)

sparse :: Foldable t => t Bool -> Seq Int16
sparse :: t Bool -> Seq Int16
sparse = (Seq Int16, Int16) -> Seq Int16
forall a b. (a, b) -> a
fst ((Seq Int16, Int16) -> Seq Int16)
-> (t Bool -> (Seq Int16, Int16)) -> t Bool -> Seq Int16
forall b c a. (b -> c) -> (a -> b) -> a -> c
. ((Seq Int16, Int16) -> Bool -> (Seq Int16, Int16))
-> (Seq Int16, Int16) -> t Bool -> (Seq Int16, Int16)
forall (t :: * -> *) b a.
Foldable t =>
(b -> a -> b) -> b -> t a -> b
foldl (\(Seq Int16
acc, Int16
i) Bool
b -> if Bool
b then (Seq Int16
acc Seq Int16 -> Int16 -> Seq Int16
forall a. Seq a -> a -> Seq a
|> Int16
i, Int16 -> Int16
forall a. Enum a => a -> a
succ Int16
i) else (Seq Int16
acc, Int16 -> Int16
forall a. Enum a => a -> a
succ Int16
i)) (Seq Int16
forall a. Monoid a => a
mempty, Int16
0)

contentRowP :: Parser ContentRow
contentRowP :: Parsec Void Text ContentRow
contentRowP = do
  String
i <- Parser String -> Parser String
forall a. Parser a -> Parser a
lexeme Parser String
alphaNum
  let n :: Int
n = Int
3703
  [Bool]
foh <- Int -> Parser Bool -> ParsecT Void Text Identity [Bool]
forall (m :: * -> *) a. Monad m => Int -> m a -> m [a]
count Int
n (Parser Bool -> Parser Bool
forall a. Parser a -> Parser a
lexeme Parser Bool
bit) -- one-hot encoded features
  let feats :: Seq Int16
feats = [Bool] -> Seq Int16
forall (t :: * -> *). Foldable t => t Bool -> Seq Int16
sparse [Bool]
foh
  DocClass
c <- Parser DocClass -> Parser DocClass
forall a. Parser a -> Parser a
lexeme Parser DocClass
docClassP
  ContentRow -> Parsec Void Text ContentRow
forall (f :: * -> *) a. Applicative f => a -> f a
pure (ContentRow -> Parsec Void Text ContentRow)
-> ContentRow -> Parsec Void Text ContentRow
forall a b. (a -> b) -> a -> b
$ String -> Seq Int16 -> DocClass -> ContentRow
CRow String
i Seq Int16
feats DocClass
c





{-
The .cites file contains the citation graph of the corpus. Each line describes a link in the following format:

		<ID of cited paper> <ID of citing paper>

Each line contains two paper IDs. The first entry is the ID of the paper being cited and the second ID stands for the paper which contains the citation. The direction of the link is from right to left. If a line is represented by "paper1 paper2" then the link is "paper2->paper1". 
-}
-- | only process the .cites file within the archive

citesToFile :: (MonadThrow m, MonadIO m, MonadResource m) =>
               FileInfo -> ConduitT ByteString c m ()
citesToFile :: FileInfo -> ConduitT ByteString c m ()
citesToFile FileInfo
fi = do
  let fpath :: String
fpath = ByteString -> String
unpack (ByteString -> String) -> ByteString -> String
forall a b. (a -> b) -> a -> b
$ FileInfo -> ByteString
filePath FileInfo
fi
  Bool -> ConduitT ByteString c m () -> ConduitT ByteString c m ()
forall (f :: * -> *). Applicative f => Bool -> f () -> f ()
when (ShowS
takeExtension String
fpath String -> String -> Bool
forall a. Eq a => a -> a -> Bool
== String
".cites") (ConduitT ByteString c m () -> ConduitT ByteString c m ())
-> ConduitT ByteString c m () -> ConduitT ByteString c m ()
forall a b. (a -> b) -> a -> b
$
    ConduitT ByteString (Row Text) m ()
forall (m :: * -> *).
MonadThrow m =>
ConduitT ByteString (Row Text) m ()
parseTSV ConduitT ByteString (Row Text) m ()
-> ConduitM (Row Text) c m () -> ConduitT ByteString c m ()
forall (m :: * -> *) a b c r.
Monad m =>
ConduitM a b m () -> ConduitM b c m r -> ConduitM a c m r
.|
    (Row Text -> Text) -> ConduitT (Row Text) Text m ()
forall (m :: * -> *) a b. Monad m => (a -> b) -> ConduitT a b m ()
C.map Row Text -> Text
T.unwords ConduitT (Row Text) Text m ()
-> ConduitM Text c m () -> ConduitM (Row Text) c m ()
forall (m :: * -> *) a b c r.
Monad m =>
ConduitM a b m () -> ConduitM b c m r -> ConduitM a c m r
.|
    (Text -> CitesRow String) -> ConduitT Text (CitesRow String) m ()
forall (m :: * -> *) a b. Monad m => (a -> b) -> ConduitT a b m ()
C.map ( \Text
r -> case Parsec Void Text (CitesRow String)
-> String
-> Text
-> Either (ParseErrorBundle Text Void) (CitesRow String)
forall e s a.
Parsec e s a -> String -> s -> Either (ParseErrorBundle s e) a
parse Parsec Void Text (CitesRow String)
citesRowP String
"" Text
r of
              Left ParseErrorBundle Text Void
e -> String -> CitesRow String
forall a. HasCallStack => String -> a
error (String -> CitesRow String) -> String -> CitesRow String
forall a b. (a -> b) -> a -> b
$ ParseErrorBundle Text Void -> String
forall s e.
(VisualStream s, TraversableStream s, ShowErrorComponent e) =>
ParseErrorBundle s e -> String
errorBundlePretty ParseErrorBundle Text Void
e
              Right CitesRow String
x -> CitesRow String
x ) ConduitT Text (CitesRow String) m ()
-> ConduitM (CitesRow String) c m () -> ConduitM Text c m ()
forall (m :: * -> *) a b c r.
Monad m =>
ConduitM a b m () -> ConduitM b c m r -> ConduitM a c m r
.|
    ConduitT (CitesRow String) ByteString m ()
forall b (m :: * -> *).
(Binary b, MonadThrow m) =>
ConduitT b ByteString m ()
CB.conduitEncode ConduitT (CitesRow String) ByteString m ()
-> ConduitT ByteString c m () -> ConduitM (CitesRow String) c m ()
forall (m :: * -> *) a b c r.
Monad m =>
ConduitM a b m () -> ConduitM b c m r -> ConduitM a c m r
.|
    String -> ConduitT ByteString c m ()
forall (m :: * -> *) o.
MonadResource m =>
String -> ConduitT ByteString o m ()
C.sinkFile String
"assets/citeseer/cites"

citesFromFile :: (MonadResource m, MonadThrow m) => ConduitT i (CitesRow String) m ()
citesFromFile :: ConduitT i (CitesRow String) m ()
citesFromFile =
  String -> ConduitT i ByteString m ()
forall (m :: * -> *) i.
MonadResource m =>
String -> ConduitT i ByteString m ()
C.sourceFile String
"assets/citeseer/cites" ConduitT i ByteString m ()
-> ConduitM ByteString (CitesRow String) m ()
-> ConduitT i (CitesRow String) m ()
forall (m :: * -> *) a b c r.
Monad m =>
ConduitM a b m () -> ConduitM b c m r -> ConduitM a c m r
.|
  ConduitM ByteString (CitesRow String) m ()
forall b (m :: * -> *).
(Binary b, MonadThrow m) =>
ConduitT ByteString b m ()
CB.conduitDecode

-- | Reconstruct the citation graph
--
-- NB : relies on the user having `stash`ed the dataset to local disk first.
citeseerGraph :: IO (G.Graph ContentRow)
citeseerGraph :: IO (Graph ContentRow)
citeseerGraph = do
  Map String (Seq Int16, DocClass)
mm <- IO (Map String (Seq Int16, DocClass))
restoreContent
  ResourceT IO (Graph ContentRow) -> IO (Graph ContentRow)
forall (m :: * -> *) a. MonadUnliftIO m => ResourceT m a -> m a
runResourceT (ResourceT IO (Graph ContentRow) -> IO (Graph ContentRow))
-> ResourceT IO (Graph ContentRow) -> IO (Graph ContentRow)
forall a b. (a -> b) -> a -> b
$ ConduitT () Void (ResourceT IO) (Graph ContentRow)
-> ResourceT IO (Graph ContentRow)
forall (m :: * -> *) r. Monad m => ConduitT () Void m r -> m r
runConduit (ConduitT () Void (ResourceT IO) (Graph ContentRow)
 -> ResourceT IO (Graph ContentRow))
-> ConduitT () Void (ResourceT IO) (Graph ContentRow)
-> ResourceT IO (Graph ContentRow)
forall a b. (a -> b) -> a -> b
$
    ConduitT () (CitesRow String) (ResourceT IO) ()
forall (m :: * -> *) i.
(MonadResource m, MonadThrow m) =>
ConduitT i (CitesRow String) m ()
citesFromFile ConduitT () (CitesRow String) (ResourceT IO) ()
-> ConduitM
     (CitesRow String) Void (ResourceT IO) (Graph ContentRow)
-> ConduitT () Void (ResourceT IO) (Graph ContentRow)
forall (m :: * -> *) a b c r.
Monad m =>
ConduitM a b m () -> ConduitM b c m r -> ConduitM a c m r
.|
    (Graph ContentRow -> CitesRow String -> Graph ContentRow)
-> Graph ContentRow
-> ConduitM
     (CitesRow String) Void (ResourceT IO) (Graph ContentRow)
forall (m :: * -> *) a b o.
Monad m =>
(a -> b -> a) -> a -> ConduitT b o m a
C.foldl (\Graph ContentRow
gr (CitesRow String
b String
a) ->
               let
                 edm :: Maybe ((Seq Int16, DocClass), (Seq Int16, DocClass))
edm = (,) ((Seq Int16, DocClass)
 -> (Seq Int16, DocClass)
 -> ((Seq Int16, DocClass), (Seq Int16, DocClass)))
-> Maybe (Seq Int16, DocClass)
-> Maybe
     ((Seq Int16, DocClass)
      -> ((Seq Int16, DocClass), (Seq Int16, DocClass)))
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> String
-> Map String (Seq Int16, DocClass) -> Maybe (Seq Int16, DocClass)
forall k a. Ord k => k -> Map k a -> Maybe a
M.lookup String
a Map String (Seq Int16, DocClass)
mm Maybe
  ((Seq Int16, DocClass)
   -> ((Seq Int16, DocClass), (Seq Int16, DocClass)))
-> Maybe (Seq Int16, DocClass)
-> Maybe ((Seq Int16, DocClass), (Seq Int16, DocClass))
forall (f :: * -> *) a b. Applicative f => f (a -> b) -> f a -> f b
<*> String
-> Map String (Seq Int16, DocClass) -> Maybe (Seq Int16, DocClass)
forall k a. Ord k => k -> Map k a -> Maybe a
M.lookup String
b Map String (Seq Int16, DocClass)
mm
               in
                 case Maybe ((Seq Int16, DocClass), (Seq Int16, DocClass))
edm of
                   Maybe ((Seq Int16, DocClass), (Seq Int16, DocClass))
Nothing -> Graph ContentRow
gr -- error $ show e
                   Just ((Seq Int16
bffs, DocClass
bc), (Seq Int16
affs, DocClass
ac)) ->
                     let
                       acr :: ContentRow
acr = String -> Seq Int16 -> DocClass -> ContentRow
CRow String
a Seq Int16
affs DocClass
ac
                       bcr :: ContentRow
bcr = String -> Seq Int16 -> DocClass -> ContentRow
CRow String
b Seq Int16
bffs DocClass
bc
                     in
                       (ContentRow
acr ContentRow -> ContentRow -> Graph ContentRow
forall a. a -> a -> Graph a
`G.edge` ContentRow
bcr) Graph ContentRow -> Graph ContentRow -> Graph ContentRow
forall a. Graph a -> Graph a -> Graph a
`G.overlay` Graph ContentRow
gr
                ) Graph ContentRow
forall a. Graph a
G.empty

data CitesRow a = CitesRow { CitesRow a -> a
cirTo :: a, CitesRow a -> a
cirFrom :: a } deriving (CitesRow a -> CitesRow a -> Bool
(CitesRow a -> CitesRow a -> Bool)
-> (CitesRow a -> CitesRow a -> Bool) -> Eq (CitesRow a)
forall a. Eq a => CitesRow a -> CitesRow a -> Bool
forall a. (a -> a -> Bool) -> (a -> a -> Bool) -> Eq a
/= :: CitesRow a -> CitesRow a -> Bool
$c/= :: forall a. Eq a => CitesRow a -> CitesRow a -> Bool
== :: CitesRow a -> CitesRow a -> Bool
$c== :: forall a. Eq a => CitesRow a -> CitesRow a -> Bool
Eq, Int -> CitesRow a -> ShowS
[CitesRow a] -> ShowS
CitesRow a -> String
(Int -> CitesRow a -> ShowS)
-> (CitesRow a -> String)
-> ([CitesRow a] -> ShowS)
-> Show (CitesRow a)
forall a. Show a => Int -> CitesRow a -> ShowS
forall a. Show a => [CitesRow a] -> ShowS
forall a. Show a => CitesRow a -> String
forall a.
(Int -> a -> ShowS) -> (a -> String) -> ([a] -> ShowS) -> Show a
showList :: [CitesRow a] -> ShowS
$cshowList :: forall a. Show a => [CitesRow a] -> ShowS
show :: CitesRow a -> String
$cshow :: forall a. Show a => CitesRow a -> String
showsPrec :: Int -> CitesRow a -> ShowS
$cshowsPrec :: forall a. Show a => Int -> CitesRow a -> ShowS
Show, (forall x. CitesRow a -> Rep (CitesRow a) x)
-> (forall x. Rep (CitesRow a) x -> CitesRow a)
-> Generic (CitesRow a)
forall x. Rep (CitesRow a) x -> CitesRow a
forall x. CitesRow a -> Rep (CitesRow a) x
forall a.
(forall x. a -> Rep a x) -> (forall x. Rep a x -> a) -> Generic a
forall a x. Rep (CitesRow a) x -> CitesRow a
forall a x. CitesRow a -> Rep (CitesRow a) x
$cto :: forall a x. Rep (CitesRow a) x -> CitesRow a
$cfrom :: forall a x. CitesRow a -> Rep (CitesRow a) x
Generic, Get (CitesRow a)
[CitesRow a] -> Put
CitesRow a -> Put
(CitesRow a -> Put)
-> Get (CitesRow a) -> ([CitesRow a] -> Put) -> Binary (CitesRow a)
forall a. Binary a => Get (CitesRow a)
forall a. Binary a => [CitesRow a] -> Put
forall a. Binary a => CitesRow a -> Put
forall t. (t -> Put) -> Get t -> ([t] -> Put) -> Binary t
putList :: [CitesRow a] -> Put
$cputList :: forall a. Binary a => [CitesRow a] -> Put
get :: Get (CitesRow a)
$cget :: forall a. Binary a => Get (CitesRow a)
put :: CitesRow a -> Put
$cput :: forall a. Binary a => CitesRow a -> Put
Binary)

citesRowP :: Parser (CitesRow String)
citesRowP :: Parsec Void Text (CitesRow String)
citesRowP = String -> String -> CitesRow String
forall a. a -> a -> CitesRow a
CitesRow (String -> String -> CitesRow String)
-> Parser String
-> ParsecT Void Text Identity (String -> CitesRow String)
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> Parser String -> Parser String
forall a. Parser a -> Parser a
lexeme Parser String
alphaNum ParsecT Void Text Identity (String -> CitesRow String)
-> Parser String -> Parsec Void Text (CitesRow String)
forall (f :: * -> *) a b. Applicative f => f (a -> b) -> f a -> f b
<*> Parser String -> Parser String
forall a. Parser a -> Parser a
lexeme Parser String
alphaNum


-- test

-- -- | one row of the .content file
-- --
-- -- λ> content0
-- -- CRow {crId = "100157", crFeatures = fromList [36,46,65,215,261,565,1162,1508,1613,1641,1662,1797,1842,1988,2025,2399,2456,2521,2597,2618,2641,2902,3016,3050,3163,3268,3272,3287,3411,3447,3669], crClass = Agents}
-- content0 = do
--   t <- T.readFile "src/Algebra/Graph/IO/Datasets/LINQS/c0"
--   parseTest contentRowP t