{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE OverloadedStrings #-}
{-# language DeriveAnyClass #-}
{-# LANGUAGE DeriveGeneric #-}
-- | Link-based datasets from https://linqs.soe.ucsc.edu/data
{-# options_ghc -Wno-unused-imports -Wno-unused-top-binds #-}
module Algebra.Graph.IO.Datasets.LINQS (
  restoreContent, CitesRow(..), ContentRow(..), 
  -- * Internal
  stash, sourceGraphEdges, loadGraph
                                       ) where

import Control.Applicative (Alternative(..))
import Control.Monad (when)
import Control.Monad.IO.Class (MonadIO(..))
import Data.Functor (($>))
import GHC.Generics (Generic(..))
import GHC.Int (Int16)

-- algebraic-graphs
import qualified Algebra.Graph as G (Graph, empty, overlay, edge)
-- bytestring
import Data.ByteString (ByteString)
import Data.ByteString.Char8 (unpack)
-- binary
import Data.Binary (Binary(..), encode, decode, encodeFile, decodeFileOrFail)
-- binary-conduit
import qualified Data.Conduit.Serialization.Binary as CB (conduitDecode, conduitEncode, ParseError(..))
-- conduit
import Conduit (MonadUnliftIO(..), MonadResource, runResourceT)
import Data.Conduit (runConduit, ConduitT, (.|), yield, await)
import qualified Data.Conduit.Combinators as C (print, sourceFile, sinkFile, map, mapM, foldM, mapWhile, foldMap, foldl)
-- containers
import Data.Sequence (Seq, (|>))
import qualified Data.Map as M (Map, singleton, lookup)
-- directory
import System.Directory (createDirectoryIfMissing)
-- exceptions
import Control.Monad.Catch (MonadThrow(..))
-- filepath
import System.FilePath ((</>), takeFileName, takeExtension)
-- http-conduit
import Network.HTTP.Simple (httpSource, getResponseBody, Response, Request, parseRequest, setRequestMethod)
-- megaparsec
import Text.Megaparsec (parse)
import Text.Megaparsec.Char (char)
import Text.Megaparsec.Char.Lexer (decimal)
import Text.Megaparsec.Error (errorBundlePretty)
-- parser.combinators
import Control.Monad.Combinators (count)
-- primitive
import Control.Monad.Primitive (PrimMonad(..))
-- tar-conduit
import Data.Conduit.Tar (Header(..), untarChunks, TarChunk, withEntries, FileInfo, filePath, withFileInfo, headerFileType, FileType(..), headerFilePath)
-- text
import Data.Text (Text)
import qualified Data.Text as T (Text, unwords)
import qualified Data.Text.IO as T (readFile)

import Algebra.Graph.IO.Internal.Conduit (fetch, unTarGz)
import Algebra.Graph.IO.Internal.Megaparsec (Parser, ParseE, symbol, lexeme, alphaNum)
import Algebra.Graph.IO.SV (parseTSV)

{-
CiteSeer: The CiteSeer dataset consists of 3312 scientific publications classified into one of six classes. The citation network consists of 4732 links. Each publication in the dataset is described by a 0/1-valued word vector indicating the absence/presence of the corresponding word from the dictionary. The dictionary consists of 3703 unique words. The README file in the dataset provides more details.
http://www.cs.umd.edu/~sen/lbc-proj/data/citeseer.tgz

Cora: The Cora dataset consists of 2708 scientific publications classified into one of seven classes. The citation network consists of 5429 links. Each publication in the dataset is described by a 0/1-valued word vector indicating the absence/presence of the corresponding word from the dictionary. The dictionary consists of 1433 unique words. The README file in the dataset provides more details.
http://www.cs.umd.edu/~sen/lbc-proj/data/cora.tgz

WebKB: The WebKB dataset consists of 877 scientific publications classified into one of five classes. The citation network consists of 1608 links. Each publication in the dataset is described by a 0/1-valued word vector indicating the absence/presence of the corresponding word from the dictionary. The dictionary consists of 1703 unique words. The README file in the dataset provides more details.
http://www.cs.umd.edu/~sen/lbc-proj/data/WebKB.tgz

-}


-- | Download, decompress, parse, serialize and save the dataset to local storage
stash :: (Binary c) =>
         FilePath -- ^ directory where the data files will be saved
      -> String -- ^ URI of .tar.gz file
      -> Int -- ^ dictionary size
      -> Parser c -- ^ document class
      -> IO ()
stash :: FilePath -> FilePath -> Int -> Parser c -> IO ()
stash FilePath
dir FilePath
uri Int
n Parser c
pc = do
  Request
rq <- FilePath -> IO Request
forall (m :: * -> *). MonadThrow m => FilePath -> m Request
parseRequest FilePath
uri
  Bool -> FilePath -> IO ()
createDirectoryIfMissing Bool
True FilePath
dir
  ResourceT IO () -> IO ()
forall (m :: * -> *) a. MonadUnliftIO m => ResourceT m a -> m a
runResourceT (ResourceT IO () -> IO ()) -> ResourceT IO () -> IO ()
forall a b. (a -> b) -> a -> b
$ ConduitT () Void (ResourceT IO) () -> ResourceT IO ()
forall (m :: * -> *) r. Monad m => ConduitT () Void m r -> m r
runConduit (ConduitT () Void (ResourceT IO) () -> ResourceT IO ())
-> ConduitT () Void (ResourceT IO) () -> ResourceT IO ()
forall a b. (a -> b) -> a -> b
$
    Request -> ConduitT () ByteString (ResourceT IO) ()
forall (m :: * -> *) i.
MonadResource m =>
Request -> ConduitT i ByteString m ()
fetch Request
rq ConduitT () ByteString (ResourceT IO) ()
-> ConduitM ByteString Void (ResourceT IO) ()
-> ConduitT () Void (ResourceT IO) ()
forall (m :: * -> *) a b c r.
Monad m =>
ConduitM a b m () -> ConduitM b c m r -> ConduitM a c m r
.|
    ConduitT ByteString TarChunk (ResourceT IO) ()
forall (m :: * -> *).
(PrimMonad m, MonadThrow m) =>
ConduitT ByteString TarChunk m ()
unTarGz ConduitT ByteString TarChunk (ResourceT IO) ()
-> ConduitM TarChunk Void (ResourceT IO) ()
-> ConduitM ByteString Void (ResourceT IO) ()
forall (m :: * -> *) a b c r.
Monad m =>
ConduitM a b m () -> ConduitM b c m r -> ConduitM a c m r
.|
    (FileInfo -> ConduitM ByteString Void (ResourceT IO) ())
-> ConduitM TarChunk Void (ResourceT IO) ()
forall (m :: * -> *) o.
MonadThrow m =>
(FileInfo -> ConduitM ByteString o m ())
-> ConduitM TarChunk o m ()
withFileInfo ( \FileInfo
fi -> do
     FilePath
-> Int
-> Parser c
-> FileInfo
-> ConduitM ByteString Void (ResourceT IO) ()
forall (m :: * -> *) c o.
(MonadThrow m, MonadResource m, Binary c) =>
FilePath
-> Int -> Parser c -> FileInfo -> ConduitT ByteString o m ()
contentToFile FilePath
dir Int
n Parser c
pc FileInfo
fi
     FilePath -> FileInfo -> ConduitM ByteString Void (ResourceT IO) ()
forall (m :: * -> *) c.
(MonadThrow m, MonadIO m, MonadResource m) =>
FilePath -> FileInfo -> ConduitT ByteString c m ()
citesToFile FilePath
dir FileInfo
fi )

-- | Load the graph node data from local storage
restoreContent :: (Binary c) => FilePath -- ^ directory where the data files are saved
               -> IO (M.Map String (Seq Int16, c))
restoreContent :: FilePath -> IO (Map FilePath (Seq Int16, c))
restoreContent FilePath
dir = ResourceT IO (Map FilePath (Seq Int16, c))
-> IO (Map FilePath (Seq Int16, c))
forall (m :: * -> *) a. MonadUnliftIO m => ResourceT m a -> m a
runResourceT (ResourceT IO (Map FilePath (Seq Int16, c))
 -> IO (Map FilePath (Seq Int16, c)))
-> ResourceT IO (Map FilePath (Seq Int16, c))
-> IO (Map FilePath (Seq Int16, c))
forall a b. (a -> b) -> a -> b
$ ConduitT () Void (ResourceT IO) (Map FilePath (Seq Int16, c))
-> ResourceT IO (Map FilePath (Seq Int16, c))
forall (m :: * -> *) r. Monad m => ConduitT () Void m r -> m r
runConduit (ConduitT () Void (ResourceT IO) (Map FilePath (Seq Int16, c))
 -> ResourceT IO (Map FilePath (Seq Int16, c)))
-> ConduitT () Void (ResourceT IO) (Map FilePath (Seq Int16, c))
-> ResourceT IO (Map FilePath (Seq Int16, c))
forall a b. (a -> b) -> a -> b
$
  FilePath -> ConduitT () (ContentRow c) (ResourceT IO) ()
forall (m :: * -> *) c i.
(MonadResource m, MonadThrow m, Binary c) =>
FilePath -> ConduitT i (ContentRow c) m ()
contentFromFile FilePath
dir ConduitT () (ContentRow c) (ResourceT IO) ()
-> ConduitM
     (ContentRow c) Void (ResourceT IO) (Map FilePath (Seq Int16, c))
-> ConduitT () Void (ResourceT IO) (Map FilePath (Seq Int16, c))
forall (m :: * -> *) a b c r.
Monad m =>
ConduitM a b m () -> ConduitM b c m r -> ConduitM a c m r
.|
  (ContentRow c -> Map FilePath (Seq Int16, c))
-> ConduitM
     (ContentRow c) Void (ResourceT IO) (Map FilePath (Seq Int16, c))
forall (m :: * -> *) b a o.
(Monad m, Monoid b) =>
(a -> b) -> ConduitT a o m b
C.foldMap ( \(CRow FilePath
k Seq Int16
fs c
c) -> FilePath -> (Seq Int16, c) -> Map FilePath (Seq Int16, c)
forall k a. k -> a -> Map k a
M.singleton FilePath
k (Seq Int16
fs, c
c) )


citesFromFile :: (MonadResource m, MonadThrow m) => FilePath -> ConduitT i (CitesRow String) m ()
citesFromFile :: FilePath -> ConduitT i (CitesRow FilePath) m ()
citesFromFile FilePath
dir =
  FilePath -> ConduitT i ByteString m ()
forall (m :: * -> *) i.
MonadResource m =>
FilePath -> ConduitT i ByteString m ()
C.sourceFile (FilePath
dir FilePath -> FilePath -> FilePath
</> FilePath
"cites") ConduitT i ByteString m ()
-> ConduitM ByteString (CitesRow FilePath) m ()
-> ConduitT i (CitesRow FilePath) m ()
forall (m :: * -> *) a b c r.
Monad m =>
ConduitM a b m () -> ConduitM b c m r -> ConduitM a c m r
.|
  ConduitM ByteString (CitesRow FilePath) m ()
forall b (m :: * -> *).
(Binary b, MonadThrow m) =>
ConduitT ByteString b m ()
CB.conduitDecode

-- | Reconstruct the citation graph
--
-- NB : relies on the user having `stash`ed the dataset to local disk first.
loadGraph :: (Binary c) =>
                 FilePath -- ^ directory where the data files were saved
              -> IO (G.Graph (ContentRow c))
loadGraph :: FilePath -> IO (Graph (ContentRow c))
loadGraph FilePath
dir = do
  Map FilePath (Seq Int16, c)
mm <- FilePath -> IO (Map FilePath (Seq Int16, c))
forall c. Binary c => FilePath -> IO (Map FilePath (Seq Int16, c))
restoreContent FilePath
dir
  ResourceT IO (Graph (ContentRow c)) -> IO (Graph (ContentRow c))
forall (m :: * -> *) a. MonadUnliftIO m => ResourceT m a -> m a
runResourceT (ResourceT IO (Graph (ContentRow c)) -> IO (Graph (ContentRow c)))
-> ResourceT IO (Graph (ContentRow c)) -> IO (Graph (ContentRow c))
forall a b. (a -> b) -> a -> b
$ ConduitT () Void (ResourceT IO) (Graph (ContentRow c))
-> ResourceT IO (Graph (ContentRow c))
forall (m :: * -> *) r. Monad m => ConduitT () Void m r -> m r
runConduit (ConduitT () Void (ResourceT IO) (Graph (ContentRow c))
 -> ResourceT IO (Graph (ContentRow c)))
-> ConduitT () Void (ResourceT IO) (Graph (ContentRow c))
-> ResourceT IO (Graph (ContentRow c))
forall a b. (a -> b) -> a -> b
$
    FilePath -> ConduitT () (CitesRow FilePath) (ResourceT IO) ()
forall (m :: * -> *) i.
(MonadResource m, MonadThrow m) =>
FilePath -> ConduitT i (CitesRow FilePath) m ()
citesFromFile FilePath
dir ConduitT () (CitesRow FilePath) (ResourceT IO) ()
-> ConduitM
     (CitesRow FilePath) Void (ResourceT IO) (Graph (ContentRow c))
-> ConduitT () Void (ResourceT IO) (Graph (ContentRow c))
forall (m :: * -> *) a b c r.
Monad m =>
ConduitM a b m () -> ConduitM b c m r -> ConduitM a c m r
.|
    (Graph (ContentRow c) -> CitesRow FilePath -> Graph (ContentRow c))
-> Graph (ContentRow c)
-> ConduitM
     (CitesRow FilePath) Void (ResourceT IO) (Graph (ContentRow c))
forall (m :: * -> *) a b o.
Monad m =>
(a -> b -> a) -> a -> ConduitT b o m a
C.foldl (\Graph (ContentRow c)
gr (CitesRow FilePath
b FilePath
a) ->
               let
                 edm :: Maybe ((Seq Int16, c), (Seq Int16, c))
edm = (,) ((Seq Int16, c)
 -> (Seq Int16, c) -> ((Seq Int16, c), (Seq Int16, c)))
-> Maybe (Seq Int16, c)
-> Maybe ((Seq Int16, c) -> ((Seq Int16, c), (Seq Int16, c)))
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> FilePath -> Map FilePath (Seq Int16, c) -> Maybe (Seq Int16, c)
forall k a. Ord k => k -> Map k a -> Maybe a
M.lookup FilePath
a Map FilePath (Seq Int16, c)
mm Maybe ((Seq Int16, c) -> ((Seq Int16, c), (Seq Int16, c)))
-> Maybe (Seq Int16, c) -> Maybe ((Seq Int16, c), (Seq Int16, c))
forall (f :: * -> *) a b. Applicative f => f (a -> b) -> f a -> f b
<*> FilePath -> Map FilePath (Seq Int16, c) -> Maybe (Seq Int16, c)
forall k a. Ord k => k -> Map k a -> Maybe a
M.lookup FilePath
b Map FilePath (Seq Int16, c)
mm
               in
                 case Maybe ((Seq Int16, c), (Seq Int16, c))
edm of
                   Maybe ((Seq Int16, c), (Seq Int16, c))
Nothing -> Graph (ContentRow c)
gr -- error $ show e
                   Just ((Seq Int16
bffs, c
bc), (Seq Int16
affs, c
ac)) ->
                     let
                       acr :: ContentRow c
acr = FilePath -> Seq Int16 -> c -> ContentRow c
forall c. FilePath -> Seq Int16 -> c -> ContentRow c
CRow FilePath
a Seq Int16
affs c
ac
                       bcr :: ContentRow c
bcr = FilePath -> Seq Int16 -> c -> ContentRow c
forall c. FilePath -> Seq Int16 -> c -> ContentRow c
CRow FilePath
b Seq Int16
bffs c
bc
                     in
                       (ContentRow c
acr ContentRow c -> ContentRow c -> Graph (ContentRow c)
forall a. a -> a -> Graph a
`G.edge` ContentRow c
bcr) Graph (ContentRow c)
-> Graph (ContentRow c) -> Graph (ContentRow c)
forall a. Graph a -> Graph a -> Graph a
`G.overlay` Graph (ContentRow c)
gr
                ) Graph (ContentRow c)
forall a. Graph a
G.empty

-- | Stream out the edges of the citation graph, in which the nodes are decorated with the document metadata.
--
-- The full citation graph can be reconstructed by folding over this stream and `G.overlay`ing the graph edges as they arrive.
--
-- This way the graph can be partitioned in training , test and validation subsets at the usage site
sourceGraphEdges :: (MonadResource m, MonadThrow m) =>
                      FilePath -- ^ directory of data files
                   -> M.Map String (Seq Int16, c) -- ^ 'content' data
                   -> ConduitT i (Maybe (G.Graph (ContentRow c))) m ()
sourceGraphEdges :: FilePath
-> Map FilePath (Seq Int16, c)
-> ConduitT i (Maybe (Graph (ContentRow c))) m ()
sourceGraphEdges FilePath
dir Map FilePath (Seq Int16, c)
mm =
    FilePath -> ConduitT i (CitesRow FilePath) m ()
forall (m :: * -> *) i.
(MonadResource m, MonadThrow m) =>
FilePath -> ConduitT i (CitesRow FilePath) m ()
citesFromFile FilePath
dir ConduitT i (CitesRow FilePath) m ()
-> ConduitM (CitesRow FilePath) (Maybe (Graph (ContentRow c))) m ()
-> ConduitT i (Maybe (Graph (ContentRow c))) m ()
forall (m :: * -> *) a b c r.
Monad m =>
ConduitM a b m () -> ConduitM b c m r -> ConduitM a c m r
.|
    (CitesRow FilePath -> Maybe (Graph (ContentRow c)))
-> ConduitM (CitesRow FilePath) (Maybe (Graph (ContentRow c))) m ()
forall (m :: * -> *) a b. Monad m => (a -> b) -> ConduitT a b m ()
C.map (\(CitesRow FilePath
b FilePath
a) ->
             case (,) ((Seq Int16, c)
 -> (Seq Int16, c) -> ((Seq Int16, c), (Seq Int16, c)))
-> Maybe (Seq Int16, c)
-> Maybe ((Seq Int16, c) -> ((Seq Int16, c), (Seq Int16, c)))
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> FilePath -> Map FilePath (Seq Int16, c) -> Maybe (Seq Int16, c)
forall k a. Ord k => k -> Map k a -> Maybe a
M.lookup FilePath
a Map FilePath (Seq Int16, c)
mm Maybe ((Seq Int16, c) -> ((Seq Int16, c), (Seq Int16, c)))
-> Maybe (Seq Int16, c) -> Maybe ((Seq Int16, c), (Seq Int16, c))
forall (f :: * -> *) a b. Applicative f => f (a -> b) -> f a -> f b
<*> FilePath -> Map FilePath (Seq Int16, c) -> Maybe (Seq Int16, c)
forall k a. Ord k => k -> Map k a -> Maybe a
M.lookup FilePath
b Map FilePath (Seq Int16, c)
mm of
               Maybe ((Seq Int16, c), (Seq Int16, c))
Nothing -> Maybe (Graph (ContentRow c))
forall a. Maybe a
Nothing
               Just ((Seq Int16
bffs, c
bc), (Seq Int16
affs, c
ac)) ->
                 let
                       acr :: ContentRow c
acr = FilePath -> Seq Int16 -> c -> ContentRow c
forall c. FilePath -> Seq Int16 -> c -> ContentRow c
CRow FilePath
a Seq Int16
affs c
ac
                       bcr :: ContentRow c
bcr = FilePath -> Seq Int16 -> c -> ContentRow c
forall c. FilePath -> Seq Int16 -> c -> ContentRow c
CRow FilePath
b Seq Int16
bffs c
bc
                 in Graph (ContentRow c) -> Maybe (Graph (ContentRow c))
forall a. a -> Maybe a
Just (ContentRow c
acr ContentRow c -> ContentRow c -> Graph (ContentRow c)
forall a. a -> a -> Graph a
`G.edge` ContentRow c
bcr))







contentToFile :: (MonadThrow m, MonadResource m, Binary c) =>
                 FilePath
              -> Int -- ^ dictionary size
              -> Parser c -- ^ document class
              -> FileInfo
              -> ConduitT ByteString o m ()
contentToFile :: FilePath
-> Int -> Parser c -> FileInfo -> ConduitT ByteString o m ()
contentToFile FilePath
dir Int
n Parser c
pc FileInfo
fi = Bool -> ConduitT ByteString o m () -> ConduitT ByteString o m ()
forall (f :: * -> *). Applicative f => Bool -> f () -> f ()
when ((FilePath -> FilePath
takeExtension (FilePath -> FilePath)
-> (ByteString -> FilePath) -> ByteString -> FilePath
forall b c a. (b -> c) -> (a -> b) -> a -> c
. ByteString -> FilePath
unpack (ByteString -> FilePath) -> ByteString -> FilePath
forall a b. (a -> b) -> a -> b
$ FileInfo -> ByteString
filePath FileInfo
fi) FilePath -> FilePath -> Bool
forall a. Eq a => a -> a -> Bool
== FilePath
".content") (ConduitT ByteString o m () -> ConduitT ByteString o m ())
-> ConduitT ByteString o m () -> ConduitT ByteString o m ()
forall a b. (a -> b) -> a -> b
$ do
  ConduitT ByteString (Row Text) m ()
forall (m :: * -> *).
MonadThrow m =>
ConduitT ByteString (Row Text) m ()
parseTSV ConduitT ByteString (Row Text) m ()
-> ConduitM (Row Text) o m () -> ConduitT ByteString o m ()
forall (m :: * -> *) a b c r.
Monad m =>
ConduitM a b m () -> ConduitM b c m r -> ConduitM a c m r
.|
    (Row Text -> Text) -> ConduitT (Row Text) Text m ()
forall (m :: * -> *) a b. Monad m => (a -> b) -> ConduitT a b m ()
C.map Row Text -> Text
T.unwords ConduitT (Row Text) Text m ()
-> ConduitM Text o m () -> ConduitM (Row Text) o m ()
forall (m :: * -> *) a b c r.
Monad m =>
ConduitM a b m () -> ConduitM b c m r -> ConduitM a c m r
.|
    (Text -> ContentRow c) -> ConduitT Text (ContentRow c) m ()
forall (m :: * -> *) a b. Monad m => (a -> b) -> ConduitT a b m ()
C.map ( \Text
r -> case Parsec Void Text (ContentRow c)
-> FilePath
-> Text
-> Either (ParseErrorBundle Text Void) (ContentRow c)
forall e s a.
Parsec e s a -> FilePath -> s -> Either (ParseErrorBundle s e) a
parse (Int -> Parser c -> Parsec Void Text (ContentRow c)
forall c. Int -> Parser c -> Parser (ContentRow c)
contentRowP Int
n Parser c
pc) FilePath
"" Text
r of
              Left ParseErrorBundle Text Void
e -> FilePath -> ContentRow c
forall a. HasCallStack => FilePath -> a
error (FilePath -> ContentRow c) -> FilePath -> ContentRow c
forall a b. (a -> b) -> a -> b
$ ParseErrorBundle Text Void -> FilePath
forall s e.
(VisualStream s, TraversableStream s, ShowErrorComponent e) =>
ParseErrorBundle s e -> FilePath
errorBundlePretty ParseErrorBundle Text Void
e
              Right ContentRow c
x -> ContentRow c
x ) ConduitT Text (ContentRow c) m ()
-> ConduitM (ContentRow c) o m () -> ConduitM Text o m ()
forall (m :: * -> *) a b c r.
Monad m =>
ConduitM a b m () -> ConduitM b c m r -> ConduitM a c m r
.|
    ConduitT (ContentRow c) ByteString m ()
forall b (m :: * -> *).
(Binary b, MonadThrow m) =>
ConduitT b ByteString m ()
CB.conduitEncode ConduitT (ContentRow c) ByteString m ()
-> ConduitT ByteString o m () -> ConduitM (ContentRow c) o m ()
forall (m :: * -> *) a b c r.
Monad m =>
ConduitM a b m () -> ConduitM b c m r -> ConduitM a c m r
.|
    FilePath -> ConduitT ByteString o m ()
forall (m :: * -> *) o.
MonadResource m =>
FilePath -> ConduitT ByteString o m ()
C.sinkFile (FilePath
dir FilePath -> FilePath -> FilePath
</> FilePath
"content-z")

contentFromFile :: (MonadResource m, MonadThrow m, Binary c) => FilePath
                -> ConduitT i (ContentRow c) m ()
contentFromFile :: FilePath -> ConduitT i (ContentRow c) m ()
contentFromFile FilePath
dir =
  FilePath -> ConduitT i ByteString m ()
forall (m :: * -> *) i.
MonadResource m =>
FilePath -> ConduitT i ByteString m ()
C.sourceFile (FilePath
dir FilePath -> FilePath -> FilePath
</> FilePath
"content-z") ConduitT i ByteString m ()
-> ConduitM ByteString (ContentRow c) m ()
-> ConduitT i (ContentRow c) m ()
forall (m :: * -> *) a b c r.
Monad m =>
ConduitM a b m () -> ConduitM b c m r -> ConduitM a c m r
.|
  ConduitM ByteString (ContentRow c) m ()
forall b (m :: * -> *).
(Binary b, MonadThrow m) =>
ConduitT ByteString b m ()
CB.conduitDecode


-- | Who cites whom
data CitesRow a = CitesRow {
  CitesRow a -> a
cirTo :: a -- ^ cited
  , CitesRow a -> a
cirFrom :: a -- ^ citing
  } deriving (CitesRow a -> CitesRow a -> Bool
(CitesRow a -> CitesRow a -> Bool)
-> (CitesRow a -> CitesRow a -> Bool) -> Eq (CitesRow a)
forall a. Eq a => CitesRow a -> CitesRow a -> Bool
forall a. (a -> a -> Bool) -> (a -> a -> Bool) -> Eq a
/= :: CitesRow a -> CitesRow a -> Bool
$c/= :: forall a. Eq a => CitesRow a -> CitesRow a -> Bool
== :: CitesRow a -> CitesRow a -> Bool
$c== :: forall a. Eq a => CitesRow a -> CitesRow a -> Bool
Eq, Int -> CitesRow a -> FilePath -> FilePath
[CitesRow a] -> FilePath -> FilePath
CitesRow a -> FilePath
(Int -> CitesRow a -> FilePath -> FilePath)
-> (CitesRow a -> FilePath)
-> ([CitesRow a] -> FilePath -> FilePath)
-> Show (CitesRow a)
forall a. Show a => Int -> CitesRow a -> FilePath -> FilePath
forall a. Show a => [CitesRow a] -> FilePath -> FilePath
forall a. Show a => CitesRow a -> FilePath
forall a.
(Int -> a -> FilePath -> FilePath)
-> (a -> FilePath) -> ([a] -> FilePath -> FilePath) -> Show a
showList :: [CitesRow a] -> FilePath -> FilePath
$cshowList :: forall a. Show a => [CitesRow a] -> FilePath -> FilePath
show :: CitesRow a -> FilePath
$cshow :: forall a. Show a => CitesRow a -> FilePath
showsPrec :: Int -> CitesRow a -> FilePath -> FilePath
$cshowsPrec :: forall a. Show a => Int -> CitesRow a -> FilePath -> FilePath
Show, (forall x. CitesRow a -> Rep (CitesRow a) x)
-> (forall x. Rep (CitesRow a) x -> CitesRow a)
-> Generic (CitesRow a)
forall x. Rep (CitesRow a) x -> CitesRow a
forall x. CitesRow a -> Rep (CitesRow a) x
forall a.
(forall x. a -> Rep a x) -> (forall x. Rep a x -> a) -> Generic a
forall a x. Rep (CitesRow a) x -> CitesRow a
forall a x. CitesRow a -> Rep (CitesRow a) x
$cto :: forall a x. Rep (CitesRow a) x -> CitesRow a
$cfrom :: forall a x. CitesRow a -> Rep (CitesRow a) x
Generic, Get (CitesRow a)
[CitesRow a] -> Put
CitesRow a -> Put
(CitesRow a -> Put)
-> Get (CitesRow a) -> ([CitesRow a] -> Put) -> Binary (CitesRow a)
forall a. Binary a => Get (CitesRow a)
forall a. Binary a => [CitesRow a] -> Put
forall a. Binary a => CitesRow a -> Put
forall t. (t -> Put) -> Get t -> ([t] -> Put) -> Binary t
putList :: [CitesRow a] -> Put
$cputList :: forall a. Binary a => [CitesRow a] -> Put
get :: Get (CitesRow a)
$cget :: forall a. Binary a => Get (CitesRow a)
put :: CitesRow a -> Put
$cput :: forall a. Binary a => CitesRow a -> Put
Binary)

citesRowP :: Parser (CitesRow String)
citesRowP :: Parser (CitesRow FilePath)
citesRowP = FilePath -> FilePath -> CitesRow FilePath
forall a. a -> a -> CitesRow a
CitesRow (FilePath -> FilePath -> CitesRow FilePath)
-> ParsecT Void Text Identity FilePath
-> ParsecT Void Text Identity (FilePath -> CitesRow FilePath)
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> ParsecT Void Text Identity FilePath
-> ParsecT Void Text Identity FilePath
forall a. Parser a -> Parser a
lexeme ParsecT Void Text Identity FilePath
alphaNum ParsecT Void Text Identity (FilePath -> CitesRow FilePath)
-> ParsecT Void Text Identity FilePath
-> Parser (CitesRow FilePath)
forall (f :: * -> *) a b. Applicative f => f (a -> b) -> f a -> f b
<*> ParsecT Void Text Identity FilePath
-> ParsecT Void Text Identity FilePath
forall a. Parser a -> Parser a
lexeme ParsecT Void Text Identity FilePath
alphaNum


-- | Dataset row of the .content file
--
-- The .content file contains descriptions of the papers in the following format:
--
-- 		\<paper_id\> \<word_attributes\> \<class_label\>
--
-- The first entry in each line contains the unique string ID of the paper followed by binary values indicating whether each word in the vocabulary is present (indicated by 1) or absent (indicated by 0) in the paper. Finally, the last entry in the line contains the class label of the paper.
data ContentRow c = CRow {
  ContentRow c -> FilePath
crId :: String -- ^ identifier
  , ContentRow c -> Seq Int16
crFeatures :: Seq Int16 -- ^ features, in sparse format (without the zeros)
  , ContentRow c -> c
crClass :: c -- ^ document class label
                   } deriving (ContentRow c -> ContentRow c -> Bool
(ContentRow c -> ContentRow c -> Bool)
-> (ContentRow c -> ContentRow c -> Bool) -> Eq (ContentRow c)
forall c. Eq c => ContentRow c -> ContentRow c -> Bool
forall a. (a -> a -> Bool) -> (a -> a -> Bool) -> Eq a
/= :: ContentRow c -> ContentRow c -> Bool
$c/= :: forall c. Eq c => ContentRow c -> ContentRow c -> Bool
== :: ContentRow c -> ContentRow c -> Bool
$c== :: forall c. Eq c => ContentRow c -> ContentRow c -> Bool
Eq, Eq (ContentRow c)
Eq (ContentRow c)
-> (ContentRow c -> ContentRow c -> Ordering)
-> (ContentRow c -> ContentRow c -> Bool)
-> (ContentRow c -> ContentRow c -> Bool)
-> (ContentRow c -> ContentRow c -> Bool)
-> (ContentRow c -> ContentRow c -> Bool)
-> (ContentRow c -> ContentRow c -> ContentRow c)
-> (ContentRow c -> ContentRow c -> ContentRow c)
-> Ord (ContentRow c)
ContentRow c -> ContentRow c -> Bool
ContentRow c -> ContentRow c -> Ordering
ContentRow c -> ContentRow c -> ContentRow c
forall a.
Eq a
-> (a -> a -> Ordering)
-> (a -> a -> Bool)
-> (a -> a -> Bool)
-> (a -> a -> Bool)
-> (a -> a -> Bool)
-> (a -> a -> a)
-> (a -> a -> a)
-> Ord a
forall c. Ord c => Eq (ContentRow c)
forall c. Ord c => ContentRow c -> ContentRow c -> Bool
forall c. Ord c => ContentRow c -> ContentRow c -> Ordering
forall c. Ord c => ContentRow c -> ContentRow c -> ContentRow c
min :: ContentRow c -> ContentRow c -> ContentRow c
$cmin :: forall c. Ord c => ContentRow c -> ContentRow c -> ContentRow c
max :: ContentRow c -> ContentRow c -> ContentRow c
$cmax :: forall c. Ord c => ContentRow c -> ContentRow c -> ContentRow c
>= :: ContentRow c -> ContentRow c -> Bool
$c>= :: forall c. Ord c => ContentRow c -> ContentRow c -> Bool
> :: ContentRow c -> ContentRow c -> Bool
$c> :: forall c. Ord c => ContentRow c -> ContentRow c -> Bool
<= :: ContentRow c -> ContentRow c -> Bool
$c<= :: forall c. Ord c => ContentRow c -> ContentRow c -> Bool
< :: ContentRow c -> ContentRow c -> Bool
$c< :: forall c. Ord c => ContentRow c -> ContentRow c -> Bool
compare :: ContentRow c -> ContentRow c -> Ordering
$ccompare :: forall c. Ord c => ContentRow c -> ContentRow c -> Ordering
$cp1Ord :: forall c. Ord c => Eq (ContentRow c)
Ord, Int -> ContentRow c -> FilePath -> FilePath
[ContentRow c] -> FilePath -> FilePath
ContentRow c -> FilePath
(Int -> ContentRow c -> FilePath -> FilePath)
-> (ContentRow c -> FilePath)
-> ([ContentRow c] -> FilePath -> FilePath)
-> Show (ContentRow c)
forall c. Show c => Int -> ContentRow c -> FilePath -> FilePath
forall c. Show c => [ContentRow c] -> FilePath -> FilePath
forall c. Show c => ContentRow c -> FilePath
forall a.
(Int -> a -> FilePath -> FilePath)
-> (a -> FilePath) -> ([a] -> FilePath -> FilePath) -> Show a
showList :: [ContentRow c] -> FilePath -> FilePath
$cshowList :: forall c. Show c => [ContentRow c] -> FilePath -> FilePath
show :: ContentRow c -> FilePath
$cshow :: forall c. Show c => ContentRow c -> FilePath
showsPrec :: Int -> ContentRow c -> FilePath -> FilePath
$cshowsPrec :: forall c. Show c => Int -> ContentRow c -> FilePath -> FilePath
Show, (forall x. ContentRow c -> Rep (ContentRow c) x)
-> (forall x. Rep (ContentRow c) x -> ContentRow c)
-> Generic (ContentRow c)
forall x. Rep (ContentRow c) x -> ContentRow c
forall x. ContentRow c -> Rep (ContentRow c) x
forall a.
(forall x. a -> Rep a x) -> (forall x. Rep a x -> a) -> Generic a
forall c x. Rep (ContentRow c) x -> ContentRow c
forall c x. ContentRow c -> Rep (ContentRow c) x
$cto :: forall c x. Rep (ContentRow c) x -> ContentRow c
$cfrom :: forall c x. ContentRow c -> Rep (ContentRow c) x
Generic, Get (ContentRow c)
[ContentRow c] -> Put
ContentRow c -> Put
(ContentRow c -> Put)
-> Get (ContentRow c)
-> ([ContentRow c] -> Put)
-> Binary (ContentRow c)
forall c. Binary c => Get (ContentRow c)
forall c. Binary c => [ContentRow c] -> Put
forall c. Binary c => ContentRow c -> Put
forall t. (t -> Put) -> Get t -> ([t] -> Put) -> Binary t
putList :: [ContentRow c] -> Put
$cputList :: forall c. Binary c => [ContentRow c] -> Put
get :: Get (ContentRow c)
$cget :: forall c. Binary c => Get (ContentRow c)
put :: ContentRow c -> Put
$cput :: forall c. Binary c => ContentRow c -> Put
Binary)

bit :: Parser Bool
bit :: Parser Bool
bit = (Token Text -> ParsecT Void Text Identity (Token Text)
forall e s (m :: * -> *).
(MonadParsec e s m, Token s ~ Char) =>
Token s -> m (Token s)
char Char
Token Text
'0' ParsecT Void Text Identity Char -> Bool -> Parser Bool
forall (f :: * -> *) a b. Functor f => f a -> b -> f b
$> Bool
False) Parser Bool -> Parser Bool -> Parser Bool
forall (f :: * -> *) a. Alternative f => f a -> f a -> f a
<|> (Token Text -> ParsecT Void Text Identity (Token Text)
forall e s (m :: * -> *).
(MonadParsec e s m, Token s ~ Char) =>
Token s -> m (Token s)
char Char
Token Text
'1' ParsecT Void Text Identity Char -> Bool -> Parser Bool
forall (f :: * -> *) a b. Functor f => f a -> b -> f b
$> Bool
True)

sparse :: Foldable t => t Bool -> Seq Int16
sparse :: t Bool -> Seq Int16
sparse = (Seq Int16, Int16) -> Seq Int16
forall a b. (a, b) -> a
fst ((Seq Int16, Int16) -> Seq Int16)
-> (t Bool -> (Seq Int16, Int16)) -> t Bool -> Seq Int16
forall b c a. (b -> c) -> (a -> b) -> a -> c
. ((Seq Int16, Int16) -> Bool -> (Seq Int16, Int16))
-> (Seq Int16, Int16) -> t Bool -> (Seq Int16, Int16)
forall (t :: * -> *) b a.
Foldable t =>
(b -> a -> b) -> b -> t a -> b
foldl (\(Seq Int16
acc, Int16
i) Bool
b -> if Bool
b then (Seq Int16
acc Seq Int16 -> Int16 -> Seq Int16
forall a. Seq a -> a -> Seq a
|> Int16
i, Int16 -> Int16
forall a. Enum a => a -> a
succ Int16
i) else (Seq Int16
acc, Int16 -> Int16
forall a. Enum a => a -> a
succ Int16
i)) (Seq Int16
forall a. Monoid a => a
mempty, Int16
0)

contentRowP :: Int -- ^ vocabulary size
            -> Parser c -- ^ parser for document class
            -> Parser (ContentRow c)
contentRowP :: Int -> Parser c -> Parser (ContentRow c)
contentRowP Int
n Parser c
dcp = do
  FilePath
i <- ParsecT Void Text Identity FilePath
-> ParsecT Void Text Identity FilePath
forall a. Parser a -> Parser a
lexeme ParsecT Void Text Identity FilePath
alphaNum
  Seq Int16
feats <- [Bool] -> Seq Int16
forall (t :: * -> *). Foldable t => t Bool -> Seq Int16
sparse ([Bool] -> Seq Int16)
-> ParsecT Void Text Identity [Bool]
-> ParsecT Void Text Identity (Seq Int16)
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> Int -> Parser Bool -> ParsecT Void Text Identity [Bool]
forall (m :: * -> *) a. Monad m => Int -> m a -> m [a]
count Int
n (Parser Bool -> Parser Bool
forall a. Parser a -> Parser a
lexeme Parser Bool
bit)
  c
c <- Parser c -> Parser c
forall a. Parser a -> Parser a
lexeme Parser c
dcp
  ContentRow c -> Parser (ContentRow c)
forall (f :: * -> *) a. Applicative f => a -> f a
pure (ContentRow c -> Parser (ContentRow c))
-> ContentRow c -> Parser (ContentRow c)
forall a b. (a -> b) -> a -> b
$ FilePath -> Seq Int16 -> c -> ContentRow c
forall c. FilePath -> Seq Int16 -> c -> ContentRow c
CRow FilePath
i Seq Int16
feats c
c



citesToFile :: (MonadThrow m, MonadIO m, MonadResource m) =>
               FilePath
            -> FileInfo
            -> ConduitT ByteString c m ()
citesToFile :: FilePath -> FileInfo -> ConduitT ByteString c m ()
citesToFile FilePath
dir FileInfo
fi = do
  let fpath :: FilePath
fpath = ByteString -> FilePath
unpack (ByteString -> FilePath) -> ByteString -> FilePath
forall a b. (a -> b) -> a -> b
$ FileInfo -> ByteString
filePath FileInfo
fi
  Bool -> ConduitT ByteString c m () -> ConduitT ByteString c m ()
forall (f :: * -> *). Applicative f => Bool -> f () -> f ()
when (FilePath -> FilePath
takeExtension FilePath
fpath FilePath -> FilePath -> Bool
forall a. Eq a => a -> a -> Bool
== FilePath
".cites") (ConduitT ByteString c m () -> ConduitT ByteString c m ())
-> ConduitT ByteString c m () -> ConduitT ByteString c m ()
forall a b. (a -> b) -> a -> b
$
    ConduitT ByteString (Row Text) m ()
forall (m :: * -> *).
MonadThrow m =>
ConduitT ByteString (Row Text) m ()
parseTSV ConduitT ByteString (Row Text) m ()
-> ConduitM (Row Text) c m () -> ConduitT ByteString c m ()
forall (m :: * -> *) a b c r.
Monad m =>
ConduitM a b m () -> ConduitM b c m r -> ConduitM a c m r
.|
    (Row Text -> Text) -> ConduitT (Row Text) Text m ()
forall (m :: * -> *) a b. Monad m => (a -> b) -> ConduitT a b m ()
C.map Row Text -> Text
T.unwords ConduitT (Row Text) Text m ()
-> ConduitM Text c m () -> ConduitM (Row Text) c m ()
forall (m :: * -> *) a b c r.
Monad m =>
ConduitM a b m () -> ConduitM b c m r -> ConduitM a c m r
.|
    (Text -> CitesRow FilePath)
-> ConduitT Text (CitesRow FilePath) m ()
forall (m :: * -> *) a b. Monad m => (a -> b) -> ConduitT a b m ()
C.map ( \Text
r -> case Parser (CitesRow FilePath)
-> FilePath
-> Text
-> Either (ParseErrorBundle Text Void) (CitesRow FilePath)
forall e s a.
Parsec e s a -> FilePath -> s -> Either (ParseErrorBundle s e) a
parse Parser (CitesRow FilePath)
citesRowP FilePath
"" Text
r of
              Left ParseErrorBundle Text Void
e -> FilePath -> CitesRow FilePath
forall a. HasCallStack => FilePath -> a
error (FilePath -> CitesRow FilePath) -> FilePath -> CitesRow FilePath
forall a b. (a -> b) -> a -> b
$ ParseErrorBundle Text Void -> FilePath
forall s e.
(VisualStream s, TraversableStream s, ShowErrorComponent e) =>
ParseErrorBundle s e -> FilePath
errorBundlePretty ParseErrorBundle Text Void
e
              Right CitesRow FilePath
x -> CitesRow FilePath
x ) ConduitT Text (CitesRow FilePath) m ()
-> ConduitM (CitesRow FilePath) c m () -> ConduitM Text c m ()
forall (m :: * -> *) a b c r.
Monad m =>
ConduitM a b m () -> ConduitM b c m r -> ConduitM a c m r
.|
    ConduitT (CitesRow FilePath) ByteString m ()
forall b (m :: * -> *).
(Binary b, MonadThrow m) =>
ConduitT b ByteString m ()
CB.conduitEncode ConduitT (CitesRow FilePath) ByteString m ()
-> ConduitT ByteString c m ()
-> ConduitM (CitesRow FilePath) c m ()
forall (m :: * -> *) a b c r.
Monad m =>
ConduitM a b m () -> ConduitM b c m r -> ConduitM a c m r
.|
    FilePath -> ConduitT ByteString c m ()
forall (m :: * -> *) o.
MonadResource m =>
FilePath -> ConduitT ByteString o m ()
C.sinkFile (FilePath
dir FilePath -> FilePath -> FilePath
</> FilePath
"cites")