{-# language DeriveAnyClass #-}
{-# LANGUAGE DeriveGeneric #-}
{-# LANGUAGE OverloadedStrings #-}
{-# options_ghc -Wno-unused-imports -Wno-unused-top-binds #-}
module Algebra.Graph.IO.Datasets.LINQS.Citeseer (
stash
, citeseerGraph, citeseerGraphEdges, restoreContent,
ContentRow(..), DocClass(..)) where
import Control.Applicative (Alternative(..))
import Control.Monad (when, foldM)
import Control.Monad.IO.Class (MonadIO(..))
import GHC.Generics (Generic(..))
import GHC.Int (Int16)
import Data.Functor (($>))
import qualified Algebra.Graph as G (Graph, empty, overlay, edge)
import Data.Binary (Binary(..), encode, decode, encodeFile, decodeFileOrFail)
import qualified Data.Conduit.Serialization.Binary as CB (conduitDecode, conduitEncode, ParseError(..))
import Data.ByteString (ByteString)
import Data.ByteString.Char8 (unpack)
import Conduit (MonadUnliftIO(..), MonadResource, runResourceT)
import Data.Conduit (runConduit, ConduitT, (.|), yield, await)
import qualified Data.Conduit.Combinators as C (print, sourceFile, sinkFile, map, mapM, foldM, foldMap, foldl, foldMapM, mapWhile)
import Data.Sequence (Seq, (|>))
import qualified Data.Map as M (Map, singleton, lookup)
import System.Directory (createDirectoryIfMissing)
import Control.Monad.Catch (MonadThrow(..))
import System.FilePath ((</>), takeFileName, takeExtension)
import Network.HTTP.Simple (httpSource, getResponseBody, Response, Request, parseRequest, setRequestMethod)
import Text.Megaparsec (parse, parseTest, (<?>))
import Text.Megaparsec.Char (char)
import Text.Megaparsec.Char.Lexer (decimal)
import Text.Megaparsec.Error (errorBundlePretty)
import Control.Monad.Combinators (count)
import Control.Monad.Primitive (PrimMonad(..))
import Data.Conduit.Tar (Header(..), untarChunks, TarChunk, withEntries, FileInfo, filePath, withFileInfo, headerFileType, FileType(..), headerFilePath)
import qualified Data.Text as T (Text, unwords)
import qualified Data.Text.IO as T (readFile)
import Algebra.Graph.IO.Internal.Conduit (fetch, unTarGz)
import Algebra.Graph.IO.Internal.Megaparsec (Parser, ParseE, symbol, lexeme, alphaNum)
import Algebra.Graph.IO.SV (parseTSV)
stash :: FilePath
-> IO ()
stash :: FilePath -> IO ()
stash FilePath
dir = do
let path :: FilePath
path = FilePath
"http://www.cs.umd.edu/~sen/lbc-proj/data/citeseer.tgz"
Request
rq <- FilePath -> IO Request
forall (m :: * -> *). MonadThrow m => FilePath -> m Request
parseRequest FilePath
path
Bool -> FilePath -> IO ()
createDirectoryIfMissing Bool
True FilePath
dir
ResourceT IO () -> IO ()
forall (m :: * -> *) a. MonadUnliftIO m => ResourceT m a -> m a
runResourceT (ResourceT IO () -> IO ()) -> ResourceT IO () -> IO ()
forall a b. (a -> b) -> a -> b
$ ConduitT () Void (ResourceT IO) () -> ResourceT IO ()
forall (m :: * -> *) r. Monad m => ConduitT () Void m r -> m r
runConduit (ConduitT () Void (ResourceT IO) () -> ResourceT IO ())
-> ConduitT () Void (ResourceT IO) () -> ResourceT IO ()
forall a b. (a -> b) -> a -> b
$
Request -> ConduitT () ByteString (ResourceT IO) ()
forall (m :: * -> *) i.
MonadResource m =>
Request -> ConduitT i ByteString m ()
fetch Request
rq ConduitT () ByteString (ResourceT IO) ()
-> ConduitM ByteString Void (ResourceT IO) ()
-> ConduitT () Void (ResourceT IO) ()
forall (m :: * -> *) a b c r.
Monad m =>
ConduitM a b m () -> ConduitM b c m r -> ConduitM a c m r
.|
ConduitT ByteString TarChunk (ResourceT IO) ()
forall (m :: * -> *).
(PrimMonad m, MonadThrow m) =>
ConduitT ByteString TarChunk m ()
unTarGz ConduitT ByteString TarChunk (ResourceT IO) ()
-> ConduitM TarChunk Void (ResourceT IO) ()
-> ConduitM ByteString Void (ResourceT IO) ()
forall (m :: * -> *) a b c r.
Monad m =>
ConduitM a b m () -> ConduitM b c m r -> ConduitM a c m r
.|
(FileInfo -> ConduitM ByteString Void (ResourceT IO) ())
-> ConduitM TarChunk Void (ResourceT IO) ()
forall (m :: * -> *) o.
MonadThrow m =>
(FileInfo -> ConduitM ByteString o m ())
-> ConduitM TarChunk o m ()
withFileInfo ( \FileInfo
fi -> do
FilePath -> FileInfo -> ConduitM ByteString Void (ResourceT IO) ()
forall (m :: * -> *) c.
(MonadThrow m, MonadResource m) =>
FilePath -> FileInfo -> ConduitT ByteString c m ()
contentToFile FilePath
dir FileInfo
fi
FilePath -> FileInfo -> ConduitM ByteString Void (ResourceT IO) ()
forall (m :: * -> *) c.
(MonadThrow m, MonadIO m, MonadResource m) =>
FilePath -> FileInfo -> ConduitT ByteString c m ()
citesToFile FilePath
dir FileInfo
fi )
restoreContent :: FilePath
-> IO (M.Map String (Seq Int16, DocClass))
restoreContent :: FilePath -> IO (Map FilePath (Seq Int16, DocClass))
restoreContent FilePath
dir = ResourceT IO (Map FilePath (Seq Int16, DocClass))
-> IO (Map FilePath (Seq Int16, DocClass))
forall (m :: * -> *) a. MonadUnliftIO m => ResourceT m a -> m a
runResourceT (ResourceT IO (Map FilePath (Seq Int16, DocClass))
-> IO (Map FilePath (Seq Int16, DocClass)))
-> ResourceT IO (Map FilePath (Seq Int16, DocClass))
-> IO (Map FilePath (Seq Int16, DocClass))
forall a b. (a -> b) -> a -> b
$ ConduitT
() Void (ResourceT IO) (Map FilePath (Seq Int16, DocClass))
-> ResourceT IO (Map FilePath (Seq Int16, DocClass))
forall (m :: * -> *) r. Monad m => ConduitT () Void m r -> m r
runConduit (ConduitT
() Void (ResourceT IO) (Map FilePath (Seq Int16, DocClass))
-> ResourceT IO (Map FilePath (Seq Int16, DocClass)))
-> ConduitT
() Void (ResourceT IO) (Map FilePath (Seq Int16, DocClass))
-> ResourceT IO (Map FilePath (Seq Int16, DocClass))
forall a b. (a -> b) -> a -> b
$
FilePath -> ConduitT () ContentRow (ResourceT IO) ()
forall (m :: * -> *) i.
(MonadResource m, MonadThrow m) =>
FilePath -> ConduitT i ContentRow m ()
contentFromFile FilePath
dir ConduitT () ContentRow (ResourceT IO) ()
-> ConduitM
ContentRow Void (ResourceT IO) (Map FilePath (Seq Int16, DocClass))
-> ConduitT
() Void (ResourceT IO) (Map FilePath (Seq Int16, DocClass))
forall (m :: * -> *) a b c r.
Monad m =>
ConduitM a b m () -> ConduitM b c m r -> ConduitM a c m r
.|
(ContentRow -> Map FilePath (Seq Int16, DocClass))
-> ConduitM
ContentRow Void (ResourceT IO) (Map FilePath (Seq Int16, DocClass))
forall (m :: * -> *) b a o.
(Monad m, Monoid b) =>
(a -> b) -> ConduitT a o m b
C.foldMap ( \(CRow FilePath
k Seq Int16
fs DocClass
c) -> FilePath
-> (Seq Int16, DocClass) -> Map FilePath (Seq Int16, DocClass)
forall k a. k -> a -> Map k a
M.singleton FilePath
k (Seq Int16
fs, DocClass
c) )
data DocClass = Agents | AI | DB | IR | ML | HCI deriving (DocClass -> DocClass -> Bool
(DocClass -> DocClass -> Bool)
-> (DocClass -> DocClass -> Bool) -> Eq DocClass
forall a. (a -> a -> Bool) -> (a -> a -> Bool) -> Eq a
/= :: DocClass -> DocClass -> Bool
$c/= :: DocClass -> DocClass -> Bool
== :: DocClass -> DocClass -> Bool
$c== :: DocClass -> DocClass -> Bool
Eq, Eq DocClass
Eq DocClass
-> (DocClass -> DocClass -> Ordering)
-> (DocClass -> DocClass -> Bool)
-> (DocClass -> DocClass -> Bool)
-> (DocClass -> DocClass -> Bool)
-> (DocClass -> DocClass -> Bool)
-> (DocClass -> DocClass -> DocClass)
-> (DocClass -> DocClass -> DocClass)
-> Ord DocClass
DocClass -> DocClass -> Bool
DocClass -> DocClass -> Ordering
DocClass -> DocClass -> DocClass
forall a.
Eq a
-> (a -> a -> Ordering)
-> (a -> a -> Bool)
-> (a -> a -> Bool)
-> (a -> a -> Bool)
-> (a -> a -> Bool)
-> (a -> a -> a)
-> (a -> a -> a)
-> Ord a
min :: DocClass -> DocClass -> DocClass
$cmin :: DocClass -> DocClass -> DocClass
max :: DocClass -> DocClass -> DocClass
$cmax :: DocClass -> DocClass -> DocClass
>= :: DocClass -> DocClass -> Bool
$c>= :: DocClass -> DocClass -> Bool
> :: DocClass -> DocClass -> Bool
$c> :: DocClass -> DocClass -> Bool
<= :: DocClass -> DocClass -> Bool
$c<= :: DocClass -> DocClass -> Bool
< :: DocClass -> DocClass -> Bool
$c< :: DocClass -> DocClass -> Bool
compare :: DocClass -> DocClass -> Ordering
$ccompare :: DocClass -> DocClass -> Ordering
$cp1Ord :: Eq DocClass
Ord, Int -> DocClass
DocClass -> Int
DocClass -> [DocClass]
DocClass -> DocClass
DocClass -> DocClass -> [DocClass]
DocClass -> DocClass -> DocClass -> [DocClass]
(DocClass -> DocClass)
-> (DocClass -> DocClass)
-> (Int -> DocClass)
-> (DocClass -> Int)
-> (DocClass -> [DocClass])
-> (DocClass -> DocClass -> [DocClass])
-> (DocClass -> DocClass -> [DocClass])
-> (DocClass -> DocClass -> DocClass -> [DocClass])
-> Enum DocClass
forall a.
(a -> a)
-> (a -> a)
-> (Int -> a)
-> (a -> Int)
-> (a -> [a])
-> (a -> a -> [a])
-> (a -> a -> [a])
-> (a -> a -> a -> [a])
-> Enum a
enumFromThenTo :: DocClass -> DocClass -> DocClass -> [DocClass]
$cenumFromThenTo :: DocClass -> DocClass -> DocClass -> [DocClass]
enumFromTo :: DocClass -> DocClass -> [DocClass]
$cenumFromTo :: DocClass -> DocClass -> [DocClass]
enumFromThen :: DocClass -> DocClass -> [DocClass]
$cenumFromThen :: DocClass -> DocClass -> [DocClass]
enumFrom :: DocClass -> [DocClass]
$cenumFrom :: DocClass -> [DocClass]
fromEnum :: DocClass -> Int
$cfromEnum :: DocClass -> Int
toEnum :: Int -> DocClass
$ctoEnum :: Int -> DocClass
pred :: DocClass -> DocClass
$cpred :: DocClass -> DocClass
succ :: DocClass -> DocClass
$csucc :: DocClass -> DocClass
Enum, Int -> DocClass -> ShowS
[DocClass] -> ShowS
DocClass -> FilePath
(Int -> DocClass -> ShowS)
-> (DocClass -> FilePath) -> ([DocClass] -> ShowS) -> Show DocClass
forall a.
(Int -> a -> ShowS) -> (a -> FilePath) -> ([a] -> ShowS) -> Show a
showList :: [DocClass] -> ShowS
$cshowList :: [DocClass] -> ShowS
show :: DocClass -> FilePath
$cshow :: DocClass -> FilePath
showsPrec :: Int -> DocClass -> ShowS
$cshowsPrec :: Int -> DocClass -> ShowS
Show, (forall x. DocClass -> Rep DocClass x)
-> (forall x. Rep DocClass x -> DocClass) -> Generic DocClass
forall x. Rep DocClass x -> DocClass
forall x. DocClass -> Rep DocClass x
forall a.
(forall x. a -> Rep a x) -> (forall x. Rep a x -> a) -> Generic a
$cto :: forall x. Rep DocClass x -> DocClass
$cfrom :: forall x. DocClass -> Rep DocClass x
Generic, Get DocClass
[DocClass] -> Put
DocClass -> Put
(DocClass -> Put)
-> Get DocClass -> ([DocClass] -> Put) -> Binary DocClass
forall t. (t -> Put) -> Get t -> ([t] -> Put) -> Binary t
putList :: [DocClass] -> Put
$cputList :: [DocClass] -> Put
get :: Get DocClass
$cget :: Get DocClass
put :: DocClass -> Put
$cput :: DocClass -> Put
Binary)
docClassP :: Parser DocClass
docClassP :: Parser DocClass
docClassP =
(Text -> Parser Text
symbol Text
"Agents" Parser Text -> DocClass -> Parser DocClass
forall (f :: * -> *) a b. Functor f => f a -> b -> f b
$> DocClass
Agents) Parser DocClass -> Parser DocClass -> Parser DocClass
forall (f :: * -> *) a. Alternative f => f a -> f a -> f a
<|>
(Text -> Parser Text
symbol Text
"AI" Parser Text -> DocClass -> Parser DocClass
forall (f :: * -> *) a b. Functor f => f a -> b -> f b
$> DocClass
AI) Parser DocClass -> Parser DocClass -> Parser DocClass
forall (f :: * -> *) a. Alternative f => f a -> f a -> f a
<|>
(Text -> Parser Text
symbol Text
"DB" Parser Text -> DocClass -> Parser DocClass
forall (f :: * -> *) a b. Functor f => f a -> b -> f b
$> DocClass
DB) Parser DocClass -> Parser DocClass -> Parser DocClass
forall (f :: * -> *) a. Alternative f => f a -> f a -> f a
<|>
(Text -> Parser Text
symbol Text
"IR" Parser Text -> DocClass -> Parser DocClass
forall (f :: * -> *) a b. Functor f => f a -> b -> f b
$> DocClass
IR) Parser DocClass -> Parser DocClass -> Parser DocClass
forall (f :: * -> *) a. Alternative f => f a -> f a -> f a
<|>
(Text -> Parser Text
symbol Text
"ML" Parser Text -> DocClass -> Parser DocClass
forall (f :: * -> *) a b. Functor f => f a -> b -> f b
$> DocClass
ML) Parser DocClass -> Parser DocClass -> Parser DocClass
forall (f :: * -> *) a. Alternative f => f a -> f a -> f a
<|>
(Text -> Parser Text
symbol Text
"HCI" Parser Text -> DocClass -> Parser DocClass
forall (f :: * -> *) a b. Functor f => f a -> b -> f b
$> DocClass
HCI)
content :: (MonadThrow io, MonadIO io) => ConduitT TarChunk o io ()
content :: ConduitT TarChunk o io ()
content = (FileInfo -> ConduitM ByteString o io ())
-> ConduitT TarChunk o io ()
forall (m :: * -> *) o.
MonadThrow m =>
(FileInfo -> ConduitM ByteString o m ())
-> ConduitM TarChunk o m ()
withFileInfo ((FileInfo -> ConduitM ByteString o io ())
-> ConduitT TarChunk o io ())
-> (FileInfo -> ConduitM ByteString o io ())
-> ConduitT TarChunk o io ()
forall a b. (a -> b) -> a -> b
$ \FileInfo
fi ->
Bool -> ConduitM ByteString o io () -> ConduitM ByteString o io ()
forall (f :: * -> *). Applicative f => Bool -> f () -> f ()
when ((ShowS
takeExtension ShowS -> (ByteString -> FilePath) -> ByteString -> FilePath
forall b c a. (b -> c) -> (a -> b) -> a -> c
. ByteString -> FilePath
unpack (ByteString -> FilePath) -> ByteString -> FilePath
forall a b. (a -> b) -> a -> b
$ FileInfo -> ByteString
filePath FileInfo
fi) FilePath -> FilePath -> Bool
forall a. Eq a => a -> a -> Bool
== FilePath
".content") (ConduitM ByteString o io () -> ConduitM ByteString o io ())
-> ConduitM ByteString o io () -> ConduitM ByteString o io ()
forall a b. (a -> b) -> a -> b
$
ConduitT ByteString (Row Text) io ()
forall (m :: * -> *).
MonadThrow m =>
ConduitT ByteString (Row Text) m ()
parseTSV ConduitT ByteString (Row Text) io ()
-> ConduitM (Row Text) o io () -> ConduitM ByteString o io ()
forall (m :: * -> *) a b c r.
Monad m =>
ConduitM a b m () -> ConduitM b c m r -> ConduitM a c m r
.|
(Row Text -> Text) -> ConduitT (Row Text) Text io ()
forall (m :: * -> *) a b. Monad m => (a -> b) -> ConduitT a b m ()
C.map Row Text -> Text
T.unwords ConduitT (Row Text) Text io ()
-> ConduitM Text o io () -> ConduitM (Row Text) o io ()
forall (m :: * -> *) a b c r.
Monad m =>
ConduitM a b m () -> ConduitM b c m r -> ConduitM a c m r
.|
(Text -> Either (ParseErrorBundle Text Void) ContentRow)
-> ConduitT
Text (Either (ParseErrorBundle Text Void) ContentRow) io ()
forall (m :: * -> *) a b. Monad m => (a -> b) -> ConduitT a b m ()
C.map (Parsec Void Text ContentRow
-> FilePath
-> Text
-> Either (ParseErrorBundle Text Void) ContentRow
forall e s a.
Parsec e s a -> FilePath -> s -> Either (ParseErrorBundle s e) a
parse Parsec Void Text ContentRow
contentRowP FilePath
"") ConduitT
Text (Either (ParseErrorBundle Text Void) ContentRow) io ()
-> ConduitM
(Either (ParseErrorBundle Text Void) ContentRow) o io ()
-> ConduitM Text o io ()
forall (m :: * -> *) a b c r.
Monad m =>
ConduitM a b m () -> ConduitM b c m r -> ConduitM a c m r
.|
ConduitM (Either (ParseErrorBundle Text Void) ContentRow) o io ()
forall a (m :: * -> *) o. (Show a, MonadIO m) => ConduitT a o m ()
C.print
contentToFile :: (MonadThrow m, MonadResource m) =>
FilePath -> FileInfo -> ConduitT ByteString c m ()
contentToFile :: FilePath -> FileInfo -> ConduitT ByteString c m ()
contentToFile FilePath
dir FileInfo
fi = Bool -> ConduitT ByteString c m () -> ConduitT ByteString c m ()
forall (f :: * -> *). Applicative f => Bool -> f () -> f ()
when ((ShowS
takeExtension ShowS -> (ByteString -> FilePath) -> ByteString -> FilePath
forall b c a. (b -> c) -> (a -> b) -> a -> c
. ByteString -> FilePath
unpack (ByteString -> FilePath) -> ByteString -> FilePath
forall a b. (a -> b) -> a -> b
$ FileInfo -> ByteString
filePath FileInfo
fi) FilePath -> FilePath -> Bool
forall a. Eq a => a -> a -> Bool
== FilePath
".content") (ConduitT ByteString c m () -> ConduitT ByteString c m ())
-> ConduitT ByteString c m () -> ConduitT ByteString c m ()
forall a b. (a -> b) -> a -> b
$ do
ConduitT ByteString (Row Text) m ()
forall (m :: * -> *).
MonadThrow m =>
ConduitT ByteString (Row Text) m ()
parseTSV ConduitT ByteString (Row Text) m ()
-> ConduitM (Row Text) c m () -> ConduitT ByteString c m ()
forall (m :: * -> *) a b c r.
Monad m =>
ConduitM a b m () -> ConduitM b c m r -> ConduitM a c m r
.|
(Row Text -> Text) -> ConduitT (Row Text) Text m ()
forall (m :: * -> *) a b. Monad m => (a -> b) -> ConduitT a b m ()
C.map Row Text -> Text
T.unwords ConduitT (Row Text) Text m ()
-> ConduitM Text c m () -> ConduitM (Row Text) c m ()
forall (m :: * -> *) a b c r.
Monad m =>
ConduitM a b m () -> ConduitM b c m r -> ConduitM a c m r
.|
(Text -> ContentRow) -> ConduitT Text ContentRow m ()
forall (m :: * -> *) a b. Monad m => (a -> b) -> ConduitT a b m ()
C.map ( \Text
r -> case Parsec Void Text ContentRow
-> FilePath
-> Text
-> Either (ParseErrorBundle Text Void) ContentRow
forall e s a.
Parsec e s a -> FilePath -> s -> Either (ParseErrorBundle s e) a
parse Parsec Void Text ContentRow
contentRowP FilePath
"" Text
r of
Left ParseErrorBundle Text Void
e -> FilePath -> ContentRow
forall a. HasCallStack => FilePath -> a
error (FilePath -> ContentRow) -> FilePath -> ContentRow
forall a b. (a -> b) -> a -> b
$ ParseErrorBundle Text Void -> FilePath
forall s e.
(VisualStream s, TraversableStream s, ShowErrorComponent e) =>
ParseErrorBundle s e -> FilePath
errorBundlePretty ParseErrorBundle Text Void
e
Right ContentRow
x -> ContentRow
x ) ConduitT Text ContentRow m ()
-> ConduitM ContentRow c m () -> ConduitM Text c m ()
forall (m :: * -> *) a b c r.
Monad m =>
ConduitM a b m () -> ConduitM b c m r -> ConduitM a c m r
.|
ConduitT ContentRow ByteString m ()
forall b (m :: * -> *).
(Binary b, MonadThrow m) =>
ConduitT b ByteString m ()
CB.conduitEncode ConduitT ContentRow ByteString m ()
-> ConduitT ByteString c m () -> ConduitM ContentRow c m ()
forall (m :: * -> *) a b c r.
Monad m =>
ConduitM a b m () -> ConduitM b c m r -> ConduitM a c m r
.|
FilePath -> ConduitT ByteString c m ()
forall (m :: * -> *) o.
MonadResource m =>
FilePath -> ConduitT ByteString o m ()
C.sinkFile (FilePath
dir FilePath -> ShowS
</> FilePath
"content-z")
contentFromFile :: (MonadResource m, MonadThrow m) => FilePath -> ConduitT i ContentRow m ()
contentFromFile :: FilePath -> ConduitT i ContentRow m ()
contentFromFile FilePath
dir =
FilePath -> ConduitT i ByteString m ()
forall (m :: * -> *) i.
MonadResource m =>
FilePath -> ConduitT i ByteString m ()
C.sourceFile (FilePath
dir FilePath -> ShowS
</> FilePath
"content-z") ConduitT i ByteString m ()
-> ConduitM ByteString ContentRow m ()
-> ConduitT i ContentRow m ()
forall (m :: * -> *) a b c r.
Monad m =>
ConduitM a b m () -> ConduitM b c m r -> ConduitM a c m r
.|
ConduitM ByteString ContentRow m ()
forall b (m :: * -> *).
(Binary b, MonadThrow m) =>
ConduitT ByteString b m ()
CB.conduitDecode
data ContentRow = CRow {
ContentRow -> FilePath
crId :: String
, ContentRow -> Seq Int16
crFeatures :: Seq Int16
, ContentRow -> DocClass
crClass :: DocClass
} deriving (ContentRow -> ContentRow -> Bool
(ContentRow -> ContentRow -> Bool)
-> (ContentRow -> ContentRow -> Bool) -> Eq ContentRow
forall a. (a -> a -> Bool) -> (a -> a -> Bool) -> Eq a
/= :: ContentRow -> ContentRow -> Bool
$c/= :: ContentRow -> ContentRow -> Bool
== :: ContentRow -> ContentRow -> Bool
$c== :: ContentRow -> ContentRow -> Bool
Eq, Eq ContentRow
Eq ContentRow
-> (ContentRow -> ContentRow -> Ordering)
-> (ContentRow -> ContentRow -> Bool)
-> (ContentRow -> ContentRow -> Bool)
-> (ContentRow -> ContentRow -> Bool)
-> (ContentRow -> ContentRow -> Bool)
-> (ContentRow -> ContentRow -> ContentRow)
-> (ContentRow -> ContentRow -> ContentRow)
-> Ord ContentRow
ContentRow -> ContentRow -> Bool
ContentRow -> ContentRow -> Ordering
ContentRow -> ContentRow -> ContentRow
forall a.
Eq a
-> (a -> a -> Ordering)
-> (a -> a -> Bool)
-> (a -> a -> Bool)
-> (a -> a -> Bool)
-> (a -> a -> Bool)
-> (a -> a -> a)
-> (a -> a -> a)
-> Ord a
min :: ContentRow -> ContentRow -> ContentRow
$cmin :: ContentRow -> ContentRow -> ContentRow
max :: ContentRow -> ContentRow -> ContentRow
$cmax :: ContentRow -> ContentRow -> ContentRow
>= :: ContentRow -> ContentRow -> Bool
$c>= :: ContentRow -> ContentRow -> Bool
> :: ContentRow -> ContentRow -> Bool
$c> :: ContentRow -> ContentRow -> Bool
<= :: ContentRow -> ContentRow -> Bool
$c<= :: ContentRow -> ContentRow -> Bool
< :: ContentRow -> ContentRow -> Bool
$c< :: ContentRow -> ContentRow -> Bool
compare :: ContentRow -> ContentRow -> Ordering
$ccompare :: ContentRow -> ContentRow -> Ordering
$cp1Ord :: Eq ContentRow
Ord, Int -> ContentRow -> ShowS
[ContentRow] -> ShowS
ContentRow -> FilePath
(Int -> ContentRow -> ShowS)
-> (ContentRow -> FilePath)
-> ([ContentRow] -> ShowS)
-> Show ContentRow
forall a.
(Int -> a -> ShowS) -> (a -> FilePath) -> ([a] -> ShowS) -> Show a
showList :: [ContentRow] -> ShowS
$cshowList :: [ContentRow] -> ShowS
show :: ContentRow -> FilePath
$cshow :: ContentRow -> FilePath
showsPrec :: Int -> ContentRow -> ShowS
$cshowsPrec :: Int -> ContentRow -> ShowS
Show, (forall x. ContentRow -> Rep ContentRow x)
-> (forall x. Rep ContentRow x -> ContentRow) -> Generic ContentRow
forall x. Rep ContentRow x -> ContentRow
forall x. ContentRow -> Rep ContentRow x
forall a.
(forall x. a -> Rep a x) -> (forall x. Rep a x -> a) -> Generic a
$cto :: forall x. Rep ContentRow x -> ContentRow
$cfrom :: forall x. ContentRow -> Rep ContentRow x
Generic, Get ContentRow
[ContentRow] -> Put
ContentRow -> Put
(ContentRow -> Put)
-> Get ContentRow -> ([ContentRow] -> Put) -> Binary ContentRow
forall t. (t -> Put) -> Get t -> ([t] -> Put) -> Binary t
putList :: [ContentRow] -> Put
$cputList :: [ContentRow] -> Put
get :: Get ContentRow
$cget :: Get ContentRow
put :: ContentRow -> Put
$cput :: ContentRow -> Put
Binary)
bit :: Parser Bool
bit :: Parser Bool
bit = (Token Text -> ParsecT Void Text Identity (Token Text)
forall e s (m :: * -> *).
(MonadParsec e s m, Token s ~ Char) =>
Token s -> m (Token s)
char Char
Token Text
'0' ParsecT Void Text Identity Char -> Bool -> Parser Bool
forall (f :: * -> *) a b. Functor f => f a -> b -> f b
$> Bool
False) Parser Bool -> Parser Bool -> Parser Bool
forall (f :: * -> *) a. Alternative f => f a -> f a -> f a
<|> (Token Text -> ParsecT Void Text Identity (Token Text)
forall e s (m :: * -> *).
(MonadParsec e s m, Token s ~ Char) =>
Token s -> m (Token s)
char Char
Token Text
'1' ParsecT Void Text Identity Char -> Bool -> Parser Bool
forall (f :: * -> *) a b. Functor f => f a -> b -> f b
$> Bool
True)
sparse :: Foldable t => t Bool -> Seq Int16
sparse :: t Bool -> Seq Int16
sparse = (Seq Int16, Int16) -> Seq Int16
forall a b. (a, b) -> a
fst ((Seq Int16, Int16) -> Seq Int16)
-> (t Bool -> (Seq Int16, Int16)) -> t Bool -> Seq Int16
forall b c a. (b -> c) -> (a -> b) -> a -> c
. ((Seq Int16, Int16) -> Bool -> (Seq Int16, Int16))
-> (Seq Int16, Int16) -> t Bool -> (Seq Int16, Int16)
forall (t :: * -> *) b a.
Foldable t =>
(b -> a -> b) -> b -> t a -> b
foldl (\(Seq Int16
acc, Int16
i) Bool
b -> if Bool
b then (Seq Int16
acc Seq Int16 -> Int16 -> Seq Int16
forall a. Seq a -> a -> Seq a
|> Int16
i, Int16 -> Int16
forall a. Enum a => a -> a
succ Int16
i) else (Seq Int16
acc, Int16 -> Int16
forall a. Enum a => a -> a
succ Int16
i)) (Seq Int16
forall a. Monoid a => a
mempty, Int16
0)
contentRowP :: Parser ContentRow
contentRowP :: Parsec Void Text ContentRow
contentRowP = do
FilePath
i <- Parser FilePath -> Parser FilePath
forall a. Parser a -> Parser a
lexeme Parser FilePath
alphaNum
let n :: Int
n = Int
3703
[Bool]
foh <- Int -> Parser Bool -> ParsecT Void Text Identity [Bool]
forall (m :: * -> *) a. Monad m => Int -> m a -> m [a]
count Int
n (Parser Bool -> Parser Bool
forall a. Parser a -> Parser a
lexeme Parser Bool
bit)
let feats :: Seq Int16
feats = [Bool] -> Seq Int16
forall (t :: * -> *). Foldable t => t Bool -> Seq Int16
sparse [Bool]
foh
DocClass
c <- Parser DocClass -> Parser DocClass
forall a. Parser a -> Parser a
lexeme Parser DocClass
docClassP
ContentRow -> Parsec Void Text ContentRow
forall (f :: * -> *) a. Applicative f => a -> f a
pure (ContentRow -> Parsec Void Text ContentRow)
-> ContentRow -> Parsec Void Text ContentRow
forall a b. (a -> b) -> a -> b
$ FilePath -> Seq Int16 -> DocClass -> ContentRow
CRow FilePath
i Seq Int16
feats DocClass
c
citesToFile :: (MonadThrow m, MonadIO m, MonadResource m) =>
FilePath
-> FileInfo
-> ConduitT ByteString c m ()
citesToFile :: FilePath -> FileInfo -> ConduitT ByteString c m ()
citesToFile FilePath
dir FileInfo
fi = do
let fpath :: FilePath
fpath = ByteString -> FilePath
unpack (ByteString -> FilePath) -> ByteString -> FilePath
forall a b. (a -> b) -> a -> b
$ FileInfo -> ByteString
filePath FileInfo
fi
Bool -> ConduitT ByteString c m () -> ConduitT ByteString c m ()
forall (f :: * -> *). Applicative f => Bool -> f () -> f ()
when (ShowS
takeExtension FilePath
fpath FilePath -> FilePath -> Bool
forall a. Eq a => a -> a -> Bool
== FilePath
".cites") (ConduitT ByteString c m () -> ConduitT ByteString c m ())
-> ConduitT ByteString c m () -> ConduitT ByteString c m ()
forall a b. (a -> b) -> a -> b
$
ConduitT ByteString (Row Text) m ()
forall (m :: * -> *).
MonadThrow m =>
ConduitT ByteString (Row Text) m ()
parseTSV ConduitT ByteString (Row Text) m ()
-> ConduitM (Row Text) c m () -> ConduitT ByteString c m ()
forall (m :: * -> *) a b c r.
Monad m =>
ConduitM a b m () -> ConduitM b c m r -> ConduitM a c m r
.|
(Row Text -> Text) -> ConduitT (Row Text) Text m ()
forall (m :: * -> *) a b. Monad m => (a -> b) -> ConduitT a b m ()
C.map Row Text -> Text
T.unwords ConduitT (Row Text) Text m ()
-> ConduitM Text c m () -> ConduitM (Row Text) c m ()
forall (m :: * -> *) a b c r.
Monad m =>
ConduitM a b m () -> ConduitM b c m r -> ConduitM a c m r
.|
(Text -> CitesRow FilePath)
-> ConduitT Text (CitesRow FilePath) m ()
forall (m :: * -> *) a b. Monad m => (a -> b) -> ConduitT a b m ()
C.map ( \Text
r -> case Parsec Void Text (CitesRow FilePath)
-> FilePath
-> Text
-> Either (ParseErrorBundle Text Void) (CitesRow FilePath)
forall e s a.
Parsec e s a -> FilePath -> s -> Either (ParseErrorBundle s e) a
parse Parsec Void Text (CitesRow FilePath)
citesRowP FilePath
"" Text
r of
Left ParseErrorBundle Text Void
e -> FilePath -> CitesRow FilePath
forall a. HasCallStack => FilePath -> a
error (FilePath -> CitesRow FilePath) -> FilePath -> CitesRow FilePath
forall a b. (a -> b) -> a -> b
$ ParseErrorBundle Text Void -> FilePath
forall s e.
(VisualStream s, TraversableStream s, ShowErrorComponent e) =>
ParseErrorBundle s e -> FilePath
errorBundlePretty ParseErrorBundle Text Void
e
Right CitesRow FilePath
x -> CitesRow FilePath
x ) ConduitT Text (CitesRow FilePath) m ()
-> ConduitM (CitesRow FilePath) c m () -> ConduitM Text c m ()
forall (m :: * -> *) a b c r.
Monad m =>
ConduitM a b m () -> ConduitM b c m r -> ConduitM a c m r
.|
ConduitT (CitesRow FilePath) ByteString m ()
forall b (m :: * -> *).
(Binary b, MonadThrow m) =>
ConduitT b ByteString m ()
CB.conduitEncode ConduitT (CitesRow FilePath) ByteString m ()
-> ConduitT ByteString c m ()
-> ConduitM (CitesRow FilePath) c m ()
forall (m :: * -> *) a b c r.
Monad m =>
ConduitM a b m () -> ConduitM b c m r -> ConduitM a c m r
.|
FilePath -> ConduitT ByteString c m ()
forall (m :: * -> *) o.
MonadResource m =>
FilePath -> ConduitT ByteString o m ()
C.sinkFile (FilePath
dir FilePath -> ShowS
</> FilePath
"cites")
citesFromFile :: (MonadResource m, MonadThrow m) => FilePath -> ConduitT i (CitesRow String) m ()
citesFromFile :: FilePath -> ConduitT i (CitesRow FilePath) m ()
citesFromFile FilePath
dir =
FilePath -> ConduitT i ByteString m ()
forall (m :: * -> *) i.
MonadResource m =>
FilePath -> ConduitT i ByteString m ()
C.sourceFile (FilePath
dir FilePath -> ShowS
</> FilePath
"cites") ConduitT i ByteString m ()
-> ConduitM ByteString (CitesRow FilePath) m ()
-> ConduitT i (CitesRow FilePath) m ()
forall (m :: * -> *) a b c r.
Monad m =>
ConduitM a b m () -> ConduitM b c m r -> ConduitM a c m r
.|
ConduitM ByteString (CitesRow FilePath) m ()
forall b (m :: * -> *).
(Binary b, MonadThrow m) =>
ConduitT ByteString b m ()
CB.conduitDecode
citeseerGraph :: FilePath
-> IO (G.Graph ContentRow)
citeseerGraph :: FilePath -> IO (Graph ContentRow)
citeseerGraph FilePath
dir = do
Map FilePath (Seq Int16, DocClass)
mm <- FilePath -> IO (Map FilePath (Seq Int16, DocClass))
restoreContent FilePath
dir
ResourceT IO (Graph ContentRow) -> IO (Graph ContentRow)
forall (m :: * -> *) a. MonadUnliftIO m => ResourceT m a -> m a
runResourceT (ResourceT IO (Graph ContentRow) -> IO (Graph ContentRow))
-> ResourceT IO (Graph ContentRow) -> IO (Graph ContentRow)
forall a b. (a -> b) -> a -> b
$ ConduitT () Void (ResourceT IO) (Graph ContentRow)
-> ResourceT IO (Graph ContentRow)
forall (m :: * -> *) r. Monad m => ConduitT () Void m r -> m r
runConduit (ConduitT () Void (ResourceT IO) (Graph ContentRow)
-> ResourceT IO (Graph ContentRow))
-> ConduitT () Void (ResourceT IO) (Graph ContentRow)
-> ResourceT IO (Graph ContentRow)
forall a b. (a -> b) -> a -> b
$
FilePath -> ConduitT () (CitesRow FilePath) (ResourceT IO) ()
forall (m :: * -> *) i.
(MonadResource m, MonadThrow m) =>
FilePath -> ConduitT i (CitesRow FilePath) m ()
citesFromFile FilePath
dir ConduitT () (CitesRow FilePath) (ResourceT IO) ()
-> ConduitM
(CitesRow FilePath) Void (ResourceT IO) (Graph ContentRow)
-> ConduitT () Void (ResourceT IO) (Graph ContentRow)
forall (m :: * -> *) a b c r.
Monad m =>
ConduitM a b m () -> ConduitM b c m r -> ConduitM a c m r
.|
(Graph ContentRow -> CitesRow FilePath -> Graph ContentRow)
-> Graph ContentRow
-> ConduitM
(CitesRow FilePath) Void (ResourceT IO) (Graph ContentRow)
forall (m :: * -> *) a b o.
Monad m =>
(a -> b -> a) -> a -> ConduitT b o m a
C.foldl (\Graph ContentRow
gr (CitesRow FilePath
b FilePath
a) ->
let
edm :: Maybe ((Seq Int16, DocClass), (Seq Int16, DocClass))
edm = (,) ((Seq Int16, DocClass)
-> (Seq Int16, DocClass)
-> ((Seq Int16, DocClass), (Seq Int16, DocClass)))
-> Maybe (Seq Int16, DocClass)
-> Maybe
((Seq Int16, DocClass)
-> ((Seq Int16, DocClass), (Seq Int16, DocClass)))
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> FilePath
-> Map FilePath (Seq Int16, DocClass)
-> Maybe (Seq Int16, DocClass)
forall k a. Ord k => k -> Map k a -> Maybe a
M.lookup FilePath
a Map FilePath (Seq Int16, DocClass)
mm Maybe
((Seq Int16, DocClass)
-> ((Seq Int16, DocClass), (Seq Int16, DocClass)))
-> Maybe (Seq Int16, DocClass)
-> Maybe ((Seq Int16, DocClass), (Seq Int16, DocClass))
forall (f :: * -> *) a b. Applicative f => f (a -> b) -> f a -> f b
<*> FilePath
-> Map FilePath (Seq Int16, DocClass)
-> Maybe (Seq Int16, DocClass)
forall k a. Ord k => k -> Map k a -> Maybe a
M.lookup FilePath
b Map FilePath (Seq Int16, DocClass)
mm
in
case Maybe ((Seq Int16, DocClass), (Seq Int16, DocClass))
edm of
Maybe ((Seq Int16, DocClass), (Seq Int16, DocClass))
Nothing -> Graph ContentRow
gr
Just ((Seq Int16
bffs, DocClass
bc), (Seq Int16
affs, DocClass
ac)) ->
let
acr :: ContentRow
acr = FilePath -> Seq Int16 -> DocClass -> ContentRow
CRow FilePath
a Seq Int16
affs DocClass
ac
bcr :: ContentRow
bcr = FilePath -> Seq Int16 -> DocClass -> ContentRow
CRow FilePath
b Seq Int16
bffs DocClass
bc
in
(ContentRow
acr ContentRow -> ContentRow -> Graph ContentRow
forall a. a -> a -> Graph a
`G.edge` ContentRow
bcr) Graph ContentRow -> Graph ContentRow -> Graph ContentRow
forall a. Graph a -> Graph a -> Graph a
`G.overlay` Graph ContentRow
gr
) Graph ContentRow
forall a. Graph a
G.empty
citeseerGraphEdges :: (MonadResource m, MonadThrow m) =>
FilePath
-> M.Map String (Seq Int16, DocClass)
-> ConduitT i (Maybe (G.Graph ContentRow)) m ()
citeseerGraphEdges :: FilePath
-> Map FilePath (Seq Int16, DocClass)
-> ConduitT i (Maybe (Graph ContentRow)) m ()
citeseerGraphEdges FilePath
dir Map FilePath (Seq Int16, DocClass)
mm =
FilePath -> ConduitT i (CitesRow FilePath) m ()
forall (m :: * -> *) i.
(MonadResource m, MonadThrow m) =>
FilePath -> ConduitT i (CitesRow FilePath) m ()
citesFromFile FilePath
dir ConduitT i (CitesRow FilePath) m ()
-> ConduitM (CitesRow FilePath) (Maybe (Graph ContentRow)) m ()
-> ConduitT i (Maybe (Graph ContentRow)) m ()
forall (m :: * -> *) a b c r.
Monad m =>
ConduitM a b m () -> ConduitM b c m r -> ConduitM a c m r
.|
(CitesRow FilePath -> Maybe (Graph ContentRow))
-> ConduitM (CitesRow FilePath) (Maybe (Graph ContentRow)) m ()
forall (m :: * -> *) a b. Monad m => (a -> b) -> ConduitT a b m ()
C.map (\(CitesRow FilePath
b FilePath
a) ->
case (,) ((Seq Int16, DocClass)
-> (Seq Int16, DocClass)
-> ((Seq Int16, DocClass), (Seq Int16, DocClass)))
-> Maybe (Seq Int16, DocClass)
-> Maybe
((Seq Int16, DocClass)
-> ((Seq Int16, DocClass), (Seq Int16, DocClass)))
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> FilePath
-> Map FilePath (Seq Int16, DocClass)
-> Maybe (Seq Int16, DocClass)
forall k a. Ord k => k -> Map k a -> Maybe a
M.lookup FilePath
a Map FilePath (Seq Int16, DocClass)
mm Maybe
((Seq Int16, DocClass)
-> ((Seq Int16, DocClass), (Seq Int16, DocClass)))
-> Maybe (Seq Int16, DocClass)
-> Maybe ((Seq Int16, DocClass), (Seq Int16, DocClass))
forall (f :: * -> *) a b. Applicative f => f (a -> b) -> f a -> f b
<*> FilePath
-> Map FilePath (Seq Int16, DocClass)
-> Maybe (Seq Int16, DocClass)
forall k a. Ord k => k -> Map k a -> Maybe a
M.lookup FilePath
b Map FilePath (Seq Int16, DocClass)
mm of
Maybe ((Seq Int16, DocClass), (Seq Int16, DocClass))
Nothing -> Maybe (Graph ContentRow)
forall a. Maybe a
Nothing
Just ((Seq Int16
bffs, DocClass
bc), (Seq Int16
affs, DocClass
ac)) ->
let
acr :: ContentRow
acr = FilePath -> Seq Int16 -> DocClass -> ContentRow
CRow FilePath
a Seq Int16
affs DocClass
ac
bcr :: ContentRow
bcr = FilePath -> Seq Int16 -> DocClass -> ContentRow
CRow FilePath
b Seq Int16
bffs DocClass
bc
in Graph ContentRow -> Maybe (Graph ContentRow)
forall a. a -> Maybe a
Just (ContentRow
acr ContentRow -> ContentRow -> Graph ContentRow
forall a. a -> a -> Graph a
`G.edge` ContentRow
bcr))
data CitesRow a = CitesRow { CitesRow a -> a
cirTo :: a, CitesRow a -> a
cirFrom :: a } deriving (CitesRow a -> CitesRow a -> Bool
(CitesRow a -> CitesRow a -> Bool)
-> (CitesRow a -> CitesRow a -> Bool) -> Eq (CitesRow a)
forall a. Eq a => CitesRow a -> CitesRow a -> Bool
forall a. (a -> a -> Bool) -> (a -> a -> Bool) -> Eq a
/= :: CitesRow a -> CitesRow a -> Bool
$c/= :: forall a. Eq a => CitesRow a -> CitesRow a -> Bool
== :: CitesRow a -> CitesRow a -> Bool
$c== :: forall a. Eq a => CitesRow a -> CitesRow a -> Bool
Eq, Int -> CitesRow a -> ShowS
[CitesRow a] -> ShowS
CitesRow a -> FilePath
(Int -> CitesRow a -> ShowS)
-> (CitesRow a -> FilePath)
-> ([CitesRow a] -> ShowS)
-> Show (CitesRow a)
forall a. Show a => Int -> CitesRow a -> ShowS
forall a. Show a => [CitesRow a] -> ShowS
forall a. Show a => CitesRow a -> FilePath
forall a.
(Int -> a -> ShowS) -> (a -> FilePath) -> ([a] -> ShowS) -> Show a
showList :: [CitesRow a] -> ShowS
$cshowList :: forall a. Show a => [CitesRow a] -> ShowS
show :: CitesRow a -> FilePath
$cshow :: forall a. Show a => CitesRow a -> FilePath
showsPrec :: Int -> CitesRow a -> ShowS
$cshowsPrec :: forall a. Show a => Int -> CitesRow a -> ShowS
Show, (forall x. CitesRow a -> Rep (CitesRow a) x)
-> (forall x. Rep (CitesRow a) x -> CitesRow a)
-> Generic (CitesRow a)
forall x. Rep (CitesRow a) x -> CitesRow a
forall x. CitesRow a -> Rep (CitesRow a) x
forall a.
(forall x. a -> Rep a x) -> (forall x. Rep a x -> a) -> Generic a
forall a x. Rep (CitesRow a) x -> CitesRow a
forall a x. CitesRow a -> Rep (CitesRow a) x
$cto :: forall a x. Rep (CitesRow a) x -> CitesRow a
$cfrom :: forall a x. CitesRow a -> Rep (CitesRow a) x
Generic, Get (CitesRow a)
[CitesRow a] -> Put
CitesRow a -> Put
(CitesRow a -> Put)
-> Get (CitesRow a) -> ([CitesRow a] -> Put) -> Binary (CitesRow a)
forall a. Binary a => Get (CitesRow a)
forall a. Binary a => [CitesRow a] -> Put
forall a. Binary a => CitesRow a -> Put
forall t. (t -> Put) -> Get t -> ([t] -> Put) -> Binary t
putList :: [CitesRow a] -> Put
$cputList :: forall a. Binary a => [CitesRow a] -> Put
get :: Get (CitesRow a)
$cget :: forall a. Binary a => Get (CitesRow a)
put :: CitesRow a -> Put
$cput :: forall a. Binary a => CitesRow a -> Put
Binary)
citesRowP :: Parser (CitesRow String)
citesRowP :: Parsec Void Text (CitesRow FilePath)
citesRowP = FilePath -> FilePath -> CitesRow FilePath
forall a. a -> a -> CitesRow a
CitesRow (FilePath -> FilePath -> CitesRow FilePath)
-> Parser FilePath
-> ParsecT Void Text Identity (FilePath -> CitesRow FilePath)
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> Parser FilePath -> Parser FilePath
forall a. Parser a -> Parser a
lexeme Parser FilePath
alphaNum ParsecT Void Text Identity (FilePath -> CitesRow FilePath)
-> Parser FilePath -> Parsec Void Text (CitesRow FilePath)
forall (f :: * -> *) a b. Applicative f => f (a -> b) -> f a -> f b
<*> Parser FilePath -> Parser FilePath
forall a. Parser a -> Parser a
lexeme Parser FilePath
alphaNum