{-# LANGUAGE ScopedTypeVariables #-} -- | Generic module for using Webrexp as a user. -- the main functions for the user are queryDocument to perform an in-memory -- evaluation, and evalWebRexpDepthFirst module Text.Webrexp ( -- * In memory evaluation ParseableType( .. ) , queryDocument , queryDocumentM -- * Default evaluation , evalWebRexp , evalWebRexpDepthFirst , parseWebRexp , evalParsedWebRexp , executeParsedWebRexp -- * Crawling configuration , Conf (..) , defaultConf , evalWebRexpWithConf ) where import Control.Monad import Control.Monad.IO.Class import Control.Monad.ST import Text.Parsec import System.IO import System.Exit import Data.Array.ST import Data.Array.IO import Text.Webrexp.Exprtypes import Text.Webrexp.Parser( webRexpParser ) import Text.Webrexp.HaXmlNode import Text.Webrexp.HxtNode import Text.Webrexp.JsonNode import Text.Webrexp.UnionNode import Text.Webrexp.DirectoryNode import Text.Webrexp.GraphWalker import Text.Webrexp.ResourcePath import Text.Webrexp.WebContext import Text.Webrexp.WebRexpAutomata import Text.Webrexp.Remote.MimeTypes import qualified Text.Webrexp.ProjectByteString as B data Conf = Conf { hammeringDelay :: Int , userAgent :: String , output :: Handle , verbose :: Bool , quiet :: Bool , expr :: String , showHelp :: Bool , depthEvaluation :: Bool , outputGraphViz :: Bool } defaultConf :: Conf defaultConf = Conf { hammeringDelay = 1500 , userAgent = "" , output = stdout , verbose = False , quiet = False , expr = "" , showHelp = False , outputGraphViz = False , depthEvaluation = True } type CrawledNode = UnionNode (UnionNode HxtNode HaXmLNode) (UnionNode JsonNode DirectoryNode) type Crawled a = WebCrawler IOArray CrawledNode ResourcePath a type MemoryCrawl s a = WebContextT (STArray s) CrawledNode ResourcePath (ST s) a initialState :: IO (EvalState CrawledNode ResourcePath) initialState = do node <- currentDirectoryNode return . Node $ repurposeNode (UnionRight . UnionRight) node -- | Query a document in memory and retrieve the results, you can use it in combination -- to the quasiquoting facility to embed the webrexp in haskell : -- -- > {-# LANGUAGE QuasiQuotes #-} -- > import Text.Webrexp -- > import Text.Webrexp.Quote -- > import qualified Data.ByteString.Char8 as B -- > -- > main :: IO () -- > main = print $ queryDocument ParseableJson document [webrexpParse| some things [.] |] -- > where document = B.pack "{ \"some\": { \"things\": \"a phrase\" } }" -- -- The returned values contain possible errors as 'Left' and real value as 'Right. -- queryDocument :: ParseableType -> B.ByteString -> WebRexp -> [Either String String] queryDocument docType str query = runST $ queryDocumentM docType str query -- | Exactly same thing as 'queryDocument', but in ST queryDocumentM :: forall s . ParseableType -> B.ByteString -> WebRexp -> ST s [Either String String] queryDocumentM docType str query = executeWithEmptyContext todo where ignoreLog _ = return () loggers = (ignoreLog, ignoreLog, ignoreLog) todo :: MemoryCrawl s Bool todo = do initialNode <- parseUnion loggers (Just docType) (Local "") str case initialNode of AccessError -> return False DataBlob _ _ -> return False Result rezPath a -> let initNode = NodeContext { rootRef = rezPath , this = a , parents = ImmutableHistory [] } in do setLogLevel Quiet evalDepthFirst (Node initNode) query -- | Prepare a webrexp. -- This function is useful if the expression has -- to be applied many times. parseWebRexp :: String -> Maybe WebRexp parseWebRexp str = case runParser webRexpParser () "expr" str of Left _ -> Nothing Right e -> Just e -- | Evaluation for pre-parsed webrexp. -- Best method if a webrexp has to be evaluated -- many times. Evaluated using breadth first method. evalParsedWebRexp :: WebRexp -> IO Bool evalParsedWebRexp wexpr = evalWithEmptyContext crawled where crawled :: Crawled Bool = evalBreadthFirst (Text "") wexpr -- | Evaluate a webrexp and return all the dumped text as 'Right' -- and all errors as 'Left'. Evaluated using depth first method. executeParsedWebRexp :: WebRexp -> IO [Either String String] executeParsedWebRexp wexpr = executeWithEmptyContext crawled where crawled :: Crawled Bool = evalDepthFirst (Text "") wexpr -- | Simple evaluation function, evaluation is -- the breadth first type. evalWebRexp :: String -> IO Bool evalWebRexp = evalWebRexpWithEvaluator $ evalBreadthFirst (Text "") -- | Evaluate a webrexp in depth first fashion, returning a success -- status telling if the evaluation got up to the end. evalWebRexpDepthFirst :: String -> IO Bool evalWebRexpDepthFirst = evalWebRexpWithEvaluator $ evalDepthFirst (Text "") -- | Simplest function to eval a webrexp. -- Return the evaluation status of the webrexp, -- True for full evaluation success. evalWebRexpWithEvaluator :: (WebRexp -> Crawled Bool) -> String -> IO Bool evalWebRexpWithEvaluator evaluator str = case runParser webRexpParser () "expr" str of Left err -> do putStrLn $ "Parsing error :\n" ++ show err return False Right wexpr -> let crawled :: Crawled Bool = evaluator wexpr in evalWithEmptyContext crawled -- | Function used in the command line program. evalWebRexpWithConf :: Conf -> IO Bool evalWebRexpWithConf conf = case runParser webRexpParser () "expr" (expr conf) of Left err -> do putStrLn "Parsing error :\n" print err return False Right wexpr -> do when (outputGraphViz conf) (do let packed = packRefFiltering wexpr dumpAutomata (expr conf) stdout $ buildAutomata packed exitWith ExitSuccess) when (verbose conf) (do putStrLn $ "code " ++ show (expr conf) print wexpr) let crawled :: Crawled Bool = do setUserAgent $ userAgent conf setOutput $ output conf setHttpDelay $ hammeringDelay conf when (quiet conf) (setLogLevel Quiet) when (verbose conf) (setLogLevel Verbose) initState <- liftIO initialState if depthEvaluation conf then evalDepthFirst initState wexpr else evalBreadthFirst initState wexpr rez <- evalWithEmptyContext crawled when (output conf /= stdout) (hClose $ output conf) return rez