{-# LANGUAGE ForeignFunctionInterface #-} -- hexpat, a Haskell wrapper for expat -- Copyright (C) 2008 Evan Martin -- Copyright (C) 2009 Stephen Blackheath -- | Low-level interface to Expat. Unless speed is paramount, this should -- normally be avoided in favour of the interfaces provided by -- 'Text.XML.Expat.SAX' and 'Text.XML.Expat.Tree'. Basic usage is: -- -- (1) Make a new parser: 'newParser'. -- -- (2) Set up callbacks on the parser: 'setStartElementHandler', etc. -- -- (3) Feed data into the parser: 'parse', 'parse'' or 'parseChunk'. module Text.XML.Expat.IO ( -- ** Parser Setup Parser, newParser, -- ** Parsing parse, parse', parseChunk, Encoding(..), XMLParseError(..), getParseLocation, XMLParseLocation(..), -- ** Parser Callbacks StartElementHandler, EndElementHandler, CharacterDataHandler, ExternalEntityRefHandler, SkippedEntityHandler, setStartElementHandler, setEndElementHandler, setCharacterDataHandler, setExternalEntityRefHandler, setSkippedEntityHandler, setUseForeignDTD, -- ** Lower-level interface parseExternalEntityReference, unsafeParseChunk, withHandlers, unsafeSetHandlers, unsafeReleaseHandlers, ExpatHandlers, -- ** Helpers encodingToString ) where import Control.Exception (bracket) import Control.Parallel.Strategies import Control.Monad import qualified Data.ByteString as BS import qualified Data.ByteString.Lazy as BSL import Data.IORef import Foreign import Foreign.C -- |Opaque parser type. type ParserPtr = Ptr () data Parser = Parser { _parserObj :: ForeignPtr () , _startElementHandler :: IORef CStartElementHandler , _endElementHandler :: IORef CEndElementHandler , _cdataHandler :: IORef CCharacterDataHandler , _externalEntityRefHandler :: IORef (Maybe CExternalEntityRefHandler) , _skippedEntityHandler :: IORef (Maybe CSkippedEntityHandler) } instance Show Parser where showsPrec _ (Parser fp _ _ _ _ _) = showsPrec 0 fp withParser :: Parser -> (ParserPtr -> IO a) -> IO a withParser (Parser fp _ _ _ _ _) = withForeignPtr fp -- |Encoding types available for the document encoding. data Encoding = ASCII | UTF8 | UTF16 | ISO88591 encodingToString :: Encoding -> String encodingToString ASCII = "US-ASCII" encodingToString UTF8 = "UTF-8" encodingToString UTF16 = "UTF-16" encodingToString ISO88591 = "ISO-8859-1" withOptEncoding :: Maybe Encoding -> (CString -> IO a) -> IO a withOptEncoding Nothing f = f nullPtr withOptEncoding (Just enc) f = withCString (encodingToString enc) f parserCreate :: Maybe Encoding -> IO (ParserPtr) parserCreate a1 = withOptEncoding a1 $ \a1' -> parserCreate'_ a1' >>= \res -> let {res' = id res} in return (res') -- | Create a 'Parser'. newParser :: Maybe Encoding -> IO Parser newParser enc = do ptr <- parserCreate enc fptr <- newForeignPtr parserFree ptr nullStartH <- newIORef nullCStartElementHandler nullEndH <- newIORef nullCEndElementHandler nullCharH <- newIORef nullCCharacterDataHandler extH <- newIORef Nothing skipH <- newIORef Nothing return $ Parser fptr nullStartH nullEndH nullCharH extH skipH setUseForeignDTD :: Parser -> Bool -> IO () setUseForeignDTD p b = withParser p $ \p' -> xmlUseForeignDTD p' b' where b' = if b then 1 else 0 -- ByteString.useAsCStringLen is almost what we need, but C2HS wants a CInt -- instead of an Int. withBStringLen :: BS.ByteString -> ((CString, CInt) -> IO a) -> IO a withBStringLen bs f = do BS.useAsCStringLen bs $ \(str, len) -> f (str, fromIntegral len) unStatus :: CInt -> Bool unStatus 0 = False unStatus _ = True -- |@parse data@ feeds /lazy/ ByteString data into a 'Parser'. It returns -- Nothing on success, or Just the parse error. parse :: Parser -> BSL.ByteString -> IO (Maybe XMLParseError) parse parser bs = withHandlers parser $ do ok <- doParseChunks (BSL.toChunks bs) if ok then return Nothing else Just `fmap` getError parser where doParseChunks [] = doParseChunk parser BS.empty True doParseChunks (c:cs) = do ok <- doParseChunk parser c False if ok then doParseChunks cs else return False -- |@parse data@ feeds /strict/ ByteString data into a 'Parser'. It returns -- Nothing on success, or Just the parse error. parse' :: Parser -> BS.ByteString -> IO (Maybe XMLParseError) parse' parser bs = withHandlers parser $ do ok <- doParseChunk parser bs True if ok then return Nothing else Just `fmap` getError parser -- |@parseChunk data False@ feeds /strict/ ByteString data into a -- 'Parser'. The end of the data is indicated by passing @True@ for the -- final parameter. It returns Nothing on success, or Just the parse error. parseChunk :: Parser -> BS.ByteString -> Bool -- ^ True if last chunk -> IO (Maybe XMLParseError) parseChunk parser xml final = withHandlers parser $ unsafeParseChunk parser xml final parseExternalEntityReference :: Parser -> CString -- ^ context -> Maybe Encoding -- ^ encoding -> CStringLen -- ^ text -> IO Bool parseExternalEntityReference parser context encoding (text,sz) = withParser parser $ \p -> do extp <- withOptEncoding encoding $ xmlExternalEntityParserCreate p context e <- doParseChunk'_ extp text (fromIntegral sz) 1 parserFree' extp return $ e == 1 -- | This variant of 'parseChunk' must either be called inside 'withHandlers' -- (safest), or between 'unsafeSetHandlers' and 'unsafeReleaseHandlers', and -- this will give you better performance than 'parseChunk' if you process -- multiple chunks inside. unsafeParseChunk :: Parser -> BS.ByteString -> Bool -> IO (Maybe XMLParseError) unsafeParseChunk parser xml final = do ok <- doParseChunk parser xml final if ok then return Nothing else Just `fmap` getError parser getError :: Parser -> IO XMLParseError getError parser = withParser parser $ \p -> do code <- xmlGetErrorCode p cerr <- xmlErrorString code err <- peekCString cerr loc <- getParseLocation parser return $ XMLParseError err loc data ExpatHandlers = ExpatHandlers (FunPtr CStartElementHandler) (FunPtr CEndElementHandler) (FunPtr CCharacterDataHandler) (Maybe (FunPtr CExternalEntityRefHandler)) (Maybe (FunPtr CSkippedEntityHandler)) unsafeSetHandlers :: Parser -> IO ExpatHandlers unsafeSetHandlers parser@(Parser _ startRef endRef charRef extRef skipRef) = do cStartH <- mkCStartElementHandler =<< readIORef startRef cEndH <- mkCEndElementHandler =<< readIORef endRef cCharH <- mkCCharacterDataHandler =<< readIORef charRef mExtH <- readIORef extRef >>= maybe (return Nothing) (\h -> liftM Just $ mkCExternalEntityRefHandler h) mSkipH <- readIORef skipRef >>= maybe (return Nothing) (\h -> liftM Just $ mkCSkippedEntityHandler h) withParser parser $ \p -> do xmlSetstartelementhandler p cStartH xmlSetendelementhandler p cEndH xmlSetcharacterdatahandler p cCharH maybe (return ()) (xmlSetExternalEntityRefHandler p) mExtH maybe (return ()) (xmlSetSkippedEntityHandler p) mSkipH return $ ExpatHandlers cStartH cEndH cCharH mExtH mSkipH unsafeReleaseHandlers :: ExpatHandlers -> IO () unsafeReleaseHandlers (ExpatHandlers cStartH cEndH cCharH mcExtH mcSkipH) = do freeHaskellFunPtr cStartH freeHaskellFunPtr cEndH freeHaskellFunPtr cCharH maybe (return ()) freeHaskellFunPtr mcExtH maybe (return ()) freeHaskellFunPtr mcSkipH -- | 'unsafeParseChunk' is required to be called inside @withHandlers@. -- Safer than using 'unsafeSetHandlers' / 'unsafeReleaseHandlers'. withHandlers :: Parser -> IO a -- ^ Computation where unsafeParseChunk may be used -> IO a withHandlers parser code = do bracket (unsafeSetHandlers parser) unsafeReleaseHandlers (\_ -> code) -- |Obtain C value from Haskell 'Bool'. -- cFromBool :: Num a => Bool -> a cFromBool = fromBool doParseChunk :: Parser -> BS.ByteString -> Bool -> IO (Bool) doParseChunk a1 a2 a3 = withParser a1 $ \a1' -> withBStringLen a2 $ \(a2'1, a2'2) -> let {a3' = cFromBool a3} in doParseChunk'_ a1' a2'1 a2'2 a3' >>= \res -> let {res' = unStatus res} in return (res') -- | Parse error, consisting of message text and error location data XMLParseError = XMLParseError String XMLParseLocation deriving (Eq, Show) instance NFData XMLParseError where rnf (XMLParseError msg loc) = rnf (msg, loc) -- | Specifies a location of an event within the input text data XMLParseLocation = XMLParseLocation { xmlLineNumber :: Int64, -- ^ Line number of the event xmlColumnNumber :: Int64, -- ^ Column number of the event xmlByteIndex :: Int64, -- ^ Byte index of event from start of document xmlByteCount :: Int64 -- ^ The number of bytes in the event } deriving (Eq, Show) instance NFData XMLParseLocation where rnf (XMLParseLocation lin col ind cou) = rnf (lin, col, ind, cou) getParseLocation :: Parser -> IO XMLParseLocation getParseLocation parser = withParser parser $ \p -> do line <- xmlGetCurrentLineNumber p col <- xmlGetCurrentColumnNumber p index <- xmlGetCurrentByteIndex p count <- xmlGetCurrentByteCount p return $ XMLParseLocation { xmlLineNumber = fromIntegral line, xmlColumnNumber = fromIntegral col, xmlByteIndex = fromIntegral index, xmlByteCount = fromIntegral count } -- | The type of the \"element started\" callback. The first parameter is the -- element name; the second are the (attribute, value) pairs. Return True to -- continue parsing as normal, or False to terminate the parse. type StartElementHandler = CString -> [(CString, CString)] -> IO Bool -- | The type of the \"element ended\" callback. The parameter is the element -- name. Return True to continue parsing as normal, or False to terminate the -- parse. type EndElementHandler = CString -> IO Bool -- | The type of the \"character data\" callback. The parameter is the -- character data processed. This callback may be called more than once while -- processing a single conceptual block of text. Return True to continue -- parsing as normal, or False to terminate the parse. type CharacterDataHandler = CStringLen -> IO Bool -- | The type of the \"external entity reference\" callback. See the expat -- documentation. type ExternalEntityRefHandler = Parser -> CString -- context -> CString -- base -> CString -- systemID -> CString -- publicID -> IO Bool -- | Set a skipped entity handler. This is called in two situations: -- -- 1. An entity reference is encountered for which no declaration has been read -- and this is not an error. -- -- 2. An internal entity reference is read, but not expanded, because -- @XML_SetDefaultHandler@ has been called. type SkippedEntityHandler = CString -- entityName -> Int -- is a parameter entity? -> IO Bool type CStartElementHandler = Ptr () -> CString -> Ptr CString -> IO () nullCStartElementHandler :: CStartElementHandler nullCStartElementHandler _ _ _ = return () foreign import ccall safe "wrapper" mkCStartElementHandler :: CStartElementHandler -> IO (FunPtr CStartElementHandler) wrapStartElementHandler :: Parser -> StartElementHandler -> CStartElementHandler wrapStartElementHandler parser handler = h where h _ cname cattrs = do cattrlist <- peekArray0 nullPtr cattrs stillRunning <- handler cname (pairwise cattrlist) unless stillRunning $ stopParser parser -- | Attach a StartElementHandler to a Parser. setStartElementHandler :: Parser -> StartElementHandler -> IO () setStartElementHandler parser@(Parser _ startRef _ _ _ _) handler = writeIORef startRef $ wrapStartElementHandler parser handler type CEndElementHandler = Ptr () -> CString -> IO () nullCEndElementHandler :: CEndElementHandler nullCEndElementHandler _ _ = return () foreign import ccall safe "wrapper" mkCEndElementHandler :: CEndElementHandler -> IO (FunPtr CEndElementHandler) wrapEndElementHandler :: Parser -> EndElementHandler -> CEndElementHandler wrapEndElementHandler parser handler = h where h _ cname = do stillRunning <- handler cname unless stillRunning $ stopParser parser -- | Attach an EndElementHandler to a Parser. setEndElementHandler :: Parser -> EndElementHandler -> IO () setEndElementHandler parser@(Parser _ _ endRef _ _ _) handler = writeIORef endRef $ wrapEndElementHandler parser handler type CCharacterDataHandler = Ptr () -> CString -> CInt -> IO () nullCCharacterDataHandler :: CCharacterDataHandler nullCCharacterDataHandler _ _ _ = return () foreign import ccall safe "wrapper" mkCCharacterDataHandler :: CCharacterDataHandler -> IO (FunPtr CCharacterDataHandler) wrapCharacterDataHandler :: Parser -> CharacterDataHandler -> CCharacterDataHandler wrapCharacterDataHandler parser handler = h where h _ cdata len = do stillRunning <- handler (cdata, fromIntegral len) unless stillRunning $ stopParser parser -- | Attach an CharacterDataHandler to a Parser. setCharacterDataHandler :: Parser -> CharacterDataHandler -> IO () setCharacterDataHandler parser@(Parser _ _ _ charRef _ _) handler = writeIORef charRef $ wrapCharacterDataHandler parser handler pairwise :: [a] -> [(a,a)] pairwise (x1:x2:xs) = (x1,x2) : pairwise xs pairwise _ = [] stopParser :: Parser -> IO () stopParser parser = withParser parser $ \p -> xmlStopParser p 0 ------------------------------------------------------------------------------ -- C imports foreign import ccall unsafe "XML_ParserCreate" parserCreate'_ :: ((Ptr CChar) -> (IO (Ptr ()))) foreign import ccall unsafe "XML_SetStartElementHandler" xmlSetstartelementhandler :: ((Ptr ()) -> ((FunPtr ((Ptr ()) -> ((Ptr CChar) -> ((Ptr (Ptr CChar)) -> (IO ()))))) -> (IO ()))) foreign import ccall unsafe "XML_SetEndElementHandler" xmlSetendelementhandler :: ((Ptr ()) -> ((FunPtr ((Ptr ()) -> ((Ptr CChar) -> (IO ())))) -> (IO ()))) foreign import ccall unsafe "XML_SetCharacterDataHandler" xmlSetcharacterdatahandler :: ((Ptr ()) -> ((FunPtr ((Ptr ()) -> ((Ptr CChar) -> (CInt -> (IO ()))))) -> (IO ()))) foreign import ccall safe "XML_Parse" doParseChunk'_ :: ((Ptr ()) -> ((Ptr CChar) -> (CInt -> (CInt -> (IO CInt))))) foreign import ccall unsafe "XML_UseForeignDTD" xmlUseForeignDTD :: ParserPtr -- ^ parser -> CChar -- ^ use foreign DTD? (external entity ref -- handler will be called with publicID & -- systemID set to null -> IO () foreign import ccall "&XML_ParserFree" parserFree :: FunPtr (ParserPtr -> IO ()) foreign import ccall "XML_ParserFree" parserFree' :: ParserPtr -> IO () type CExternalEntityRefHandler = ParserPtr -- parser -> Ptr CChar -- context -> Ptr CChar -- base -> Ptr CChar -- systemID -> Ptr CChar -- publicID -> IO () foreign import ccall safe "wrapper" mkCExternalEntityRefHandler :: CExternalEntityRefHandler -> IO (FunPtr CExternalEntityRefHandler) foreign import ccall unsafe "XML_SetExternalEntityRefHandler" xmlSetExternalEntityRefHandler :: ParserPtr -> FunPtr CExternalEntityRefHandler -> IO () foreign import ccall unsafe "XML_SetSkippedEntityHandler" xmlSetSkippedEntityHandler :: ParserPtr -> FunPtr CSkippedEntityHandler -> IO () foreign import ccall unsafe "XML_ExternalEntityParserCreate" xmlExternalEntityParserCreate :: ParserPtr -> CString -- ^ context -> CString -- ^ encoding -> IO ParserPtr type CSkippedEntityHandler = Ptr () -- user data pointer -> CString -- entity name -> CInt -- is a parameter entity? -> IO () foreign import ccall safe "wrapper" mkCSkippedEntityHandler :: CSkippedEntityHandler -> IO (FunPtr CSkippedEntityHandler) wrapExternalEntityRefHandler :: Parser -> ExternalEntityRefHandler -> CExternalEntityRefHandler wrapExternalEntityRefHandler parser handler = h where h _ context base systemID publicID = do stillRunning <- handler parser context base systemID publicID unless stillRunning $ stopParser parser wrapSkippedEntityHandler :: Parser -> SkippedEntityHandler -> CSkippedEntityHandler wrapSkippedEntityHandler parser handler = h where h _ entityName i = do stillRunning <- handler entityName (fromIntegral i) unless stillRunning $ stopParser parser setExternalEntityRefHandler :: Parser -> ExternalEntityRefHandler -> IO () setExternalEntityRefHandler parser h = writeIORef ref $ Just $ wrapExternalEntityRefHandler parser h where ref = _externalEntityRefHandler parser setSkippedEntityHandler :: Parser -> SkippedEntityHandler -> IO () setSkippedEntityHandler parser h = writeIORef ref $ Just $ wrapSkippedEntityHandler parser h where ref = _skippedEntityHandler parser -- Note on word sizes: -- -- on expat 2.0: -- XML_GetCurrentLineNumber returns XML_Size -- XML_GetCurrentColumnNumber returns XML_Size -- XML_GetCurrentByteIndex returns XML_Index -- These are defined in expat_external.h -- -- debian-i386 says XML_Size and XML_Index are 4 bytes. -- ubuntu-amd64 says XML_Size and XML_Index are 8 bytes. -- These two systems do NOT define XML_LARGE_SIZE, which would force these types -- to be 64-bit. -- -- If we guess the word size too small, it shouldn't matter: We will just discard -- the most significant part. If we get the word size too large, we will get -- garbage (very bad). -- -- So - what I will do is use CLong and CULong, which correspond to what expat -- is using when XML_LARGE_SIZE is disabled, and give the correct sizes on the -- two machines mentioned above. At the absolute worst the word size will be too -- short. foreign import ccall unsafe "expat.h XML_GetErrorCode" xmlGetErrorCode :: ParserPtr -> IO CInt foreign import ccall unsafe "expat.h XML_GetCurrentLineNumber" xmlGetCurrentLineNumber :: ParserPtr -> IO CULong foreign import ccall unsafe "expat.h XML_GetCurrentColumnNumber" xmlGetCurrentColumnNumber :: ParserPtr -> IO CULong foreign import ccall unsafe "expat.h XML_GetCurrentByteIndex" xmlGetCurrentByteIndex :: ParserPtr -> IO CLong foreign import ccall unsafe "expat.h XML_GetCurrentByteCount" xmlGetCurrentByteCount :: ParserPtr -> IO CInt foreign import ccall unsafe "expat.h XML_ErrorString" xmlErrorString :: CInt -> IO CString foreign import ccall unsafe "expat.h XML_StopParser" xmlStopParser :: ParserPtr -> CInt -> IO ()