{-# LANGUAGE CPP #-}
{-# LANGUAGE BangPatterns        #-}
{-# LANGUAGE BinaryLiterals      #-}
{-# LANGUAGE LambdaCase          #-}
{-# LANGUAGE NamedFieldPuns      #-}
{-# LANGUAGE OverloadedStrings   #-}
{-# LANGUAGE ScopedTypeVariables #-}
-- | SAX parser and API for XML.
module Xeno.SAX
  ( process
  , Process(..)
  , StringLike(..)
  , fold
  , validate
  , validateEx
  , dump
  , skipDoctype
  ) where

import           Control.Exception (throw)
import           Control.Monad (unless)
import           Control.Monad.ST (ST, runST)
import           Control.Monad.State.Strict (State, evalStateT, execState, modify', lift, get, put)
import           Control.Spork (spork)
import           Data.Bits (testBit)
import           Data.ByteString (ByteString)
import qualified Data.ByteString as S
import qualified Data.ByteString.Char8 as S8
import qualified Data.ByteString.Unsafe as SU
import           Data.Char (isSpace)
import           Data.Functor.Identity (Identity(..))
import           Data.Semigroup ()
import           Data.STRef (newSTRef, modifySTRef', readSTRef)
import           Data.Word (Word8, Word64)
import           Xeno.Types


class StringLike str where
    s_index'       :: str -> Int -> Word8
    elemIndexFrom' :: Word8 -> str -> Int -> Maybe Int
    drop'          :: Int -> str -> str
    substring'     :: str -> Int -> Int -> ByteString
    toBS           :: str -> ByteString

instance StringLike ByteString where
    s_index' :: ByteString -> Int -> Word8
s_index'       = ByteString -> Int -> Word8
s_index
    {-# INLINE s_index' #-}
    elemIndexFrom' :: Word8 -> ByteString -> Int -> Maybe Int
elemIndexFrom' = Word8 -> ByteString -> Int -> Maybe Int
elemIndexFrom
    {-# INLINE elemIndexFrom' #-}
    drop' :: Int -> ByteString -> ByteString
drop'          = Int -> ByteString -> ByteString
S.drop
    {-# INLINE drop' #-}
    substring' :: ByteString -> Int -> Int -> ByteString
substring'     = ByteString -> Int -> Int -> ByteString
substring
    {-# INLINE substring' #-}
    toBS :: ByteString -> ByteString
toBS           = forall a. a -> a
id
    {-# INLINE toBS #-}

instance StringLike ByteStringZeroTerminated where
    s_index' :: ByteStringZeroTerminated -> Int -> Word8
s_index' (BSZT ByteString
ps) Int
n = ByteString
ps ByteString -> Int -> Word8
`SU.unsafeIndex` Int
n
    {-# INLINE s_index' #-}
    elemIndexFrom' :: Word8 -> ByteStringZeroTerminated -> Int -> Maybe Int
elemIndexFrom' Word8
w (BSZT ByteString
bs) Int
i = Word8 -> ByteString -> Int -> Maybe Int
elemIndexFrom Word8
w ByteString
bs Int
i
    {-# INLINE elemIndexFrom' #-}
    drop' :: Int -> ByteStringZeroTerminated -> ByteStringZeroTerminated
drop' Int
i (BSZT ByteString
bs) = ByteString -> ByteStringZeroTerminated
BSZT forall a b. (a -> b) -> a -> b
$ Int -> ByteString -> ByteString
S.drop Int
i ByteString
bs
    {-# INLINE drop' #-}
    substring' :: ByteStringZeroTerminated -> Int -> Int -> ByteString
substring' (BSZT ByteString
bs) Int
s Int
t = ByteString -> Int -> Int -> ByteString
substring ByteString
bs Int
s Int
t
    {-# INLINE substring' #-}
    toBS :: ByteStringZeroTerminated -> ByteString
toBS (BSZT ByteString
bs) = ByteString
bs
    {-# INLINE toBS #-}

-- | Parameters to the 'process' function
data Process a =
  Process {
      forall a. Process a -> ByteString -> a
openF    :: !(ByteString ->               a) -- ^ Open tag.
    , forall a. Process a -> ByteString -> ByteString -> a
attrF    :: !(ByteString -> ByteString -> a) -- ^ Tag attribute.
    , forall a. Process a -> ByteString -> a
endOpenF :: !(ByteString ->               a) -- ^ End open tag.
    , forall a. Process a -> ByteString -> a
textF    :: !(ByteString ->               a) -- ^ Text.
    , forall a. Process a -> ByteString -> a
closeF   :: !(ByteString ->               a) -- ^ Close tag.
    , forall a. Process a -> ByteString -> a
cdataF   :: !(ByteString ->               a) -- ^ CDATA.
    }

--------------------------------------------------------------------------------
-- Helpful interfaces to the parser

-- | Parse the XML but return no result, process no events.
--
-- N.B.: Only the lexical correctness of the input string is checked, not its XML semantics (e.g. only if tags are well formed, not whether tags are properly closed)
--
-- > > :set -XOverloadedStrings
-- > > validate "<b>"
-- > True
--
-- > > validate "<b"
-- > False
validate :: (StringLike str) => str -> Bool
validate :: forall str. StringLike str => str -> Bool
validate str
s =
  case forall e a. Exception e => a -> Either e a
spork
         (forall a. Identity a -> a
runIdentity
            (forall (m :: * -> *) str.
(Monad m, StringLike str) =>
Process (m ()) -> str -> m ()
process
               Process {
                 openF :: ByteString -> Identity ()
openF    = \ByteString
_   -> forall (f :: * -> *) a. Applicative f => a -> f a
pure ()
               , attrF :: ByteString -> ByteString -> Identity ()
attrF    = \ByteString
_ ByteString
_ -> forall (f :: * -> *) a. Applicative f => a -> f a
pure ()
               , endOpenF :: ByteString -> Identity ()
endOpenF = \ByteString
_   -> forall (f :: * -> *) a. Applicative f => a -> f a
pure ()
               , textF :: ByteString -> Identity ()
textF    = \ByteString
_   -> forall (f :: * -> *) a. Applicative f => a -> f a
pure ()
               , closeF :: ByteString -> Identity ()
closeF   = \ByteString
_   -> forall (f :: * -> *) a. Applicative f => a -> f a
pure ()
               , cdataF :: ByteString -> Identity ()
cdataF   = \ByteString
_   -> forall (f :: * -> *) a. Applicative f => a -> f a
pure ()
               }
               str
s)) of
    Left (XenoException
_ :: XenoException) -> Bool
False
    Right ()
_ -> Bool
True
-- It must be inlined or specialised to ByteString/ByteStringZeroTerminated
{-# INLINE validate #-}
{-# SPECIALISE validate :: ByteString -> Bool #-}
{-# SPECIALISE validate :: ByteStringZeroTerminated -> Bool #-}


-- | Parse the XML and checks tags nesting.
--
validateEx :: (StringLike str) => str -> Bool
validateEx :: forall str. StringLike str => str -> Bool
validateEx str
s =
  case forall e a. Exception e => a -> Either e a
spork
         (forall a. (forall s. ST s a) -> a
runST forall a b. (a -> b) -> a -> b
$ do
            STRef s [ByteString]
tags <- forall a s. a -> ST s (STRef s a)
newSTRef []
            (forall (m :: * -> *) str.
(Monad m, StringLike str) =>
Process (m ()) -> str -> m ()
process
               Process {
                 openF :: ByteString -> ST s ()
openF    = \ByteString
tag   -> forall s a. STRef s a -> (a -> a) -> ST s ()
modifySTRef' STRef s [ByteString]
tags (ByteString
tagforall a. a -> [a] -> [a]
:)
               , attrF :: ByteString -> ByteString -> ST s ()
attrF    = \ByteString
_ ByteString
_ -> forall (f :: * -> *) a. Applicative f => a -> f a
pure ()
               , endOpenF :: ByteString -> ST s ()
endOpenF = \ByteString
_   -> forall (f :: * -> *) a. Applicative f => a -> f a
pure ()
               , textF :: ByteString -> ST s ()
textF    = \ByteString
_   -> forall (f :: * -> *) a. Applicative f => a -> f a
pure ()
               , closeF :: ByteString -> ST s ()
closeF   = \ByteString
tag  ->
                   forall s a. STRef s a -> (a -> a) -> ST s ()
modifySTRef' STRef s [ByteString]
tags forall a b. (a -> b) -> a -> b
$ \case
                      [] -> forall (m :: * -> *) a. MonadFail m => [Char] -> m a
fail forall a b. (a -> b) -> a -> b
$ [Char]
"Unexpected close tag \"" forall a. [a] -> [a] -> [a]
++ forall a. Show a => a -> [Char]
show ByteString
tag forall a. [a] -> [a] -> [a]
++ [Char]
"\""
                      (ByteString
expectedTag:[ByteString]
tags') ->
                          if ByteString
expectedTag forall a. Eq a => a -> a -> Bool
== ByteString
tag
                          then [ByteString]
tags'
                          else forall (m :: * -> *) a. MonadFail m => [Char] -> m a
fail forall a b. (a -> b) -> a -> b
$ [Char]
"Unexpected close tag. Expected \"" forall a. [a] -> [a] -> [a]
++ forall a. Show a => a -> [Char]
show ByteString
expectedTag forall a. [a] -> [a] -> [a]
++ [Char]
"\", but got \"" forall a. [a] -> [a] -> [a]
++ forall a. Show a => a -> [Char]
show ByteString
tag forall a. [a] -> [a] -> [a]
++ [Char]
"\""
               , cdataF :: ByteString -> ST s ()
cdataF   = \ByteString
_   -> forall (f :: * -> *) a. Applicative f => a -> f a
pure ()
               }
               str
s)
            forall s a. STRef s a -> ST s a
readSTRef STRef s [ByteString]
tags forall (m :: * -> *) a b. Monad m => m a -> (a -> m b) -> m b
>>= \case
                [] -> forall (m :: * -> *) a. Monad m => a -> m a
return ()
                [ByteString]
tags' -> forall a. HasCallStack => [Char] -> a
error forall a b. (a -> b) -> a -> b
$ [Char]
"Not all tags closed: " forall a. [a] -> [a] -> [a]
++ forall a. Show a => a -> [Char]
show [ByteString]
tags'
         ) of
    Left (XenoException
_ :: XenoException) -> Bool
False
    Right ()
_ -> Bool
True
{-# INLINE validateEx #-}
{-# SPECIALISE validateEx :: ByteString -> Bool #-}
{-# SPECIALISE validateEx :: ByteStringZeroTerminated -> Bool #-}


-- | Parse the XML and pretty print it to stdout.
dump :: ByteString -> IO ()
dump :: ByteString -> IO ()
dump ByteString
str =
  forall (m :: * -> *) s a. Monad m => StateT s m a -> s -> m a
evalStateT
    (forall (m :: * -> *) str.
(Monad m, StringLike str) =>
Process (m ()) -> str -> m ()
process
       Process {
         openF :: ByteString -> StateT Int IO ()
openF = \ByteString
name -> do
          Int
level <- forall s (m :: * -> *). MonadState s m => m s
get
          forall (t :: (* -> *) -> * -> *) (m :: * -> *) a.
(MonadTrans t, Monad m) =>
m a -> t m a
lift (ByteString -> IO ()
S8.putStr (Int -> Char -> ByteString
S8.replicate Int
level Char
' ' forall a. Semigroup a => a -> a -> a
<> ByteString
"<" forall a. Semigroup a => a -> a -> a
<> ByteString
name forall a. Semigroup a => a -> a -> a
<> ByteString
""))
       , attrF :: ByteString -> ByteString -> StateT Int IO ()
attrF = \ByteString
key ByteString
value -> forall (t :: (* -> *) -> * -> *) (m :: * -> *) a.
(MonadTrans t, Monad m) =>
m a -> t m a
lift (ByteString -> IO ()
S8.putStr (ByteString
" " forall a. Semigroup a => a -> a -> a
<> ByteString
key forall a. Semigroup a => a -> a -> a
<> ByteString
"=\"" forall a. Semigroup a => a -> a -> a
<> ByteString
value forall a. Semigroup a => a -> a -> a
<> ByteString
"\""))
       , endOpenF :: ByteString -> StateT Int IO ()
endOpenF = \ByteString
_ -> do
          Int
level <- forall s (m :: * -> *). MonadState s m => m s
get
          let !level' :: Int
level' = Int
level forall a. Num a => a -> a -> a
+ Int
2
          forall s (m :: * -> *). MonadState s m => s -> m ()
put Int
level'
          forall (t :: (* -> *) -> * -> *) (m :: * -> *) a.
(MonadTrans t, Monad m) =>
m a -> t m a
lift (ByteString -> IO ()
S8.putStrLn (ByteString
">"))
       , textF :: ByteString -> StateT Int IO ()
textF = \ByteString
text -> do
          Int
level <- forall s (m :: * -> *). MonadState s m => m s
get
          forall (t :: (* -> *) -> * -> *) (m :: * -> *) a.
(MonadTrans t, Monad m) =>
m a -> t m a
lift (ByteString -> IO ()
S8.putStrLn (Int -> Char -> ByteString
S8.replicate Int
level Char
' ' forall a. Semigroup a => a -> a -> a
<> [Char] -> ByteString
S8.pack (forall a. Show a => a -> [Char]
show ByteString
text)))
       , closeF :: ByteString -> StateT Int IO ()
closeF = \ByteString
name -> do
          Int
level <- forall s (m :: * -> *). MonadState s m => m s
get
          let !level' :: Int
level' = Int
level forall a. Num a => a -> a -> a
- Int
2
          forall s (m :: * -> *). MonadState s m => s -> m ()
put Int
level'
          forall (t :: (* -> *) -> * -> *) (m :: * -> *) a.
(MonadTrans t, Monad m) =>
m a -> t m a
lift (ByteString -> IO ()
S8.putStrLn (Int -> Char -> ByteString
S8.replicate Int
level' Char
' ' forall a. Semigroup a => a -> a -> a
<> ByteString
"</" forall a. Semigroup a => a -> a -> a
<> ByteString
name forall a. Semigroup a => a -> a -> a
<> ByteString
">"))
       , cdataF :: ByteString -> StateT Int IO ()
cdataF = \ByteString
cdata -> do
          Int
level <- forall s (m :: * -> *). MonadState s m => m s
get
          forall (t :: (* -> *) -> * -> *) (m :: * -> *) a.
(MonadTrans t, Monad m) =>
m a -> t m a
lift (ByteString -> IO ()
S8.putStrLn (Int -> Char -> ByteString
S8.replicate Int
level Char
' ' forall a. Semigroup a => a -> a -> a
<> ByteString
"CDATA: " forall a. Semigroup a => a -> a -> a
<> [Char] -> ByteString
S8.pack (forall a. Show a => a -> [Char]
show ByteString
cdata)))
       }
       ByteString
str)
    (Int
0 :: Int)

-- | Fold over the XML input.
fold
  :: (s -> ByteString -> s) -- ^ Open tag.
  -> (s -> ByteString -> ByteString -> s) -- ^ Attribute key/value.
  -> (s -> ByteString -> s) -- ^ End of open tag.
  -> (s -> ByteString -> s) -- ^ Text.
  -> (s -> ByteString -> s) -- ^ Close tag.
  -> (s -> ByteString -> s) -- ^ CDATA.
  -> s
  -> ByteString
  -> Either XenoException s
fold :: forall s.
(s -> ByteString -> s)
-> (s -> ByteString -> ByteString -> s)
-> (s -> ByteString -> s)
-> (s -> ByteString -> s)
-> (s -> ByteString -> s)
-> (s -> ByteString -> s)
-> s
-> ByteString
-> Either XenoException s
fold s -> ByteString -> s
openF s -> ByteString -> ByteString -> s
attrF s -> ByteString -> s
endOpenF s -> ByteString -> s
textF s -> ByteString -> s
closeF s -> ByteString -> s
cdataF s
s ByteString
str =
  forall e a. Exception e => a -> Either e a
spork
    (forall s a. State s a -> s -> s
execState
       (forall (m :: * -> *) str.
(Monad m, StringLike str) =>
Process (m ()) -> str -> m ()
process Process {
            openF :: ByteString -> StateT s Identity ()
openF    = \ByteString
name -> forall s (m :: * -> *). MonadState s m => (s -> s) -> m ()
modify' (\s
s' -> s -> ByteString -> s
openF s
s' ByteString
name)
          , attrF :: ByteString -> ByteString -> StateT s Identity ()
attrF    = \ByteString
key ByteString
value -> forall s (m :: * -> *). MonadState s m => (s -> s) -> m ()
modify' (\s
s' -> s -> ByteString -> ByteString -> s
attrF s
s' ByteString
key ByteString
value)
          , endOpenF :: ByteString -> StateT s Identity ()
endOpenF = \ByteString
name -> forall s (m :: * -> *). MonadState s m => (s -> s) -> m ()
modify' (\s
s' -> s -> ByteString -> s
endOpenF s
s' ByteString
name)
          , textF :: ByteString -> StateT s Identity ()
textF    = \ByteString
text -> forall s (m :: * -> *). MonadState s m => (s -> s) -> m ()
modify' (\s
s' -> s -> ByteString -> s
textF s
s' ByteString
text)
          , closeF :: ByteString -> StateT s Identity ()
closeF   = \ByteString
name -> forall s (m :: * -> *). MonadState s m => (s -> s) -> m ()
modify' (\s
s' -> s -> ByteString -> s
closeF s
s' ByteString
name)
          , cdataF :: ByteString -> StateT s Identity ()
cdataF   = \ByteString
cdata -> forall s (m :: * -> *). MonadState s m => (s -> s) -> m ()
modify' (\s
s' -> s -> ByteString -> s
cdataF s
s' ByteString
cdata)
        } ByteString
str)
       s
s)

--------------------------------------------------------------------------------
-- Main parsing function

-- | Process events with callbacks in the XML input.
process
  :: (Monad m, StringLike str)
  => Process (m ())
  -> str
  -> m ()
process :: forall (m :: * -> *) str.
(Monad m, StringLike str) =>
Process (m ()) -> str -> m ()
process !(Process {ByteString -> m ()
openF :: ByteString -> m ()
openF :: forall a. Process a -> ByteString -> a
openF, ByteString -> ByteString -> m ()
attrF :: ByteString -> ByteString -> m ()
attrF :: forall a. Process a -> ByteString -> ByteString -> a
attrF, ByteString -> m ()
endOpenF :: ByteString -> m ()
endOpenF :: forall a. Process a -> ByteString -> a
endOpenF, ByteString -> m ()
textF :: ByteString -> m ()
textF :: forall a. Process a -> ByteString -> a
textF, ByteString -> m ()
closeF :: ByteString -> m ()
closeF :: forall a. Process a -> ByteString -> a
closeF, ByteString -> m ()
cdataF :: ByteString -> m ()
cdataF :: forall a. Process a -> ByteString -> a
cdataF}) str
str = Int -> m ()
findLT Int
0
  where
    findLT :: Int -> m ()
findLT Int
index =
      case forall str. StringLike str => Word8 -> str -> Int -> Maybe Int
elemIndexFrom' Word8
openTagChar str
str Int
index of
        Maybe Int
Nothing -> forall (f :: * -> *). Applicative f => Bool -> f () -> f ()
unless (ByteString -> Bool
S.null ByteString
text) (ByteString -> m ()
textF ByteString
text)
          where text :: ByteString
text = forall str. StringLike str => str -> ByteString
toBS forall a b. (a -> b) -> a -> b
$ forall str. StringLike str => Int -> str -> str
drop' Int
index str
str
        Just Int
fromLt -> do
          forall (f :: * -> *). Applicative f => Bool -> f () -> f ()
unless (ByteString -> Bool
S.null ByteString
text) (ByteString -> m ()
textF ByteString
text)
          Int -> m ()
checkOpenComment (Int
fromLt forall a. Num a => a -> a -> a
+ Int
1)
          where text :: ByteString
text = forall str. StringLike str => str -> Int -> Int -> ByteString
substring' str
str Int
index Int
fromLt
    -- Find open comment, CDATA or tag name.
    checkOpenComment :: Int -> m ()
checkOpenComment Int
index
      | forall str. StringLike str => str -> Int -> Word8
s_index' str
this Int
0 forall a. Eq a => a -> a -> Bool
== Word8
bangChar -- !
      , forall str. StringLike str => str -> Int -> Word8
s_index' str
this Int
1 forall a. Eq a => a -> a -> Bool
== Word8
commentChar -- -
      , forall str. StringLike str => str -> Int -> Word8
s_index' str
this Int
2 forall a. Eq a => a -> a -> Bool
== Word8
commentChar -- -
      =  Int -> m ()
findCommentEnd (Int
index forall a. Num a => a -> a -> a
+ Int
3)

      | forall str. StringLike str => str -> Int -> Word8
s_index' str
this Int
0 forall a. Eq a => a -> a -> Bool
== Word8
bangChar -- !
      , forall str. StringLike str => str -> Int -> Word8
s_index' str
this Int
1 forall a. Eq a => a -> a -> Bool
== Word8
openAngleBracketChar -- [
      , forall str. StringLike str => str -> Int -> Word8
s_index' str
this Int
2 forall a. Eq a => a -> a -> Bool
== Word8
67 -- C
      , forall str. StringLike str => str -> Int -> Word8
s_index' str
this Int
3 forall a. Eq a => a -> a -> Bool
== Word8
68 -- D
      , forall str. StringLike str => str -> Int -> Word8
s_index' str
this Int
4 forall a. Eq a => a -> a -> Bool
== Word8
65 -- A
      , forall str. StringLike str => str -> Int -> Word8
s_index' str
this Int
5 forall a. Eq a => a -> a -> Bool
== Word8
84 -- T
      , forall str. StringLike str => str -> Int -> Word8
s_index' str
this Int
6 forall a. Eq a => a -> a -> Bool
== Word8
65 -- A
      , forall str. StringLike str => str -> Int -> Word8
s_index' str
this Int
7 forall a. Eq a => a -> a -> Bool
== Word8
openAngleBracketChar -- [
      =  Int -> Int -> m ()
findCDataEnd (Int
index forall a. Num a => a -> a -> a
+ Int
8) (Int
index forall a. Num a => a -> a -> a
+ Int
8)

      | Bool
otherwise
      = Int -> m ()
findTagName Int
index
      where
        this :: str
this = forall str. StringLike str => Int -> str -> str
drop' Int
index str
str
    findCommentEnd :: Int -> m ()
findCommentEnd Int
index =
      case forall str. StringLike str => Word8 -> str -> Int -> Maybe Int
elemIndexFrom' Word8
commentChar str
str Int
index of
        Maybe Int
Nothing -> forall a e. Exception e => e -> a
throw forall a b. (a -> b) -> a -> b
$ Int -> ByteString -> XenoException
XenoParseError Int
index ByteString
"Couldn't find the closing comment dash."
        Just Int
fromDash ->
          if forall str. StringLike str => str -> Int -> Word8
s_index' str
this Int
0 forall a. Eq a => a -> a -> Bool
== Word8
commentChar Bool -> Bool -> Bool
&& forall str. StringLike str => str -> Int -> Word8
s_index' str
this Int
1 forall a. Eq a => a -> a -> Bool
== Word8
closeTagChar
            then Int -> m ()
findLT (Int
fromDash forall a. Num a => a -> a -> a
+ Int
2)
            else Int -> m ()
findCommentEnd (Int
fromDash forall a. Num a => a -> a -> a
+ Int
1)
          where this :: str
this = forall str. StringLike str => Int -> str -> str
drop' Int
index str
str
    findCDataEnd :: Int -> Int -> m ()
findCDataEnd Int
cdata_start Int
index =
      case forall str. StringLike str => Word8 -> str -> Int -> Maybe Int
elemIndexFrom' Word8
closeAngleBracketChar str
str Int
index of
        Maybe Int
Nothing -> forall a e. Exception e => e -> a
throw forall a b. (a -> b) -> a -> b
$ Int -> ByteString -> XenoException
XenoParseError Int
index ByteString
"Couldn't find closing angle bracket for CDATA."
        Just Int
fromCloseAngleBracket ->
          if forall str. StringLike str => str -> Int -> Word8
s_index' str
str (Int
fromCloseAngleBracket forall a. Num a => a -> a -> a
+ Int
1) forall a. Eq a => a -> a -> Bool
== Word8
closeAngleBracketChar
             then do
               ByteString -> m ()
cdataF (forall str. StringLike str => str -> Int -> Int -> ByteString
substring' str
str Int
cdata_start Int
fromCloseAngleBracket)
               Int -> m ()
findLT (Int
fromCloseAngleBracket forall a. Num a => a -> a -> a
+ Int
3) -- Start after ]]>
             else
               -- We only found one ], that means that we need to keep searching.
               Int -> Int -> m ()
findCDataEnd Int
cdata_start (Int
fromCloseAngleBracket forall a. Num a => a -> a -> a
+ Int
1)
    findTagName :: Int -> m ()
findTagName Int
index0
      | forall str. StringLike str => str -> Int -> Word8
s_index' str
str Int
index0 forall a. Eq a => a -> a -> Bool
== Word8
questionChar =
        case forall str. StringLike str => Word8 -> str -> Int -> Maybe Int
elemIndexFrom' Word8
closeTagChar str
str Int
spaceOrCloseTag of
          Maybe Int
Nothing -> forall a e. Exception e => e -> a
throw forall a b. (a -> b) -> a -> b
$ Int -> ByteString -> XenoException
XenoParseError Int
index ByteString
"Couldn't find the end of the tag."
          Just Int
fromGt -> do
            Int -> m ()
findLT (Int
fromGt forall a. Num a => a -> a -> a
+ Int
1)
      | forall str. StringLike str => str -> Int -> Word8
s_index' str
str Int
spaceOrCloseTag forall a. Eq a => a -> a -> Bool
== Word8
closeTagChar = do
        let tagname :: ByteString
tagname = forall str. StringLike str => str -> Int -> Int -> ByteString
substring' str
str Int
index Int
spaceOrCloseTag
        if forall str. StringLike str => str -> Int -> Word8
s_index' str
str Int
index0 forall a. Eq a => a -> a -> Bool
== Word8
slashChar
          then ByteString -> m ()
closeF ByteString
tagname
          else do
            ByteString -> m ()
openF ByteString
tagname
            ByteString -> m ()
endOpenF ByteString
tagname
        Int -> m ()
findLT (Int
spaceOrCloseTag forall a. Num a => a -> a -> a
+ Int
1)
      | Bool
otherwise = do
        let tagname :: ByteString
tagname = forall str. StringLike str => str -> Int -> Int -> ByteString
substring' str
str Int
index Int
spaceOrCloseTag
        ByteString -> m ()
openF ByteString
tagname
        Either Int Int
result <- Int -> m (Either Int Int)
findAttributes Int
spaceOrCloseTag
        ByteString -> m ()
endOpenF ByteString
tagname
        case Either Int Int
result of
          Right Int
closingTag -> Int -> m ()
findLT (Int
closingTag forall a. Num a => a -> a -> a
+ Int
1)
          Left Int
closingPair -> do
            ByteString -> m ()
closeF ByteString
tagname
            Int -> m ()
findLT (Int
closingPair forall a. Num a => a -> a -> a
+ Int
2)
      where
        index :: Int
index =
          if forall str. StringLike str => str -> Int -> Word8
s_index' str
str Int
index0 forall a. Eq a => a -> a -> Bool
== Word8
slashChar
            then Int
index0 forall a. Num a => a -> a -> a
+ Int
1
            else Int
index0
        spaceOrCloseTag :: Int
spaceOrCloseTag = forall str. StringLike str => str -> Int -> Int
parseName str
str Int
index
    findAttributes :: Int -> m (Either Int Int)
findAttributes Int
index0
      -- case: />
      | forall str. StringLike str => str -> Int -> Word8
s_index' str
str Int
index forall a. Eq a => a -> a -> Bool
== Word8
slashChar
      , forall str. StringLike str => str -> Int -> Word8
s_index' str
str (Int
index forall a. Num a => a -> a -> a
+ Int
1) forall a. Eq a => a -> a -> Bool
== Word8
closeTagChar
      = forall (f :: * -> *) a. Applicative f => a -> f a
pure (forall a b. a -> Either a b
Left Int
index)

      -- case: >
      | forall str. StringLike str => str -> Int -> Word8
s_index' str
str Int
index forall a. Eq a => a -> a -> Bool
== Word8
closeTagChar
      = forall (f :: * -> *) a. Applicative f => a -> f a
pure (forall a b. b -> Either a b
Right Int
index)

      -- case: attr=' or attr="
      | forall str. StringLike str => str -> Int -> Word8
s_index' str
str Int
afterAttrName forall a. Eq a => a -> a -> Bool
== Word8
equalChar
      , Word8
usedChar forall a. Eq a => a -> a -> Bool
== Word8
quoteChar Bool -> Bool -> Bool
|| Word8
usedChar forall a. Eq a => a -> a -> Bool
== Word8
doubleQuoteChar
      = case forall str. StringLike str => Word8 -> str -> Int -> Maybe Int
elemIndexFrom' Word8
usedChar str
str (Int
quoteIndex forall a. Num a => a -> a -> a
+ Int
1) of
          Maybe Int
Nothing ->
            forall a e. Exception e => e -> a
throw
              (Int -> ByteString -> XenoException
XenoParseError Int
index ByteString
"Couldn't find the matching quote character.")
          Just Int
endQuoteIndex -> do
            ByteString -> ByteString -> m ()
attrF
              (forall str. StringLike str => str -> Int -> Int -> ByteString
substring' str
str Int
index Int
afterAttrName)
              (forall str. StringLike str => str -> Int -> Int -> ByteString
substring'
                 str
str
                 (Int
quoteIndex forall a. Num a => a -> a -> a
+ Int
1)
                 (Int
endQuoteIndex))
            Int -> m (Either Int Int)
findAttributes (Int
endQuoteIndex forall a. Num a => a -> a -> a
+ Int
1)

      -- case: attr= without following quote
      | forall str. StringLike str => str -> Int -> Word8
s_index' str
str Int
afterAttrName forall a. Eq a => a -> a -> Bool
== Word8
equalChar
      = forall a e. Exception e => e -> a
throw (Int -> ByteString -> XenoException
XenoParseError Int
index(ByteString
"Expected ' or \", got: " forall a. Semigroup a => a -> a -> a
<> Word8 -> ByteString
S.singleton Word8
usedChar))

      | Bool
otherwise
      = forall a e. Exception e => e -> a
throw (Int -> ByteString -> XenoException
XenoParseError Int
index (ByteString
"Expected =, got: " forall a. Semigroup a => a -> a -> a
<> Word8 -> ByteString
S.singleton (forall str. StringLike str => str -> Int -> Word8
s_index' str
str Int
afterAttrName) forall a. Semigroup a => a -> a -> a
<> ByteString
" at character index: " forall a. Semigroup a => a -> a -> a
<> ([Char] -> ByteString
S8.pack forall b c a. (b -> c) -> (a -> b) -> a -> c
. forall a. Show a => a -> [Char]
show) Int
afterAttrName))
      where
        index :: Int
index = forall str. StringLike str => str -> Int -> Int
skipSpaces str
str Int
index0
#ifdef WHITESPACE_AROUND_EQUALS
        afterAttrName = skipSpaces str (parseName str index)
        quoteIndex = skipSpaces str (afterAttrName + 1)
#else
        afterAttrName :: Int
afterAttrName = forall str. StringLike str => str -> Int -> Int
parseName str
str Int
index
        quoteIndex :: Int
quoteIndex = Int
afterAttrName forall a. Num a => a -> a -> a
+ Int
1
#endif
        usedChar :: Word8
usedChar = forall str. StringLike str => str -> Int -> Word8
s_index' str
str Int
quoteIndex

{-# INLINE process #-}
{-# SPECIALISE process :: Process (Identity ()) -> ByteString -> Identity ()
               #-}
{-# SPECIALISE process :: Process (State s ()) -> ByteString -> State s ()
               #-}
{-# SPECIALISE process :: Process (ST s ()) -> ByteString -> ST s ()
               #-}
{-# SPECIALISE process :: Process (IO ()) -> ByteString -> IO ()
               #-}
{-# SPECIALISE process :: Process (Identity ()) -> ByteStringZeroTerminated -> Identity ()
               #-}
{-# SPECIALISE process :: Process (State s ()) -> ByteStringZeroTerminated -> State s ()
               #-}
{-# SPECIALISE process :: Process (ST s ()) -> ByteStringZeroTerminated -> ST s ()
               #-}
{-# SPECIALISE process :: Process (IO ()) -> ByteStringZeroTerminated -> IO ()
               #-}

--------------------------------------------------------------------------------
-- ByteString utilities

-- | /O(1)/ 'ByteString' index (subscript) operator, starting from 0.
s_index :: ByteString -> Int -> Word8
s_index :: ByteString -> Int -> Word8
s_index ByteString
ps Int
n
    | Int
n forall a. Ord a => a -> a -> Bool
< Int
0            = forall a e. Exception e => e -> a
throw (Int -> ByteString -> XenoException
XenoStringIndexProblem Int
n ByteString
ps)
    | Int
n forall a. Ord a => a -> a -> Bool
>= ByteString -> Int
S.length ByteString
ps = forall a e. Exception e => e -> a
throw (Int -> ByteString -> XenoException
XenoStringIndexProblem Int
n ByteString
ps)
    | Bool
otherwise        = ByteString
ps ByteString -> Int -> Word8
`SU.unsafeIndex` Int
n
{-# INLINE s_index #-}

-- | A fast space skipping function.
skipSpaces :: (StringLike str) => str -> Int -> Int
skipSpaces :: forall str. StringLike str => str -> Int -> Int
skipSpaces str
str Int
i =
  if Word8 -> Bool
isSpaceChar (forall str. StringLike str => str -> Int -> Word8
s_index' str
str Int
i)
    then forall str. StringLike str => str -> Int -> Int
skipSpaces str
str (Int
i forall a. Num a => a -> a -> a
+ Int
1)
    else Int
i
{-# INLINE skipSpaces #-}

-- | Get a substring of a string.
substring :: ByteString -> Int -> Int -> ByteString
substring :: ByteString -> Int -> Int -> ByteString
substring ByteString
s Int
start Int
end = Int -> ByteString -> ByteString
S.take (Int
end forall a. Num a => a -> a -> a
- Int
start) (Int -> ByteString -> ByteString
S.drop Int
start ByteString
s)
{-# INLINE substring #-}

-- | Basically @findIndex (not . isNameChar)@, but doesn't allocate.
parseName :: (StringLike str) => str -> Int -> Int
parseName :: forall str. StringLike str => str -> Int -> Int
parseName str
str Int
index =
  if Bool -> Bool
not (Word8 -> Bool
isNameChar1 (forall str. StringLike str => str -> Int -> Word8
s_index' str
str Int
index))
     then Int
index
     else forall str. StringLike str => str -> Int -> Int
parseName' str
str (Int
index forall a. Num a => a -> a -> a
+ Int
1)
{-# INLINE parseName #-}

-- | Basically @findIndex (not . isNameChar)@, but doesn't allocate.
parseName' :: (StringLike str) => str -> Int -> Int
parseName' :: forall str. StringLike str => str -> Int -> Int
parseName' str
str Int
index =
  if Bool -> Bool
not (Word8 -> Bool
isNameChar (forall str. StringLike str => str -> Int -> Word8
s_index' str
str Int
index))
     then Int
index
     else forall str. StringLike str => str -> Int -> Int
parseName' str
str (Int
index forall a. Num a => a -> a -> a
+ Int
1)
{-# INLINE parseName' #-}

-- | Get index of an element starting from offset.
elemIndexFrom :: Word8 -> ByteString -> Int -> Maybe Int
elemIndexFrom :: Word8 -> ByteString -> Int -> Maybe Int
elemIndexFrom Word8
c ByteString
str Int
offset = forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
fmap (forall a. Num a => a -> a -> a
+ Int
offset) (Word8 -> ByteString -> Maybe Int
S.elemIndex Word8
c (Int -> ByteString -> ByteString
S.drop Int
offset ByteString
str))
-- Without the INLINE below, the whole function is twice as slow and
-- has linear allocation. See git commit with this comment for
-- results.
{-# INLINE elemIndexFrom #-}

--------------------------------------------------------------------------------
-- Character types

isSpaceChar :: Word8 -> Bool
isSpaceChar :: Word8 -> Bool
isSpaceChar = forall a. Bits a => a -> Int -> Bool
testBit (Int
0b100000000000000000010011000000000 :: Int) forall b c a. (b -> c) -> (a -> b) -> a -> c
. forall a b. (Integral a, Num b) => a -> b
fromIntegral
--                       |                  |  ||  bits:
--                       |                  |  |+-- 9
--                       |                  |  +--- 10
--                       |                  +------ 13
--                       +------------------------- 32
{-# INLINE isSpaceChar #-}

-- | Is the character a valid first tag/attribute name constituent?
-- 'a'-'z', 'A'-'Z', '_', ':'
isNameChar1 :: Word8 -> Bool
isNameChar1 :: Word8 -> Bool
isNameChar1 Word8
c =
  (Word8
c forall a. Ord a => a -> a -> Bool
>= Word8
97 Bool -> Bool -> Bool
&& Word8
c forall a. Ord a => a -> a -> Bool
<= Word8
122) Bool -> Bool -> Bool
|| (Word8
c forall a. Ord a => a -> a -> Bool
>= Word8
65 Bool -> Bool -> Bool
&& Word8
c forall a. Ord a => a -> a -> Bool
<= Word8
90) Bool -> Bool -> Bool
|| Word8
c forall a. Eq a => a -> a -> Bool
== Word8
95 Bool -> Bool -> Bool
|| Word8
c forall a. Eq a => a -> a -> Bool
== Word8
58
{-# INLINE isNameChar1 #-}

-- isNameCharOriginal :: Word8 -> Bool
-- isNameCharOriginal c =
--   (c >= 97 && c <= 122) || (c >= 65 && c <= 90) || c == 95 || c == 58 ||
--   c == 45 || c == 46 || (c >= 48 && c <= 57)
-- {-# INLINE isNameCharOriginal #-}
--
-- TODO Strange, but highMaskIsNameChar, lowMaskIsNameChar don't calculate fast... FIX IT
-- highMaskIsNameChar, lowMaskIsNameChar :: Word64
-- (highMaskIsNameChar, lowMaskIsNameChar) =
--     foldl (\(hi,low) char -> (hi `setBit` (char - 64), low `setBit` char)) -- NB: `setBit` can process overflowed values (where char < 64; -- TODO fix it
--           (0::Word64, 0::Word64)
--           (map fromIntegral (filter isNameCharOriginal [0..128]))
-- {-# INLINE highMaskIsNameChar #-}
-- {-# INLINE lowMaskIsNameChar #-}

-- | Is the character a valid tag/attribute name constituent?
-- isNameChar1 + '-', '.', '0'-'9'
isNameChar :: Word8 -> Bool
isNameChar :: Word8 -> Bool
isNameChar Word8
char = (Word64
lowMaskIsNameChar forall a. Bits a => a -> Int -> Bool
`testBit` Int
char'low) Bool -> Bool -> Bool
|| (Word64
highMaskIsNameChar forall a. Bits a => a -> Int -> Bool
`testBit` Int
char'high)
   -- TODO 1) change code to use W# instead of Word64
   --      2) Document `ii - 64` -- there is underflow, but `testBit` can process this!
  where
    char'low :: Int
char'low  = forall a b. (Integral a, Num b) => a -> b
fromIntegral Word8
char
    char'high :: Int
char'high = forall a b. (Integral a, Num b) => a -> b
fromIntegral (Word8
char forall a. Num a => a -> a -> a
- Word8
64)
    highMaskIsNameChar :: Word64
    highMaskIsNameChar :: Word64
highMaskIsNameChar = Word64
0b11111111111111111111111111010000111111111111111111111111110
    --                     ------------+------------- |    ------------+-------------
    --                                 |              |                |  bits:
    --                                 |              |                +-- 65-90
    --                                 |              +------------------- 95
    --                                 +---------------------------------- 97-122
    lowMaskIsNameChar :: Word64
    lowMaskIsNameChar :: Word64
lowMaskIsNameChar =  Word64
0b11111111111011000000000000000000000000000000000000000000000
    --                     -----+----- ||
    --                          |      ||  bits:
    --                          |      |+-- 45
    --                          |      +--- 46
    --                          +---------- 48-58
{-# INLINE isNameChar #-}

-- | Char for '\''.
quoteChar :: Word8
quoteChar :: Word8
quoteChar = Word8
39

-- | Char for '"'.
doubleQuoteChar :: Word8
doubleQuoteChar :: Word8
doubleQuoteChar = Word8
34

-- | Char for '='.
equalChar :: Word8
equalChar :: Word8
equalChar = Word8
61

-- | Char for '?'.
questionChar :: Word8
questionChar :: Word8
questionChar = Word8
63

-- | Char for '/'.
slashChar :: Word8
slashChar :: Word8
slashChar = Word8
47

-- | Exclaimation character !.
bangChar :: Word8
bangChar :: Word8
bangChar = Word8
33

-- | The dash character.
commentChar :: Word8
commentChar :: Word8
commentChar = Word8
45 -- '-'

-- | Open tag character.
openTagChar :: Word8
openTagChar :: Word8
openTagChar = Word8
60 -- '<'

-- | Close tag character.
closeTagChar :: Word8
closeTagChar :: Word8
closeTagChar = Word8
62 -- '>'

-- | Open angle bracket character.
openAngleBracketChar :: Word8
openAngleBracketChar :: Word8
openAngleBracketChar = Word8
91

-- | Close angle bracket character.
closeAngleBracketChar :: Word8
closeAngleBracketChar :: Word8
closeAngleBracketChar = Word8
93

-- | Skip initial DOCTYPE declaration
skipDoctype :: ByteString -> ByteString
skipDoctype :: ByteString -> ByteString
skipDoctype ByteString
arg =
    if ByteString
"<!DOCTYPE" ByteString -> ByteString -> Bool
`S8.isPrefixOf` ByteString
bs
      then let (ByteString
_, ByteString
rest)=ByteString
">" ByteString -> ByteString -> (ByteString, ByteString)
`S8.breakSubstring` ByteString
bs
           in ByteString -> ByteString
skipBlanks forall a b. (a -> b) -> a -> b
$ Int -> ByteString -> ByteString
S8.drop Int
1 ByteString
rest
      else ByteString
bs
  where
    bs :: ByteString
bs = ByteString -> ByteString
skipBlanks ByteString
arg
    skipBlanks :: ByteString -> ByteString
skipBlanks = (Char -> Bool) -> ByteString -> ByteString
S8.dropWhile Char -> Bool
isSpace