src/Codec/Xlsx/Parser/Internal/Fast.hs

{-# LANGUAGE BangPatterns #-}
{-# LANGUAGE LambdaCase #-}
{-# LANGUAGE MultiParamTypeClasses #-}
{-# LANGUAGE MultiWayIf #-}
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE TupleSections #-}
module Codec.Xlsx.Parser.Internal.Fast
  ( FromXenoNode(..)
  , collectChildren
  , maybeChild
  , requireChild
  , childList
  , maybeFromChild
  , fromChild
  , fromChildList
  , maybeParse
  , requireAndParse
  , childListAny
  , maybeElementVal
  , toAttrParser
  , parseAttributes
  , FromAttrBs(..)
  , unexpectedAttrBs
  , maybeAttrBs
  , maybeAttr
  , fromAttr
  , fromAttrDef
  , contentBs
  , contentX
  , nsPrefixes
  , addPrefix
  ) where

import Control.Applicative
import Control.Arrow (second)
import Control.Exception (Exception, throw)
import Control.Monad (ap, forM, join, liftM)
import Data.ByteString (ByteString)
import qualified Data.ByteString as BS
import qualified Data.ByteString.Unsafe as SU
import Data.Maybe
import Data.Monoid
import Data.Text (Text)
import qualified Data.Text as T
import qualified Data.Text.Encoding as T
import Data.Word (Word8)
import Xeno.DOM hiding (parse)

import Codec.Xlsx.Parser.Internal.Util

class FromXenoNode a where
  fromXenoNode :: Node -> Either Text a

newtype ChildCollector a = ChildCollector
  { runChildCollector :: [Node] -> Either Text ([Node], a)
  }

instance Functor ChildCollector where
  fmap f a = ChildCollector $ \ns ->
    second f <$> runChildCollector a ns

instance Applicative ChildCollector where
  pure a = ChildCollector $ \ns ->
    return (ns, a)
  cf <*> ca = ChildCollector $ \ns -> do
    (ns', f) <- runChildCollector cf ns
    (ns'', a) <- runChildCollector ca ns'
    return (ns'', f a)

instance Alternative ChildCollector where
  empty = ChildCollector $ \_ -> Left "ChildCollector.empty"
  ChildCollector f <|> ChildCollector g = ChildCollector $ \ns ->
    either (const $ g ns) Right (f ns)

instance Monad ChildCollector where
  return = pure
  ChildCollector f >>= g = ChildCollector $ \ns ->
    either Left (\(!ns', f') -> runChildCollector (g f') ns') (f ns)

toChildCollector :: Either Text a -> ChildCollector a
toChildCollector unlifted =
  case unlifted of
    Right a -> return a
    Left e -> ChildCollector $ \_ -> Left e

collectChildren :: Node -> ChildCollector a -> Either Text a
collectChildren n c = snd <$> runChildCollector c (children n)

maybeChild :: ByteString -> ChildCollector (Maybe Node)
maybeChild nm =
  ChildCollector $ \case
    (n:ns)
      | name n == nm -> pure (ns, Just n)
    ns -> pure (ns, Nothing)

requireChild :: ByteString -> ChildCollector Node
requireChild nm =
  ChildCollector $ \case
    (n:ns)
      | name n == nm -> pure (ns, n)
    _ ->
      Left $ "required element " <> T.pack (show nm) <> " was not found"

childList :: ByteString -> ChildCollector [Node]
childList nm = do
  mNode <- maybeChild nm
  case mNode of
    Just n -> (n:) <$> childList nm
    Nothing -> return []

maybeFromChild :: (FromXenoNode a) => ByteString -> ChildCollector (Maybe a)
maybeFromChild nm = do
  mNode <- maybeChild nm
  mapM (toChildCollector . fromXenoNode) mNode

fromChild :: (FromXenoNode a) => ByteString -> ChildCollector a
fromChild nm = do
  n <- requireChild nm
  case fromXenoNode n of
    Right a -> return a
    Left e -> ChildCollector $ \_ -> Left e

fromChildList :: (FromXenoNode a) => ByteString -> ChildCollector [a]
fromChildList nm = do
  mA <- maybeFromChild nm
  case mA of
    Just a -> (a:) <$> fromChildList nm
    Nothing -> return []

maybeParse :: ByteString -> (Node -> Either Text a) -> ChildCollector (Maybe a)
maybeParse nm parse = maybeChild nm >>= (toChildCollector . mapM parse)

requireAndParse :: ByteString -> (Node -> Either Text a) -> ChildCollector a
requireAndParse nm parse = requireChild nm >>= (toChildCollector . parse)

childListAny :: (FromXenoNode a) => Node -> Either Text [a]
childListAny = mapM fromXenoNode . children

maybeElementVal :: (FromAttrBs a) => ByteString -> ChildCollector (Maybe a)
maybeElementVal nm = do
  mN <- maybeChild nm
  fmap join . forM mN $ \n ->
    toChildCollector . parseAttributes n $ maybeAttr "val"

-- Stolen from XML Conduit
newtype AttrParser a = AttrParser
  { runAttrParser :: [(ByteString, ByteString)] -> Either Text ( [( ByteString
                                                                  , ByteString)]
                                                               , a)
  }

instance Monad AttrParser where
  return a = AttrParser $ \as -> Right (as, a)
  (AttrParser f) >>= g =
    AttrParser $ \as ->
      either Left (\(as', f') -> runAttrParser (g f') as') (f as)
instance Applicative AttrParser where
    pure = return
    (<*>) = ap
instance Functor AttrParser where
  fmap = liftM

attrError :: Text -> AttrParser a
attrError err = AttrParser $ \_ -> Left err

toAttrParser :: Either Text a -> AttrParser a
toAttrParser unlifted =
  case unlifted of
    Right a -> return a
    Left e -> AttrParser $ \_ -> Left e

maybeAttrBs :: ByteString -> AttrParser (Maybe ByteString)
maybeAttrBs attrName = AttrParser $ go id
  where
    go front [] = Right (front [], Nothing)
    go front (a@(nm, val):as) =
      if nm == attrName
        then Right (front as, Just val)
        else go (front . (:) a) as

requireAttrBs :: ByteString -> AttrParser ByteString
requireAttrBs nm = do
  mVal <- maybeAttrBs nm
  case mVal of
    Just val -> return val
    Nothing -> attrError $ "attribute " <> T.pack (show nm) <> " is required"

unexpectedAttrBs :: Text -> ByteString -> Either Text a
unexpectedAttrBs typ val =
  Left $ "Unexpected value for " <> typ <> ": " <> T.pack (show val)

fromAttr :: FromAttrBs a => ByteString -> AttrParser a
fromAttr nm = do
  bs <- requireAttrBs nm
  toAttrParser $ fromAttrBs bs

maybeAttr :: FromAttrBs a => ByteString -> AttrParser (Maybe a)
maybeAttr nm = do
  mBs <- maybeAttrBs nm
  forM mBs (toAttrParser . fromAttrBs)

fromAttrDef :: FromAttrBs a => ByteString -> a -> AttrParser a
fromAttrDef nm defVal = fromMaybe defVal <$> maybeAttr nm

parseAttributes :: Node -> AttrParser a -> Either Text a
parseAttributes n attrParser =
  case runAttrParser attrParser (attributes n) of
    Left e -> Left e
    Right (_, a) -> return a

class FromAttrBs a where
  fromAttrBs :: ByteString -> Either Text a

instance FromAttrBs ByteString where
  fromAttrBs = pure

instance FromAttrBs Bool where
    fromAttrBs x | x == "1" || x == "true"  = return True
                 | x == "0" || x == "false" = return False
                 | otherwise                = unexpectedAttrBs "boolean" x

instance FromAttrBs Int where
  -- it appears that parser in text is more optimized than the one in
  -- attoparsec at least as of text-1.2.2.2 and attoparsec-0.13.1.0
  fromAttrBs = decimal . T.decodeLatin1

instance FromAttrBs Double where
  -- as for rationals
  fromAttrBs = rational . T.decodeLatin1

instance FromAttrBs Text where
  fromAttrBs = replaceEntititesBs

replaceEntititesBs :: ByteString -> Either Text Text
replaceEntititesBs str =
  T.decodeUtf8 . BS.concat <$> findAmp 0
  where
    findAmp :: Int -> Either Text [ByteString]
    findAmp index =
      case elemIndexFrom ampersand str index of
        Nothing -> if BS.null text then return [] else return [text]
          where text = BS.drop index str
        Just fromAmp ->
          if BS.null text
             then checkEntity fromAmp
             else (text:) <$> checkEntity fromAmp
          where text = substring str index fromAmp
    checkEntity index =
      case elemIndexFrom semicolon str index of
        Just fromSemi | fromSemi >= index + 3 -> do
                          entity <- checkElementVal (index + 1) (fromSemi - index - 1)
                          (BS.singleton entity:) <$> findAmp (fromSemi + 1)
        _ -> Left "Unending entity"
    checkElementVal index len =
      if | len == 2
           && s_index this 0 == 108 -- l
           && s_index this 1 == 116 -- t
            -> return 60 -- '<'
         | len == 2
           && s_index this 0 == 103 -- g
           && s_index this 1 == 116 -- t
            -> return 62 -- '>'
         | len == 3
           && s_index this 0 ==  97 -- a
           && s_index this 1 == 109 -- m
           && s_index this 2 == 112 -- p
            -> return 38 -- '&'
         | len == 4
           && s_index this 0 == 113 -- q
           && s_index this 1 == 117 -- u
           && s_index this 2 == 111 -- o
           && s_index this 3 == 116 -- t
            -> return 34 -- '"'
         | len == 4
           && s_index this 0 ==  97 -- a
           && s_index this 1 == 112 -- p
           && s_index this 2 == 111 -- o
           && s_index this 3 == 115 -- s
           -> return 39 -- '\''
         | otherwise -> Left $ "Bad entity " <> T.pack (show $ (substring str (index-1) (index+len+1)))
      where
        this = BS.drop index str
    ampersand = 38
    semicolon = 59

data EntityReplaceException = EntityReplaceException deriving Show

instance Exception EntityReplaceException

-- | /O(1)/ 'ByteString' index (subscript) operator, starting from 0.
s_index :: ByteString -> Int -> Word8
s_index ps n
    | n < 0             = throw EntityReplaceException
    | n >= BS.length ps = throw EntityReplaceException
    | otherwise         = ps `SU.unsafeIndex` n
{-# INLINE s_index #-}

-- | Get index of an element starting from offset.
elemIndexFrom :: Word8 -> ByteString -> Int -> Maybe Int
elemIndexFrom c str offset = fmap (+ offset) (BS.elemIndex c (BS.drop offset str))
-- Without the INLINE below, the whole function is twice as slow and
-- has linear allocation. See git commit with this comment for
-- results.
{-# INLINE elemIndexFrom #-}

-- | Get a substring of a string.
substring :: ByteString -> Int -> Int -> ByteString
substring s start end = BS.take (end - start) (BS.drop start s)
{-# INLINE substring #-}

newtype NsPrefixes = NsPrefixes [(ByteString, ByteString)]

nsPrefixes :: Node -> NsPrefixes
nsPrefixes root =
  NsPrefixes . flip mapMaybe (attributes root) $ \(nm, val) ->
    (val, ) <$> BS.stripPrefix "xmlns:" nm

addPrefix :: NsPrefixes -> ByteString -> (ByteString -> ByteString)
addPrefix (NsPrefixes prefixes) ns =
  maybe id (\prefix nm -> BS.concat [prefix, ":", nm]) $ Prelude.lookup ns prefixes

contentBs :: Node -> ByteString
contentBs n = BS.concat . map toBs $ contents n
  where
    toBs (Element _) = BS.empty
    toBs (Text bs) = bs
    toBs (CData bs) = bs

contentX :: Node -> Either Text Text
contentX = replaceEntititesBs . contentBs