{-# LANGUAGE BangPatterns #-} {-# LANGUAGE FlexibleContexts #-} {-# LANGUAGE FlexibleInstances #-} {-# LANGUAGE InstanceSigs #-} {-# LANGUAGE MultiParamTypeClasses #-} module HaskellWorks.Data.Xml.Succinct.Index ( XmlIndex(..) , XmlIndexAt(..) ) where import Control.Arrow import Data.Monoid import HaskellWorks.Data.Bits.BitWise import HaskellWorks.Data.Drop import HaskellWorks.Data.Positioning import HaskellWorks.Data.RankSelect.Base.Rank0 import HaskellWorks.Data.RankSelect.Base.Rank1 import HaskellWorks.Data.RankSelect.Base.Select1 import HaskellWorks.Data.TreeCursor import HaskellWorks.Data.Uncons import HaskellWorks.Data.Xml.CharLike import HaskellWorks.Data.Xml.Grammar import HaskellWorks.Data.Xml.Succinct import Prelude hiding (drop) import qualified Data.Attoparsec.ByteString.Char8 as ABC import qualified Data.ByteString as BS import qualified Data.List as L import qualified HaskellWorks.Data.BalancedParens as BP data XmlIndex = XmlIndexDocument [XmlIndex] | XmlIndexElement String [XmlIndex] | XmlIndexCData BS.ByteString | XmlIndexComment BS.ByteString | XmlIndexMeta String [XmlIndex] | XmlIndexAttrList [XmlIndex] | XmlIndexValue BS.ByteString | XmlIndexAttrName BS.ByteString | XmlIndexAttrValue BS.ByteString | XmlIndexError String deriving (Eq, Show) data XmlIndexState = InAttrList | InElement | Unknown deriving (Eq, Show) class XmlIndexAt a where xmlIndexAt :: a -> XmlIndex pos :: (Select1 v, Rank1 w) => XmlCursor t v w -> Position pos c = lastPositionOf (select1 (interests c) (rank1 (balancedParens c) (cursorRank c))) remText :: (Drop v, Select1 v1, Rank1 w) => XmlCursor v v1 w -> v remText c = drop (toCount (pos c)) (cursorText c) instance (BP.BalancedParens w, Rank0 w, Rank1 w, Select1 v, TestBit w) => XmlIndexAt (XmlCursor BS.ByteString v w) where xmlIndexAt :: XmlCursor BS.ByteString v w -> XmlIndex xmlIndexAt = getIndexAt Unknown getIndexAt :: (BP.BalancedParens w, Rank0 w, Rank1 w, Select1 v, TestBit w) => XmlIndexState -> XmlCursor BS.ByteString v w -> XmlIndex getIndexAt state k = case uncons remainder of Just (!c, cs) | isElementStart c -> parseElem cs Just (!c, _ ) | isSpace c -> XmlIndexAttrList $ mapValuesFrom InAttrList (firstChild k) Just (!c, _ ) | isAttribute && isQuote c -> XmlIndexAttrValue remainder Just _ | isAttribute -> XmlIndexAttrName remainder Just _ -> XmlIndexValue remainder Nothing -> XmlIndexError "End of data" where remainder = remText k mapValuesFrom s = L.unfoldr (fmap (getIndexAt s &&& nextSibling)) isAttribute = case state of InAttrList -> True InElement -> False Unknown -> case remText <$> parent k >>= uncons of Just (!c, _) | isSpace c -> True _ -> False parseElem bs = case ABC.parse parseXmlElement bs of ABC.Fail {} -> decodeErr "Unable to parse element name" bs ABC.Partial _ -> decodeErr "Unexpected end of string" bs ABC.Done i r -> case r of XmlElementTypeCData -> XmlIndexCData i XmlElementTypeComment -> XmlIndexComment i XmlElementTypeMeta s -> XmlIndexMeta s (mapValuesFrom InElement $ firstChild k) XmlElementTypeElement s -> XmlIndexElement s (mapValuesFrom InElement $ firstChild k) XmlElementTypeDocument -> XmlIndexDocument (mapValuesFrom InElement (firstChild k) <> mapValuesFrom InElement (nextSibling k)) decodeErr :: String -> BS.ByteString -> XmlIndex decodeErr reason bs = XmlIndexError $ reason <>": " <> show (BS.take 20 bs) <> "...'"