{-# language BangPatterns #-} {-# language LambdaCase #-} {-# language MagicHash #-} {-# language NamedFieldPuns #-} module Xml ( Node(..) , Content(..) , Attribute(..) , decode ) where import Data.Word (Word8) import Data.Bytes (Bytes) import Data.Text.Short (ShortText) import Data.Primitive (SmallArray) import Data.Bytes.Parser (Parser) import GHC.Exts (Char(C#),Char#) import Data.Chunks (Chunks) import Data.Builder.ST (Builder) import qualified Data.Chunks as Chunks import qualified Data.Text.Short as TS import qualified Data.Text.Short.Unsafe as TS import qualified Data.Builder.ST as Builder import qualified Data.Bytes as Bytes import qualified Data.Bytes.Parser as Parser import qualified Data.Bytes.Parser.Rebindable as R import qualified Data.Bytes.Parser.Latin as Latin import qualified Data.Bytes.Parser.Unsafe as Unsafe import qualified Data.Bytes.Parser.Utf8 as Utf8 data Node = Text !ShortText | Element {-# UNPACK #-} !Content deriving (Show,Eq) data Content = Content { tag :: !ShortText , attributes :: !(SmallArray Attribute) , children :: !(SmallArray Node) } deriving (Show,Eq) data Attribute = Attribute { name :: !ShortText , value :: !ShortText } deriving (Show,Eq) decode :: Bytes -> Maybe Node decode !b = Parser.parseBytesMaybe elementNodeParser b elementNodeParser :: Parser () s Node elementNodeParser = do Latin.char () '<' btag <- Parser.takeWhile (\w -> not (isXmlSpace w) && w /= 0x3E && w /= 0x2F) case Bytes.length btag of 0 -> Parser.fail () _ -> pure () tag <- case TS.fromShortByteString (Bytes.toShortByteStringClone btag) of Nothing -> Parser.fail () Just ttag -> pure ttag -- Note that parserAttributes consumes leading and trailing whitespace. attrs <- parserAttributes =<< Parser.effect Builder.new let !attributes = Chunks.concat attrs Latin.any () >>= \case '>' -> do nodes <- childrenParser tag pure (Element Content{tag,attributes,children=Chunks.concat nodes}) '/' -> do Latin.char () '>' pure (Element Content{tag,attributes,children=mempty}) _ -> Parser.fail () textNodeParser :: Parser () s Node textNodeParser = do raw <- Parser.takeWhile (\w -> w /= 0x3C) case Bytes.any (\w -> w > 0x7F || w == 0x26) raw of True -> Parser.fail () -- TODO: escape or check UTF-8 encoding here instead False -> pure (Text (TS.fromShortByteStringUnsafe (Bytes.toShortByteStringClone raw))) -- This eats the closing tag as well. childrenParser :: ShortText -- opening tag name, looking for a closing tag that matches -> Parser () s (Chunks Node) childrenParser !tag = do b0 <- Parser.effect Builder.new childrenParserLoop tag b0 childrenParserLoop :: ShortText -- opening tag name, looking for a closing tag that matches -> Builder s Node -> Parser () s (Chunks Node) childrenParserLoop !tag !b0 = Latin.any () >>= \case '<' -> Latin.any () >>= \case '/' -> do Utf8.shortText () tag Parser.skipWhile isXmlSpace Latin.char () '>' Parser.effect (Builder.freeze b0) _ -> do Unsafe.unconsume 2 node <- elementNodeParser b1 <- Parser.effect (Builder.push node b0) childrenParserLoop tag b1 _ -> do Unsafe.unconsume 1 node <- textNodeParser b1 <- Parser.effect (Builder.push node b0) childrenParserLoop tag b1 isXmlSpace :: Word8 -> Bool isXmlSpace = \case 0x20 -> True 0x09 -> True 0x0D -> True 0x0A -> True _ -> False parserAttributes :: Builder s Attribute -> Parser () s (Chunks Attribute) parserAttributes !b0 = do Parser.skipWhile isXmlSpace peekIsNameStartChar >>= \case True -> do attr <- parserAttribute b1 <- Parser.effect (Builder.push attr b0) parserAttributes b1 False -> do Parser.skipWhile isXmlSpace Parser.effect (Builder.freeze b0) -- From the spec, we have: -- Attribute ::= Name Eq AttValue -- Eq ::= S? '=' S? -- Name ::= NameStartChar (NameChar)* -- -- Precondition A: The first character is a NameStartChar. This parser -- does not check this. parserAttribute :: Parser () s Attribute parserAttribute = do bname <- Parser.takeWhile (\w -> not (isXmlSpace w) && w /= 0x3D) -- We may assume that length of bname is at least one because of -- precondition A. !name <- case TS.fromShortByteString (Bytes.toShortByteStringClone bname) of Nothing -> Parser.fail () Just tname -> pure tname Parser.skipWhile isXmlSpace Latin.char () '=' Parser.skipWhile isXmlSpace !value <- parserAttributeValue pure Attribute{name,value} -- TODO: This is woefully incomplete parserAttributeValue :: Parser () s ShortText parserAttributeValue = do Latin.any () >>= \case '"' -> do bval <- Parser.takeWhile (\w -> w /= 0x22) Latin.char () '"' case TS.fromShortByteString (Bytes.toShortByteStringClone bval) of Nothing -> Parser.fail () Just tval -> pure tval '\'' -> do bval <- Parser.takeWhile (\w -> w /= 0x27) Latin.char () '\'' case TS.fromShortByteString (Bytes.toShortByteStringClone bval) of Nothing -> Parser.fail () Just tval -> pure tval _ -> Parser.fail () peekIsNameStartChar :: Parser () s Bool peekIsNameStartChar = Unsafe.cursor R.>>= \pos -> Utf8.any# () R.>>= \c -> Unsafe.jump pos R.>>= \_ -> R.pure (isNameStartChar c) isNameStartChar :: Char# -> Bool isNameStartChar c = case C# c of ':' -> True '_' -> True _ | C# c >= 'A' && C# c <= 'Z' -> True _ | C# c >= 'a' && C# c <= 'z' -> True _ -> False