{-#LANGUAGE TypeSynonymInstances #-} {-#LANGUAGE FlexibleInstances #-} {-#LANGUAGE OverlappingInstances #-} module HtmlPage ( -- * Types HtmlPage, Tag(..), TagId, Attributes, -- * Content functions emptyContent, emptyPage, (?), (?=), (<<+), (<+), (<<), -- * Attributes functions emptyAttributes, attributeList, (@+), (@?), -- * Cursor functions tagId, select, nextTagId, parentTagId, -- * IO functions fromFile, ) where import Data.List (find) import Data.Text (unpack, strip, pack, toLower) ------------------------------------------------------------------------------ -- Types -- ------------------------------------------------------------------------------ -- | An HTML page. type HtmlPage = Content -- | An HTML tag. data Tag = PairedTag { tagName :: String, attributes :: Attributes, content :: Content } | UnpairedTag { tagName :: String, attributes :: Attributes } | Doctype { dtdInformation :: String } | Comment { commentContent :: String } -- | An attribute of a tag. data Attribute = Attribute { attributeName :: String, attributeValue :: String } -- | A content (a list containing strings and tags). type Content = [Either String Tag] -- | A list of attributes. type Attributes = [Attribute] -- | The id of a tag in the page (i.e. a cursor). type TagId = [Int] ------------------------------------------------------------------------------ -- Library functions -- ------------------------------------------------------------------------------ instance Read Content where readsPrec _ string | null string = [(emptyContent, "")] | otherwise = [getContent string emptyContent] where -- | Returns the content of a tag from a given string. getContent :: String -> Content -> (Content, String) getContent "" content = (content, "") getContent ('<' : '/' : after) content = (content, snd (splitFirst after '>')) getContent ('<' : after) content | isScript && scriptEmpty = getContent afterScript (content <<+ tag) | isScript && not scriptEmpty = getContent afterScript (content <<+ (tag << (emptyContent <+ script))) | otherwise = case tag of PairedTag {} -> getContent afterTag $ content <<+ (tag << tagContent) _ -> getContent rest (content <<+ tag) where isScript = after `startsWith` "script" scriptEmpty = null (trim script) tag = read tagString :: Tag (tagString, rest) = getTagString 0 "" ('<' : after) (tagContent, afterTag) = getContent rest emptyContent (script, afterScript) = getScript 0 rest getContent string content | contentEmpty && null after = getContent "" content | contentEmpty && (not . null) after = getContent ('<' : after) content | otherwise = getContent ('<' : after) (content <+ contentString) where (contentString, after) = splitFirst string '<' contentEmpty = null (trim contentString) instance Read Tag where readsPrec _ "" = noParse readsPrec _ string = [getTag (trim string)] where -- | Returns a tag from a given string. getTag :: String -> (Tag, String) getTag ('<' : string) | last tagContent == '/' = (UnpairedTag tagName attributesUnpairedTag, "") | string `startsWith` "!--" = (Comment contentComment, "") | string `startsWith` "!doctype" = (Doctype dtdInformation, "") | otherwise = (PairedTag tagName attributesPairedTag emptyContent, "") where tagContent = take (length string - 1) string (tagName, attributes) = if ' ' `elem` tagContent then splitFirst tagContent ' ' else splitFirst tagContent '/' attributesPairedTag = read attributes :: Attributes attributesUnpairedTag = read (take (length attributes - 1) attributes) :: Attributes contentComment = drop 3 (take (length string - 3) string) (_, dtdInformation) = splitFirst_ (dropSpaces attributes) " PUBLIC " getTag _ = noParse instance Read Attribute where readsPrec _ string | '=' `elem` string = [(Attribute name value, "")] | otherwise = noParse where (name, valueInQuotes) = splitFirst (trim string) '=' value = case split valueInQuotes '\"' of [] -> [] value' -> head value' instance Read Attributes where readsPrec _ string = [(getAttributes emptyAttributes "" string, "")] where -- | Returns a list of attributes from a given string. getAttributes :: Attributes -> String -> String -> Attributes getAttributes attributes attribute "" = attributes getAttributes attributes attribute ('\"' : string) = getAttributes (attributes @+ attribute') "" (dropSpaces rest) where (value, rest) = splitFirst string '\"' attribute' = read (attribute ++ "\"" ++ value ++ "\"") :: Attribute getAttributes attributes attribute ('\'' : string) = getAttributes (attributes @+ attribute') "" (dropSpaces rest) where (value, rest) = splitFirst string '\'' attribute' = read (attribute ++ "\'" ++ value ++ "\'") :: Attribute getAttributes attributes attribute (char : string) = getAttributes attributes (attribute ++ [char]) string instance Show Attribute where show (Attribute name value) = name ++ "=\"" ++ value ++ "\"" instance Show Attributes where show attributes = unwords [show attribute | attribute <- attributes] instance Show Tag where show (PairedTag name [] content) = "<" ++ name ++ ">" ++ show content ++ "" show (PairedTag name attributes content) = "<" ++ name ++ " " ++ show attributes ++ ">" ++ show content ++ "" show (UnpairedTag name []) = "<" ++ name ++ " />" show (UnpairedTag name attributes) = "<" ++ name ++ " " ++ show attributes ++ " />" show (Doctype []) = "" show (Doctype dtdInformation) = "" show (Comment content) = "" instance Show Content where show [] = "" show (Left string : content) = string ++ show (content :: Content) show (Right tag : content) = show tag ++ show (content :: Content) instance Show TagId where show [] = "" show [index] = show index show (index : indices) = show index ++ "." ++ show (indices :: TagId) -- | Returns an empty content. emptyContent :: Content emptyContent = [] -- | Returns an empty HTML page. emptyPage :: HtmlPage emptyPage = emptyContent -- | Returns the tag/string at the given position (TagId) in a content. (?) :: Content -> TagId -> Either String Tag (?) _ [] = tagNotInPage (?) content [index] | index < length content = content !! index | otherwise = tagNotInPage (?) content (index : indices) = case content !! index of Right tag @ (PairedTag _ _ content') -> content' ? indices _ -> tagNotInPage -- | Replaces the tag/string at the given position (TagId) in a content. (?=) :: Content -> (TagId, Either String Tag) -> Content (?=) _ ([], _) = impossibleToAddTag (?=) content ([index], stringOrTag) | index < length content = content !!= (index, stringOrTag) | index == length content = case stringOrTag of Left string -> content <+ string Right tag -> content <<+ tag | otherwise = impossibleToAddTag (?=) content (index : indices, stringOrTag) | index < length content = case content !! index of Right tag @ (PairedTag _ _ content') -> content !!= (index, Right $ tag << (content' ?= (indices, stringOrTag))) _ -> impossibleToAddTag | otherwise = impossibleToAddTag -- | Adds a tag to a content. (<<+) :: Content -> Tag -> Content (<<+) content tag = content ++ [Right tag] -- | Adds a string to a content. (<+) :: Content -> String -> Content (<+) _ "" = stringMustNotBeEmpty (<+) content string = content ++ [Left string] -- | Replaces the content of the given tag by the given content. (<<) :: Tag -> Content -> Tag (<<) (PairedTag name attributes _) = PairedTag name attributes (<<) (UnpairedTag _ _) = unpairedTagsDoNotHaveAContent (<<) (Doctype _) = doctypesDoNotHaveAContent (<<) (Comment _) = commentsDoNotHaveAContent -- | Returns a TagId from a given string. tagId :: String -> TagId tagId string = [read index :: Int | index <- split string '.'] -- | Returns a tag matching the given selector string from the given page. -- A selector is composed of a tag name and is optionally followed by a hash -- and an HTML id name. select :: HtmlPage -> String -> (TagId, Tag) select page selector = case find (tagMatches (null after)) (tagList page) of Just tagIdAndTag -> tagIdAndTag _ -> noSuchTag where (tagString, after) = splitFirst selector '#' tagMatches _ (_, Comment _) = False tagMatches _ (_, Doctype _) = False tagMatches True (_, tag) = tagString == tagName tag tagMatches False (_, tag) = tagString == tagName tag && (attributes tag @? "id" == after) -- | Returns the next TagId from a given TagId. nextTagId :: TagId -> Bool -> TagId nextTagId tagId True = tagId ++ [0] nextTagId tagId False = take (length tagId - 1) tagId ++ [last tagId + 1] -- | Returns the parent TagId of the given TagId. parentTagId :: TagId -> TagId parentTagId tagId = take (length tagId - 1) tagId -- | Returns an empty attribute list. emptyAttributes :: Attributes emptyAttributes = [] -- | Returns a list of attributes from a given list of attribute names and -- values. attributeList :: [(String, String)] -> Attributes attributeList namesAndValues = attributes' namesAndValues emptyAttributes where attributes' [] attributes = attributes attributes' ((name, value) : namesAndValues') attributes = attributes' namesAndValues' (attributes @+ Attribute name value) -- | Adds an attribute to an attribute list. (@+) :: Attributes -> Attribute -> Attributes (@+) attributes attribute = attributes ++ [attribute] -- | Returns the value of the attribute matching the given name. (@?) :: Attributes -> String -> String (@?) attributes name = case find (\attribute -> attributeName attribute == name) attributes of Just (Attribute _ value) -> value _ -> "" -- | Reads a file and produces an HtmlPage. fromFile :: FilePath -> IO HtmlPage fromFile filePath = do string <- readFile filePath return (read string :: Content) ------------------------------------------------------------------------------ -- Helper functions -- ------------------------------------------------------------------------------ -- | Replaces, in a given list, the value at the given index with a new given -- value. (!!=) :: [a] -> (Int, a) -> [a] (!!=) [] _ = emptyList (!!=) list (index, value) | index > (length list - 1) = indexTooLarge | index < 0 = negativeIndex | otherwise = take index list ++ [value] ++ drop (index + 1) list -- | Returns the content of a script tag. getScript :: Int -> String -> (String, String) getScript _ "" = ("", "") getScript numberOfQuotesBefore string = if isPair (numberOfQuotesInScript + numberOfQuotesBefore) then (script, dropSpaces afterScript) else (script ++ script', rest') where (script, rest) = splitFirst_ string "' numberOfQuotesInScript = numberOfQuotes script (script', rest') = getScript (numberOfQuotesBefore + numberOfQuotesInScript) rest -- | Returns the tag at the beginning of the string. getTagString :: Int -> String -> String -> (String, String) getTagString _ _ "" = ("", "") getTagString numberOfQuotesBefore suffix string | null suffix && string `startsWith` "" string | null suffix && string `startsWith` "<" = getTagString numberOfQuotesBefore ">" string | isPair (numberOfQuotesInTag + numberOfQuotesBefore) = (tag ++ suffix, rest) | otherwise = (tag ++ tag', rest') where (tag, rest) = splitFirst_ string suffix numberOfQuotesInTag = numberOfQuotes tag (tag', rest') = getTagString (numberOfQuotesBefore + numberOfQuotesInTag) suffix rest -- | Drops the spaces at the beginning of a string. dropSpaces :: String -> String dropSpaces (' ' : string) = dropSpaces string dropSpaces ('\n' : string) = dropSpaces string dropSpaces ('\r' : string) = dropSpaces string dropSpaces ('\t' : string) = dropSpaces string dropSpaces string = string -- | Splits a string in two with a given delimiting character. splitFirst :: String -> Char -> (String, String) splitFirst string delimiter = (before, drop 1 after) where (before, after) = break (== delimiter) string -- | Splits a string in two with a given delimiting string. splitFirst_ :: String -> String -> (String, String) splitFirst_ string delimiter = splitFirst' "" string where delimiterLength = length delimiter splitFirst' before after | take delimiterLength after == delimiter = (before, drop delimiterLength after) | null after = (before, "") | otherwise = splitFirst' (before ++ take 1 after) (drop 1 after) -- | Splits a string with a given delimiting character. split :: String -> Char -> [String] split string delimiter = split' [] string where split' before "" = before split' before after = if null before' then split' before after' else split' (before ++ [before']) after' where (before', after') = splitFirst after delimiter -- | Removes the spaces at the beginning and at the end of a string. trim :: String -> String trim string = unpack . strip $ pack string -- | Returns the number of quotes in a string. numberOfQuotes :: String -> Int numberOfQuotes string = quotes string False False where quotes "" _ _ = 0 quotes ('\"' : after) False False = 1 + quotes after True False quotes ('\'' : after) False False = 1 + quotes after False True quotes ('\"' : after) True _ = 1 + quotes after False False quotes ('\'' : after) True _ = quotes after True False quotes ('\"' : after) _ True = quotes after False True quotes ('\'' : after) _ True = 1 + quotes after False False quotes (char : after) inDoubleQuotes inSingleQuotes = quotes after inDoubleQuotes inSingleQuotes -- | Indicates whether the given integer is pair. isPair :: Int -> Bool isPair int = int `mod` 2 == 0 -- | Indicates whether the first given string starts with the second given -- string (not case sensitive). startsWith :: String -> String -> Bool startsWith _ [] = True startsWith [] _ = False startsWith firstString secondString = firstText == secondText where firstString' = take (length secondString) firstString firstText = toLower $ pack firstString' secondText = toLower $ pack secondString -- | Returns a list containing all the tags (and all their associated TagId) of -- a given Content. tagList :: Content -> [(TagId, Tag)] tagList content' = tagList' content' [0] where tagList' [] _ = [] tagList' (Left _ : rest) currentTagId = restOfList rest currentTagId tagList' (Right tag : rest) currentTagId = case tag of PairedTag {} -> (currentTagId, tag) : contentList ++ restOfList rest currentTagId _ -> (currentTagId, tag) : restOfList rest currentTagId where contentList = tagList' (content tag) (nextTagId currentTagId True) restOfList content tagId = tagList' content (nextTagId tagId False) ------------------------------------------------------------------------------ -- Errors -- ------------------------------------------------------------------------------ noParse :: a noParse = error "HtmlPage.read: no parse" tagNotInPage :: a tagNotInPage = error "HtmlPage.?: this tag does not exist in the page" impossibleToAddTag :: a impossibleToAddTag = error "HtmlPage.?=: impossible to add the tag" noSuchTag :: a noSuchTag = error "HtmlPage.tagId: there is no such tag in the given page" emptyList :: a emptyList = error "HtmlPage.!!=: empty list" indexTooLarge :: a indexTooLarge = error "HtmlPage.!!=: index too large" negativeIndex :: a negativeIndex = error "HtmlPage.!!=: negative index" unpairedTagsDoNotHaveAContent :: a unpairedTagsDoNotHaveAContent = error "HtmlPage.<<: unpaired tags do not have a content" doctypesDoNotHaveAContent :: a doctypesDoNotHaveAContent = error "HtmlPage.<<: doctypes do not have a content" commentsDoNotHaveAContent :: a commentsDoNotHaveAContent = error "HtmlPage.<<: comments do not have a content" stringMustNotBeEmpty :: a stringMustNotBeEmpty = error "HtmlPage.<+: the string must not be empty"