{-# LANGUAGE OverloadedStrings #-}
module Haquery (
Attrs(), Tag(..), Child(..),
render,
tag,
doctype,
html,
head',
body,
div',
text,
form,
input,
article,
a,
title,
link,
script,
cat,
(-.),
id',
class',
src,
href,
type',
style,
alter,
remove,
select,
attrs,
attr,
children,
name,
append,
prepend,
addClass,
removeClass,
hasClass,
toggleClass,
parseHtml,
matches,
parseSelector,
example
) where
import qualified Data.Text as T
import qualified Data.List as L
import qualified Data.Map as M
import qualified Text.ParserCombinators.Parsec as P
import qualified Text.ParserCombinators.Parsec.Expr as E
import qualified Text.ParserCombinators.Parsec.Pos as Pos
import qualified Text.Parsec.Prim as Pr
import qualified Data.List.Split as Spl
import qualified Data.Functor.Identity as I
import qualified Text.HTML.TagSoup as TS
import qualified Text.HTML.TagSoup.Tree as TT
import qualified Data.Char as C
data Attrs = Attrs (M.Map T.Text T.Text) deriving (Eq)
toAttrs :: [(T.Text, T.Text)] -> Attrs
toAttrs l = Attrs $ M.fromList l
attrsToMap :: Attrs -> M.Map T.Text T.Text
attrsToMap (Attrs at) = at
index :: Tag -> Int
index t = case t of
Doctype i _ -> f i
Text i _ -> f i
Tag i _ _ _ -> f i
where
f x = if length x == 0
then -1
else last x
indexes :: Tag -> [Int]
indexes t = case t of
Doctype i _ -> i
Text i _ -> i
Tag i _ _ _ -> i
setIndex :: Int -> Tag -> Tag
setIndex i t = case t of
Doctype ind te -> Doctype (i:ind) te
Text ind te -> Text (i:ind) te
Tag ind te a c -> Tag (i:ind) te a $ map (setIndex i) c
type Child = Tag
data Tag =
Doctype [Int] T.Text
| Text [Int] T.Text
| Tag [Int] T.Text Attrs [Child]
deriving (Eq)
tag :: T.Text -> [(T.Text, T.Text)] -> [Tag] -> Tag
tag n a c = tag' n (toAttrs a) c
tag' :: T.Text -> Attrs -> [Tag] -> Tag
tag' n a c = Tag [] n a $ indexify c
where
indexify cs = map (\(a, b) -> setIndex a b) $ zip [0..] cs
doctype a = Doctype [] a
html a c = tag "html" a c
head' a c = tag "head" a c
body a c = tag "body" a c
text t = Text [] t
div' a c = tag "div" a c
form a c = tag "form" a c
input a c = tag "input" a c
a a1 c = tag "a" a1 c
article a c = tag "article" a c
title a c = tag "title" a c
link a c = tag "link" a c
script a c = tag "script" a c
img a c = tag "img" a c
id' a = cat "id" a
class' a = cat "class" a
src a = cat "src" a
style a = cat "style" a
href a = cat "href" a
type' a = cat "type" a
cat :: T.Text -> T.Text -> (T.Text, T.Text)
cat a b = (a, b)
infix 0 -.
(-.) :: T.Text -> T.Text -> (T.Text, T.Text)
(-.) a b = cat a b
example :: Tag
example =
html [] [
head' [] [],
body [cat "style" "background: #ccc;"] [
text "Hello world.",
div' [cat "class" "just-a-div"] [],
div' [] [
text "Hello again."
]
]
]
voidElements :: M.Map T.Text ()
voidElements = M.fromList $ map (\x -> (x, ()))
["area", "base", "br", "col", "embed", "hr", "img", "input", "keygen", "link", "menuitem", "meta", "param", "source", "track", "wbr"]
instance Show Attrs where
show (Attrs at) = L.intercalate " " $ map (\(a, b) -> T.unpack a ++ "=" ++ show b) $ M.toList at
sa :: Attrs -> String
sa a@(Attrs at) = if M.size at > 0
then " " ++ show a
else ""
sc :: [Child] -> String
sc x = L.intercalate "" $ map (\y -> tabLines $ show y) x
where tabLines x = unlines $ map (\y -> " " ++ y) $ lines x
instance Show Tag where
show (Doctype i a) = "<!DOCTYPE " ++ T.unpack a ++ ">"
show (Tag i n a c) = case M.lookup n voidElements of
Just () -> "<" ++ T.unpack n ++ sa a ++ "/>"
Nothing -> if length c > 0
then "<" ++ T.unpack n ++ sa a ++ ">\n" ++ sc c ++ "</" ++ T.unpack n ++ ">"
else "<" ++ T.unpack n ++ sa a ++ "></" ++ T.unpack n ++ ">"
show (Text i a) = T.unpack a
render :: Tag -> T.Text
render (Doctype i a) = T.concat ["<!DOCTYPE ", a, ">"]
render (Tag i n a c) = case M.lookup n voidElements of
Just () -> T.concat ["<", n, T.pack $ sa a, "/>"]
Nothing -> if length c > 0
then T.concat $ ["<", n, T.pack $ sa a, ">"] ++ map render c ++ ["</", n, ">"]
else T.concat $ ["<", n, T.pack $ sa a, "></", n, ">"]
render (Text i a) = a
attrs :: Tag -> Attrs
attrs (Doctype _ _) = toAttrs []
attrs (Text _ _) = toAttrs []
attrs (Tag _ _ a _) = a
attr :: T.Text -> Tag -> Maybe T.Text
attr attrName tag = case M.lookup attrName $ attrsToMap $ attrs tag of
Just v -> Just v
Nothing -> Nothing
setAttr :: T.Text -> T.Text -> Tag -> Tag
setAttr key val tag = case tag of
Doctype _ _ -> tag
Text _ _ -> tag
Tag i n (Attrs a) c -> Tag i n (Attrs $ M.insert key val a) c
children :: Tag -> [Child]
children (Doctype _ _) = []
children (Text _ _) = []
children (Tag _ _ _ c) = c
name :: Tag -> T.Text
name (Doctype _ _) = "doctype"
name (Text _ _) = "text"
name (Tag _ n _ _) = n
addClass :: T.Text -> Tag -> Tag
addClass clas tag = case attr "class" tag of
Nothing -> setAttr "class" clas tag
Just c -> let spl = T.splitOn " " c in if elem clas spl
then tag
else setAttr "class" (T.intercalate " " $ clas:spl) tag
hasClass :: T.Text -> Tag -> Bool
hasClass clas tag = case attr "class" tag of
Nothing -> False
Just c -> let spl = T.splitOn " " c in elem clas spl
removeClass :: T.Text -> Tag -> Tag
removeClass clas tag = case attr "class" tag of
Nothing -> tag
Just c -> let spl = T.splitOn " " c in if elem clas spl
then setAttr "class" (T.intercalate " " $ filter (/= clas) spl) tag
else tag
toggleClass :: T.Text -> Tag -> Tag
toggleClass clas tag = case attr "class" tag of
Nothing -> setAttr "class" clas tag
Just c -> let spl = T.splitOn " " c in if elem clas spl
then setAttr "class" (T.intercalate " " $ filter (/= clas) spl) tag
else setAttr "class" (T.intercalate " " $ clas:spl) tag
append :: Tag -> Tag -> Tag
append what to = case to of
Tag ind n a c -> Tag ind n a (children to ++ [what])
otherwise -> to
prepend :: Tag -> Tag -> Tag
prepend what to = case to of
Tag ind n a c -> Tag ind n a (what:c)
otherwise -> to
matches :: T.Text -> Tag -> Bool
matches sel tag = matches' [] (parseSelector sel) tag
has :: Selector -> Tag -> Bool
has sel t = if matches' [] sel t
then True
else any (has sel) $ children t
calcRoot :: [Tag] -> Tag -> Tag
calcRoot parents tag = if length parents == 0
then tag
else parents!!0
filtIndex :: (Int -> Bool) -> [a] -> [a]
filtIndex pred xs =
let indexes = [0 ..(length xs) - 1]
iterable = zip indexes xs
in map (\(_, b) -> b) $ filter (\(a, _) -> pred a) iterable
matches' :: [Tag] -> Selector -> Tag -> Bool
matches' parents s tag = case s of
Type t -> name tag == t
Id id -> case attr "id" tag of
Just x -> x == id
Nothing -> False
Class c -> hasClass c tag
Attribute reg attrName attrVal -> case attr attrName tag of
Nothing -> False
Just x -> case reg of
StartsWith -> T.isPrefixOf attrVal x
EndsWith -> T.isSuffixOf attrVal x
Contains -> T.isInfixOf attrVal x
Equals -> x == attrVal
Anything -> True
And selectors -> all (\x -> matches' parents x tag) selectors
Or selectors -> any (\x -> matches' parents x tag) selectors
Not selector -> not $ matches' parents selector tag
Has selector -> any (has selector) $ children tag
AncestorIs sel -> if length parents == 0
then False
else let parinits = zip (L.inits parents) parents in
any (\(pars, subj) -> matches' pars sel subj) parinits
ParentIs sel -> if length parents == 0
then False
else matches' (init parents) sel $ last parents
FirstChild -> index tag == 0
LastChild ->
let pl = length parents
pcl = length . children $ last parents
in if pl == 0 || pcl < 2
then False
else pcl == index tag + 1
Eq selector n ->
let root = calcRoot parents tag
tm = take (n+1) $ select' selector root
in if length tm <= n
then False
else (tm!!n) == tag
LesserThan s n ->
let root = calcRoot parents tag
tm = take n $ select' s root
in any (== tag) tm
GreaterThan s n ->
let root = calcRoot parents tag
tm = drop (n + 1) $ select' s root
in any (== tag) tm
First selector ->
let root = calcRoot parents tag
tm = take 1 $ select' selector root
in any (== tag) tm
Last selector ->
let root = calcRoot parents tag
tm = select' selector root
in if length tm == 0
then False
else last tm == tag
Even selector ->
let root = calcRoot parents tag
tm = filtIndex (\x -> x `mod` 2 == 0) $ select' selector root
in any (== tag) tm
Odd selector ->
let root = calcRoot parents tag
tm = filtIndex (\x -> x `mod` 2 == 1) $ select' selector root
in any (== tag) tm
NthChild n -> index tag + 1 == n
NthLastChild n ->
let pl = length parents
pcl = length . children $ last parents
in if pl == 0 || pcl < n
then False
else pcl - n - 1 == index tag
Empty -> length (children tag) == 0
Parent -> length (children tag) /= 0
NextAdj s -> if length parents == 0
then False
else let sibs = take (index tag) . children $ last parents
in if length sibs > 0
then matches' parents s (last sibs)
else False
NextSibl s -> if length parents == 0
then False
else let sibs = take (index tag) . children $ last parents
in if length sibs > 0
then any (matches' parents s) sibs
else False
Any -> True
Descendant -> error "matches': bug: Descendant"
DirectChild -> error "matches': bug: DirectChild"
Placeholder -> error "matches': bug: Placeholder"
Comma -> error "matches': bug: Comma"
otherwise -> error $ "matches': bug: " ++ show s
alter :: T.Text -> (Tag -> Tag) -> Tag -> Tag
alter sel f t =
let sels = parseSelector sel
alterRec :: [Tag] -> Tag -> Tag
alterRec parents tag = case tag of
Tag i n a c -> appif $ Tag i n a $ map (alterRec $ parents ++ [tag]) c
otherwise -> appif tag
where
appif t =
if matches' parents sels tag
then f t
else t
in alterRec [] t
remove :: T.Text -> Tag -> Tag
remove sel t =
let sels = parseSelector sel
removeRec :: [Tag] -> Tag -> Tag
removeRec parents t = case t of
Tag i n a c -> tag' n a $ filter (matches' parents sels) $ map (removeRec $ parents ++ [t]) c
otherwise -> t
in removeRec [] t
select :: T.Text -> Tag -> [Tag]
select sel t = let sels = parseSelector sel in select' sels t
select' :: Selector -> Tag -> [Tag]
select' sel t =
let selectRec :: [Tag] -> Tag -> [Tag]
selectRec parents tag = case tag of
Tag _ _ _ c -> retif tag ++ (concat $ map (selectRec $ parents ++ [tag]) c)
otherwise -> retif tag
where
retif t =
if matches' parents sel tag
then [t]
else []
in selectRec [] t
data Regexy =
StartsWith
| EndsWith
| Contains
| Equals
| Anything
| ContainsWord
| NotEquals
deriving (Eq, Show)
data Selector =
Any
| Type T.Text
| Id T.Text
| Class T.Text
| Eq Selector Int | Even Selector | Odd Selector
| LesserThan Selector Int | GreaterThan Selector Int
| First Selector | Last Selector
| Parent | Empty
| FirstChild | LastChild | NthChild Int | NthLastChild Int
| And [Selector]
| Or [Selector]
| Not Selector
| Has Selector
| ParentIs Selector
| AncestorIs Selector
| Attribute Regexy T.Text T.Text
| NextAdj Selector
| NextSibl Selector
| Comma
| Descendant
| DirectChild
| IndSep
| AndSep
| Plus
| Tilde
| Placeholder
deriving (Eq, Show)
l :: a -> [a]
l x = [x]
setCrit x v = case x of
Eq s i -> Eq v i
LesserThan s i -> LesserThan v i
GreaterThan s i -> GreaterThan v i
Even s -> Even v
Odd s -> Odd v
First s -> First v
Last s -> Last v
otherwise -> error $ "bug: can't set crit of " ++ show x
isInd x = case x of
Eq _ _ -> True
Even _ -> True
Odd _ -> True
First _ -> True
Last _ -> True
LesserThan _ _ -> True
GreaterThan _ _ -> True
otherwise -> False
isOp :: Selector -> Bool
isOp s = case s of
Comma -> True
Descendant -> True
DirectChild -> True
IndSep -> True
AndSep -> True
Plus -> True
Tilde -> True
otherwise -> False
lace :: (a -> a -> Bool) -> a -> [a] -> [a]
lace pred e l
| length l < 2 = l
| otherwise = if pred (l!!0) (l!!1)
then (l!!0):e:(lace pred e $ tail l)
else (l!!0):(lace pred e $ tail l)
simple s = (not $ isInd s) && (not $ isOp s)
laceAnd :: [Selector] -> [Selector]
laceAnd ss = lace f AndSep ss
where
f a b | not (isOp a) && simple b = True
| otherwise = False
operatorTable ::[[E.Operator Selector () Selector]]
operatorTable = [
[
binary (== AndSep) (\a b -> And [a, b]) E.AssocLeft,
binary (== IndSep) (\a b -> setCrit b a) E.AssocLeft
],
[
binary (== Tilde) (\a b -> And [b, NextSibl a]) E.AssocLeft,
binary (== Plus) (\a b -> And [b, NextAdj a]) E.AssocLeft,
binary (== Descendant) (\a b -> And [b, AncestorIs a]) E.AssocLeft,
binary (== DirectChild) (\a b -> And [b, ParentIs a]) E.AssocLeft
],
[binary (== Comma) (\a b -> Or [a, b]) E.AssocRight]
]
nextPos pos x xs = Pos.incSourceColumn pos 1
predM :: (a -> Bool) -> (a -> Maybe a)
predM x = \y -> if x y
then Just y
else Nothing
binary pred fun assoc = E.Infix (do
P.try $ Pr.tokenPrim show nextPos $ predM pred
return fun)
assoc
parseSelector :: T.Text -> Selector
parseSelector t =
let errMsg = "parseSelector: can't parse selector: " ++ show t
sels = P.parse parseExpr errMsg $ T.unpack t
in case sels of
Left e -> error $ show e
Right ss -> parseSelPrec $ concat ss
term :: Pr.ParsecT [Selector] () I.Identity Selector
term = do
x <- P.anyToken
return x
parseSelPrec :: [Selector] -> Selector
parseSelPrec ss =
let errMsg = "parseSelector: can't parse expression: " ++ show ss
prec = E.buildExpressionParser operatorTable term P.<?> "expression"
laced = laceAnd ss
sels = P.parse prec errMsg laced
in case sels of
Left e -> error $ show e
Right sel -> sel
parseString :: P.Parser T.Text
parseString = do
P.char '"'
x <- P.many (P.noneOf "\"")
P.char '"'
return $ T.pack x
symbol :: P.Parser Char
symbol = P.oneOf "-_"
parseNonquoted :: P.Parser T.Text
parseNonquoted = do
first <- P.letter P.<|> symbol
rest <- P.many (P.letter P.<|> P.digit P.<|> symbol)
return $ T.pack $ first:rest
parseDescendant :: P.Parser [Selector]
parseDescendant = do
P.space
return $ l Descendant
parseId :: P.Parser [Selector]
parseId = do
P.char '#'
id <- parseNonquoted
return $ l $ Id id
parseClass :: P.Parser [Selector]
parseClass = do
P.char '.'
clas <- parseNonquoted
return $ l $ Class clas
parseTyp :: P.Parser [Selector]
parseTyp = do
typ <- parseNonquoted
return $ l $ Type typ
ts x = P.try $ P.string x
parseCons :: P.Parser [Selector]
parseCons = do
c <- P.char '*' P.<|> P.char ':'
case c of
'*' -> return $ l Any
':' -> do
cons <- ts "empty" P.<|> P.string "parent"
P.<|> ts "first-child" P.<|> ts "last-child"
P.<|> P.string "first" P.<|> P.string "last"
P.<|> P.string "even" P.<|> P.string "odd"
return $ case cons of
"parent" -> [Parent]
"empty" -> [Empty]
"last" -> [IndSep, Last Placeholder]
"first" -> [IndSep, First Placeholder]
"first-child" -> [FirstChild]
"last-child" -> [LastChild]
"even" -> [IndSep, Even Placeholder]
"odd" -> [IndSep, Odd Placeholder]
parseNthChildEq :: P.Parser [Selector]
parseNthChildEq = do
a <- ts ":nth-child(" P.<|> ts ":nth-last-child(" P.<|> ts ":eq("
P.<|> ts ":lt(" P.<|> ts ":gt("
num <- P.many1 P.digit
P.char ')'
return $ let n = (read num)::Int in case a of
":nth-child(" -> [NthChild n]
":nth-last-child(" -> [NthLastChild n]
":eq(" -> [IndSep, Eq Placeholder n]
":lt(" -> [IndSep, LesserThan Placeholder n]
":gt(" -> [IndSep, GreaterThan Placeholder n]
parseNotHas :: P.Parser [Selector]
parseNotHas = do
a <- ts ":not(" P.<|> ts ":has("
sels <- parseExpr
P.string ")"
return $ l $ case a of
":not(" -> Not $ parseSelPrec $ concat sels
":has(" -> Has $ parseSelPrec $ concat sels
parseCommaDCSiblings :: P.Parser [Selector]
parseCommaDCSiblings = do
P.many P.space
v <- P.char ',' P.<|> P.char '~' P.<|> P.char '+' P.<|> P.char '>'
P.many P.space
return $ l $ case v of
',' -> Comma
'~' -> Tilde
'+' -> Plus
'>' -> DirectChild
parseAttr :: P.Parser [Selector]
parseAttr = do
P.char '['
attrName <- parseNonquoted
mode <- P.many $ P.string "*=" P.<|> P.string "^="
P.<|> P.string "$=" P.<|> P.string "="
P.<|> P.string "~=" P.<|> P.string "!="
val <- P.many $ parseNonquoted P.<|> parseString
P.char ']'
return $ l $ case mode of
[] -> Attribute Anything attrName ""
["*="] -> Attribute Contains attrName (val!!0)
["^="] -> Attribute StartsWith attrName (val!!0)
["$="] -> Attribute EndsWith attrName (val!!0)
["~="] -> Attribute ContainsWord attrName (val!!0)
["!="] -> Attribute NotEquals attrName (val!!0)
["="] -> Attribute Equals attrName (f val)
where
f :: [T.Text] -> T.Text
f x = if length x == 0
then ""
else x!!0
parseExpr :: P.Parser [[Selector]]
parseExpr = P.many1 $ P.try parseId
P.<|> P.try parseClass
P.<|> P.try parseAttr
P.<|> P.try parseTyp
P.<|> P.try parseCommaDCSiblings
P.<|> P.try parseDescendant
P.<|> P.try parseCons
P.<|> P.try parseNthChildEq
P.<|> P.try parseNotHas
parseHtml :: T.Text -> [Tag]
parseHtml t =
let tgs = (TS.parseTags t)::[TS.Tag T.Text]
whiteSpace :: TS.Tag T.Text -> Bool
whiteSpace t = case t of
TS.TagText s -> T.length (T.filter (not . C.isSpace) s) == 0
otherwise -> False
tgs' = filter (not . whiteSpace) tgs
trees = TT.tagTree tgs'
convert :: TT.TagTree T.Text -> Tag
convert tr = case tr of
TT.TagBranch tname atts childr -> tag tname atts $ map convert childr
TT.TagLeaf tg -> case tg of
TS.TagOpen tname' atts' -> tag tname' atts' []
TS.TagText str -> text str
TS.TagComment str -> text str
otherwise -> error $ "Unexpected tag type: " ++ show tg
in map convert trees