{- | Module for parsing html. -} {-# LANGUAGE OverloadedStrings, NoMonomorphismRestriction, FlexibleContexts #-} module HScraper.HTMLparser ( parseHtml ) where import Control.Monad (void) import Control.Applicative((<$>), (<*)) import Data.List (nub) import Data.Char import qualified Data.Text as T import Text.Parsec import HScraper.Types parseHtml :: T.Text -> Either ParseError HTMLTree parseHtml s = case parse baseParser "" (T.unwords (T.words s)) of Left err -> Left err Right nodes -> Right $ if length nodes == 1 then head nodes else case filter filterHelper nodes of [] -> toTree "html" [] nodes x -> head x baseParser = docType <|> parseNodes filterHelper :: HTMLTree -> Bool filterHelper (NTree (Element x _) _ ) | x == "html" = True filterHelper _ = False oneLiners = (toLeaf . T.pack) <$> do _ <- spaces _ <- char '<' _ <- manyTill (noneOf ">") $try (string "/>") return "" slashFollowedByAlpha = do _ <- char '/' a <- oneOf ['a'..'z'] _ <- char '>' return [a] docType = do try docTypeHandler parseNodes docTypeHandler = (toLeaf . T.pack) <$> do _ <- spaces _ <- string "') return "" comments = (toLeaf . T.pack) <$> do _ <- spaces _ <- string "") return "" comments' = (toLeaf . T.pack) <$> do _ <- spaces _ <- string "/*" _ <- manyTill anyChar (try $string "*/") return "" parseNodes :: Stream s m Char => ParsecT s u m [HTMLTree] parseNodes = manyTill parseNode last' where last' = eof <|> void (try (string " ParsecT s u m HTMLTree parseNode = try oneLiners <|> try comments <|> try comments' <|> try docTypeHandler<|> parseElement <|> parseText parseText :: Stream s m Char => ParsecT s u m HTMLTree parseText = (toLeaf . T.pack) <$>do _ <- spaces many (noneOf "<") exceptionList :: [String] exceptionList = ["link", "br", "img", "meta", "hr", "input"] endList = ["br>"] parseElement :: Stream s m Char => ParsecT s u m HTMLTree parseElement = do (tag, attrs) <- between (char '<') (char '>') tagData tempTag tag attrs tempTag :: Stream s m Char => String -> [(T.Text, T.Text)] -> ParsecT s u m HTMLTree tempTag tag attrs | map toLower tag `elem` exceptionList = return $ toTree (T.pack tag) attrs [] | map toLower tag == "script" = do _ <- manyTill anyChar (try $string "" return $ toTree (T.pack tag) attrs $nub children tagData :: Stream s m Char => ParsecT s u m (String, [(T.Text, T.Text)]) tagData = do t <- tagName attrs <- attributes return (t,attrs) tagName :: Stream s m Char => ParsecT s u m String tagName = many1 (try (char ':') <|> try (char '-') <|> try (char '_') <|> try (char '.') <|> try (char ';') <|> alphaNum) attributes :: Stream s m Char => ParsecT s u m [(T.Text, T.Text)] attributes = spaces >> many (trailingSpaces (try attribute <|> try attribute' <|> try simpleTextInsideTag)) trailingSpaces :: Stream s m Char => ParsecT s u m a -> ParsecT s u m a trailingSpaces = (<* spaces) attribute :: Stream s m Char => ParsecT s u m (T.Text, T.Text) attribute = do name <- tagName spaces _ <- char '=' spaces open <- char '\"' <|> char '\'' -- value <- many (noneOf [open]) value <- manyTill (try (char ' ') <|> anyChar) (try $ char open) return (T.pack name, T.pack value) attribute' :: Stream s m Char => ParsecT s u m (T.Text, T.Text) attribute' = do name <- tagName spaces _ <- char '=' spaces value <- many (noneOf "> ") return (T.pack name, T.pack value) simpleTextInsideTag = do a <- many1 alphaNum spaces return (T.pack a, T.pack "")