{-# LANGUAGE Safe, NoImplicitPrelude, UnicodeSyntax, OverloadedStrings #-} module Data.Microformats2.Parser ( Mf2ParserSettings (..) , HtmlContentMode (..) , parseMf2 , extractProperty -- * HTML parsing stuff (from html-conduit, xml-lens) , documentRoot , parseLBS , sinkDoc ) where import Prelude.Compat import Data.Microformats2.Parser.Property import Data.Microformats2.Parser.HtmlUtil import Data.Microformats2.Parser.Util import Data.Aeson.Lens import Data.Char (isSpace) import qualified Data.HashMap.Strict as HMS import Data.Maybe import qualified Data.Text as T import Network.URI import Safe (headMay) data Mf2ParserSettings = Mf2ParserSettings { htmlMode ∷ HtmlContentMode , baseUri ∷ Maybe URI } deriving (Show, Eq) instance Default Mf2ParserSettings where def = Mf2ParserSettings { htmlMode = Sanitize , baseUri = Nothing } mf2Elements ∷ Traversal' Element Element mf2Elements = attributeSatisfies "class" $ any isMf2Class . T.split isSpace readPropertyName ∷ T.Text → (T.Text, T.Text) readPropertyName x = (fromMaybe "p" $ headMay ps, T.intercalate "-" $ drop 1 ps) where ps = T.splitOn "-" x extractProperty ∷ Mf2ParserSettings → T.Text → Element → Value extractProperty _ "p" e = fromMaybe Null $ String <$> extractP e extractProperty s "u" e = fromMaybe Null $ String <$> case extractU e of Just (u, True) → Just $ resolveURI (baseUri s) u Just (t, False) → Just t Nothing → Nothing extractProperty _ "dt" e = fromMaybe Null $ String <$> extractDt e extractProperty s "e" e = object [ "html" .= getProcessedInnerHtml (htmlMode s) (baseUri s) e , "value" .= getInnerTextRaw e ] extractProperty _ _ _ = Null -- lens-aeson's 'key' doesn't add new keys :-( addValue ∷ T.Text → Value → Value → Value addValue "p" v@(Object o) f = let v' = (fromMaybe f $ v ^? key "properties" . key "name" . nth 0) in Object $ if v' == Null then o else HMS.insert "value" v' o addValue "e" (Object o) f = Object $ HMS.insert "value" (fromMaybe Null $ f ^? key "value") $ HMS.insert "html" (fromMaybe Null $ f ^? key "html") o addValue "u" v@(Object o) f = Object $ HMS.insert "value" (fromMaybe f $ v ^? key "properties" . key "url" . nth 0) o addValue _ (Object o) f = Object $ HMS.insert "value" f o addValue _ x _ = x addImpliedProperties ∷ Mf2ParserSettings → Element → Value → Value addImpliedProperties settings e v@(Object o) = Object $ addIfNull "photo" "photo" resolveURI' $ addIfNull "url" "url" resolveURI' $ addIfNullAndNoOthers "name" "name" id o where addIfNull nameJ nameH f obj = if isNothing $ v ^? key nameJ then HMS.insert nameJ (vsingleton $ f <$> implyProperty nameH e) obj else obj addIfNullAndNoOthers nameJ nameH f obj = if isNothing (v ^? key nameJ) && isNothing (v ^? key "children") && isNothing (e ^? plate . entire . peElements) then HMS.insert nameJ (vsingleton $ f <$> implyProperty nameH e) obj else obj peElements = attributeSatisfies "class" $ any (\x → isPClass x || isEClass x) . T.split isSpace resolveURI' = resolveURI $ baseUri settings addImpliedProperties _ _ v = v removePropertiesOfNestedMicroformats ∷ [Element] → [Element] → [Element] removePropertiesOfNestedMicroformats nmf2s = filter (not . isNested) where isNested e = any (\e' → e `elem` filter (/= e') (e' ^.. entire)) nmf2s parseProperty ∷ Mf2ParserSettings → Element → [Pair] parseProperty settings e = let propNames = groupBy' snd $ map readPropertyName $ filter isPropertyClass $ classes e extractPropertyValue (t, _) = extractProperty settings t e in if any isMf2Class $ classes e then map (\(n, ts) → n .= [ addValue (fst $ head ts) (parseH settings e) (extractPropertyValue $ head ts) ]) propNames else map (\(n, ts) → n .= map extractPropertyValue ts) propNames parseH ∷ Mf2ParserSettings → Element → Value parseH settings e = object $ filter (\(n, v) → not (emptyVal v) || n == "properties") [ "type" .= filter isMf2Class (classes e) , "properties" .= properties , "children" .= childrenMf2 , "shape" .= fromMaybe Null (String <$> e ^? el "area" . attr "shape") , "coords" .= fromMaybe Null (String <$> e ^? el "area" . attr "coords") ] where childrenMf2 = map ((\x → addValue "p" x Null) . parseH settings) $ filter (not . isProperty) $ deduplicateElements allMf2Descendants allMf2Descendants = filter (/= e) $ e ^.. entire . mf2Elements -- we have to do all of this because multiple elements can become multiple properties (with overlap) properties = Object $ HMS.filter (not . emptyVal) properties' (Object properties') = addImpliedProperties settings e $ object $ map mergeProps $ groupBy' fst properties'' properties'' = concatMap (parseProperty settings) $ removePropertiesOfNestedMicroformats allMf2Descendants $ filter (/= e) $ e ^.. entire . propertyElements -- | Parses Microformats 2 from an HTML Element into a JSON Value. parseMf2 ∷ Mf2ParserSettings → Element → Value parseMf2 settings rootEl = object [ "items" .= items, "rels" .= rels, "rel-urls" .= relUrls ] where items = map (parseH settings') $ deduplicateElements $ rootEl' ^.. entire . mf2Elements rels = object $ map (\(r, es) → r .= map snd es) $ groupBy' fst $ expandSnd $ map (\e → (T.split isSpace (e ^. attr "rel"), resolveURI (baseUri settings') $ e ^. attr "href")) linkEls relUrls = object $ map relUrlObject linkEls relUrlObject e = resolveURI (baseUri settings') (e ^. attr "href") .= object (filter (not . emptyVal . snd) [ "rels" .= T.split isSpace (e ^. attr "rel") , "text" .= fromMaybe Null (String <$> getInnerTextWithImgs e) , linkAttr "type" "type" e , linkAttr "media" "media" e , linkAttr "hreflang" "hreflang" e ]) linkAttr nameJ nameH e = nameJ .= fromMaybe Null (String <$> e ^? attr nameH) linkEls = filter (isJust . (^? attr "href")) $ filter (isJust . (^? attr "rel")) $ rootEl' ^.. entire . els [ "a", "link" ] -- Obligatory WTF comment about base[href] being relative to the URI the page was requested from! settings' = settings { baseUri = case (baseUri settings, parseURIReference =<< T.unpack <$> (rootEl' ^? entire . el "base" . attr "href")) of (Just sU, Just tU) → Just (tU `relativeTo` sU) (Just sU, Nothing) → Just sU (Nothing, Just tU) → Just tU (Nothing, Nothing) → Nothing } rootEl' = preprocessHtml rootEl preprocessHtml ∷ Element → Element preprocessHtml (Element n as ns) = Element (lowerName n) as $ map preprocessChildren $ filter (not . isTemplate) ns where isTemplate (NodeElement (Element (Name "template" _ _) _ _)) = True isTemplate _ = False preprocessChildren (NodeElement e) = NodeElement $ preprocessHtml e preprocessChildren x = x lowerName (Name txt x y) = Name (T.toLower txt) x y