Hakyll/Convert/Blogger.hs

{-# LANGUAGE DeriveDataTypeable #-}
{-# LANGUAGE OverloadedStrings  #-}
module Hakyll.Convert.Blogger
    (FullPost(..), readPosts, distill)
  where

import           Control.Applicative
import           Control.Arrow
import           Control.Monad
import           Data.Binary
import qualified Data.ByteString             as B
import           Data.Char
import           Data.Data
import           Data.Function
import           Data.List
import           Data.List
import           Data.Maybe
import           Data.Monoid
import           Data.Text                    (Text)
import qualified Data.Text                    as T
import qualified Data.Text.Encoding           as T
import           Data.Time                    (UTCTime)
import           Data.Time.Format             (parseTime, formatTime)
import           System.Locale                (defaultTimeLocale)

import           Hakyll.Core.Compiler
import           Hakyll.Core.Item
import           Hakyll.Web.Template.Context
import           Text.Atom.Feed
import           Text.Atom.Feed.Export
import           Text.Atom.Feed.Import
import           Text.XML.Light

import           Hakyll.Convert.Common

-- | A post and its comments
data FullPost = FullPost
    { fpPost     :: Entry
    , fpComments :: [Entry]
    , fpUri      :: String
    }

-- | An entry is assumed to be either a post, or a comment.
--   If it's a post, it should be associated with the URI
--   that visitors would use to read the post (on the old blog)
--   If it's a comment, it should be the URI for the corresponding
--   post.
data BloggerEntry =
    Post    { beUri_ :: String, beEntry :: Entry }
  | Comment { beUri_ :: String, beEntry :: Entry }
  | Orphan  { beEntry :: Entry }
  deriving (Show)

beUri :: BloggerEntry -> Maybe String
beUri (Orphan _)    = Nothing
beUri (Post u _)    = Just u
beUri (Comment u _) = Just u

-- ---------------------------------------------------------------------
-- Feed to helper type
-- ---------------------------------------------------------------------

-- | Returns only published posts
readPosts :: FilePath -> IO (Maybe [FullPost])
readPosts f = do
    parseAtomDoc <$> B.readFile f
  where
    parseAtomDoc x =
        select =<< parseXMLDoc (T.decodeUtf8 x)
    select =
        fmap (extractPosts . feedEntries) . elementFeed . deleteDrafts

-- has to be done on the XML level as our atom lib doesn't understand
-- the blogger-specific XML for drafts
deleteDrafts :: Element -> Element
deleteDrafts e =
    e { elContent = filter isInnocent (elContent e) }
  where
    isInnocent (Elem e) = not (isDraft e)
    isInnocent _ = True

isDraft :: Element -> Bool
isDraft e =
    isJust $ findElement draft e
  where
    draft = QName
        { qName   = "draft"
        , qURI    = Just "http://purl.org/atom/app#"
        , qPrefix = Just "app"
        }



-- | Warning: this silently ignores orphans, templates, settings
extractPosts :: [Entry] -> [FullPost]
extractPosts entries =
    map toFullPost blocks
  where
    toFullPost (uri, entries) = FullPost
         { fpPost     = post
         , fpComments = comments
         , fpUri      = uri
         }
       where
         post = case [ e | Post _ e <- entries ] of
                    []  -> huh "Block of entries with no post?!"
                    [p] -> p
                    ps  -> huh "Block of entries with more than one post?!"
         comments = [ e | Comment _ e <- entries ]
         huh msg  = error . unlines $ msg : map (txtToString . entryTitle . beEntry) entries
    blocks = [ (u,xs) | (Just u, xs) <- blocks_ ] -- drop orphans
    blocks_ = buckets beUri
            $ map identifyEntry
            $ filter isInteresting entries

-- | Contains actual meat (posts, comments; but not eg. templates)
isInteresting :: Entry -> Bool
isInteresting e =
    not $ any isBoring cats
  where
    isBoring c = any (\t -> isBloggerCategoryOfType t c) ["settings", "template"]
    cats       = entryCategories e


-- | Tag an entry from the blogger feed as either being a post,
--   a comment, or an "orphan" (a comment without an associated post)
identifyEntry :: Entry -> BloggerEntry
identifyEntry e =
    if isPost e
        then case getLink "self" `mplus` getLink "alternate" of
                 Just l  -> Post (postUrl l) e
                 Nothing -> entryError e oopsSelf
        else case getLink "alternate" of
                 Just l  -> Comment (postUrl l) e
                 Nothing -> Orphan e
  where
    isPost  = any (isBloggerCategoryOfType "post") . entryCategories
    postUrl = takeWhile (/= '?') . linkHref
    getLink ty = case filter (isLink ty) $ entryLinks e of
        []  -> Nothing
        [x] -> Just x
        xs  -> entryError e (oopsLink ty)
    isLink ty l = linkRel l == Just (Right ty) && linkType l == Just "text/html"
    oopsSelf    = "Was expecting blog posts to have a self link"
    oopsLink ty = "Was expecting entries have at most one link of type " ++ ty

isBloggerCategory :: Category -> Bool
isBloggerCategory = (== Just "http://schemas.google.com/g/2005#kind")
                  . catScheme

isBloggerCategoryOfType :: String -- ^ \"comment\", \"post\", etc
                        -> Category
                        -> Bool
isBloggerCategoryOfType ty c =
    isBloggerCategory c &&
    catTerm c == "http://schemas.google.com/blogger/2008/kind#" ++ ty

-- ---------------------------------------------------------------------
--
-- ---------------------------------------------------------------------

distill :: FullPost -> DistilledPost
distill fp = DistilledPost
    { dpBody  = body fpost
    , dpUri   = fpUri fp
    , dpTitle = title fpost
    , dpTags  = tags fpost
    , dpCategories = []
    , dpDate  = date fpost
    }
  where
    fpost = fpPost fp
    --
    body = fromContent . entryContent
    fromContent (Just (HTMLContent x)) = T.pack x
    fromContent _ = error "Hakyll.Convert.Blogger.distill expecting HTML"
    --
    title p = case txtToString (entryTitle p) of
         "" -> Nothing
         t  -> Just (T.pack t)
    tags = map (T.pack . catTerm)
         . filter (not . isBloggerCategory)
         . entryCategories
    date x = T.pack $
        case parseTime' =<< entryPublished x of
            Nothing -> "1970-01-01"
            Just  d -> formatTime' d
    parseTime' d = msum $ map (\f -> parseTime defaultTimeLocale f d)
        [ "%FT%T%Q%z"  -- with time zone
        , "%FT%T%QZ"   -- zulu time
        ]
    formatTime' :: UTCTime -> String
    formatTime' = formatTime defaultTimeLocale "%FT%TZ" --for hakyll

-- ---------------------------------------------------------------------
-- odds and ends
-- ---------------------------------------------------------------------

entryError e msg =
    error $ msg ++ " [on entry " ++ entryId e ++ "]\n" ++ show e

buckets :: Ord b => (a -> b) -> [a] -> [ (b,[a]) ]
buckets f = map (first head . unzip)
          . groupBy ((==) `on` fst)
          . sortBy (compare `on` fst)
          . map (\x -> (f x, x))