{-
Copyright (C) 2013 John Lenz <lenz@math.uic.edu>

This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU Affero General Public License for more details.

You should have received a copy of the GNU Affero General Public License
along with this program.  If not, see <http://www.gnu.org/licenses/>.
-}
module FilterHtml (filterHtml) where

-- Based on the xss-sanitize project http://hackage.haskell.org/package/xss-sanitize
-- but a lot more restrictive.

import Import
import Data.Maybe (mapMaybe)
import Data.String (IsString)
import Text.HTML.SanitizeXSS (sanitizeAttribute)
import Text.HTML.TagSoup
import qualified Data.Map as Map
import qualified Data.Text.Lazy as TL

-- | Filter Html
filterHtml :: TL.Text -> TL.Text
filterHtml = renderTags . balance Map.empty . safeTags . parseTags

-- | List of allowed tags.
allowedTags :: [TL.Text]
allowedTags = ["div", "p", "br", "blockquote"]

-- | Filter which tags are allowed
safeTags :: [Tag TL.Text] -> [Tag TL.Text]
safeTags [] = []
safeTags (TagOpen name attrs:ts) | name `elem` allowedTags = TagOpen name (safeAttrs attrs) : safeTags ts
safeTags (TagOpen _    _    :ts)                           = safeTags ts

safeTags (t@(TagClose name):ts) | name `elem` allowedTags = t : safeTags ts
safeTags   (TagClose _:ts)                                = safeTags ts

safeTags (t:ts) = t : safeTags ts

-- | Filter the attributes.
safeAttrs :: [Attribute TL.Text] -> [Attribute TL.Text]
safeAttrs = sanitizeAttribute' . filterAttrs
    where
        filterAttrs = filter $ \(n,_) -> n `elem` okAttrs
        okAttrs = ["style", "class"]

-- | Wrapper around xss-sanitize's sanitizeAttribute.  
-- 
-- This clears bad attributes but also filters the CSS within the style attribute, removing
-- things like urls and so forth.
sanitizeAttribute' :: [Attribute TL.Text] -> [Attribute TL.Text]
sanitizeAttribute' = map fromStrict . mapMaybe sanitizeAttribute . map toStrict
    where
        fromStrict (a,b) = (TL.fromStrict a, TL.fromStrict b)
        toStrict (a,b) = (TL.toStrict a, TL.toStrict b)

-- Copied from xss-sanitize and updated to work with lazy text by changing the type signature.
balance :: (IsString a, Ord a) => Map.Map a Int -> [Tag a] -> [Tag a]
balance m [] =
    concatMap go $ Map.toList m
  where
    go (name, i) | name == "br" = []
                 | otherwise    = replicate i $ TagClose name
balance m (t@(TagClose name):tags) =
    case Map.lookup name m of
        Nothing -> TagOpen name [] : TagClose name : balance m tags
        Just i ->
            let m' = if i == 1
                        then Map.delete name m
                        else Map.insert name (i - 1) m
             in t : balance m' tags
balance m (TagOpen name as : tags) =
    TagOpen name as : balance m' tags
  where
    m' = case Map.lookup name m of
            Nothing -> Map.insert name 1 m
            Just i -> Map.insert name (i + 1) m
balance m (t:ts) = t : balance m ts