#if MIN_VERSION_scalpel(0,4,0)
#endif
module Web.WikiCFP.Scraper.Scalpel
( ErrorMsg,
Scraper',
confRoot,
searchRoot
) where
import Control.Applicative ((<$>), (<*>), (<|>), (<*), (*>), optional, pure)
import Control.Monad (guard, forM_, mzero)
import Data.List (sort)
import Data.Maybe (catMaybes)
import Data.Monoid ((<>))
import Data.Text (Text, pack)
import Data.Time (Day, fromGregorian)
import Data.Attoparsec.Text (Parser, parseOnly, skipSpace, string, endOfInput, decimal, takeText, char)
import Text.HTML.Scalpel
( Scraper,
(@:), (@=), (//), chroot, chroots, text, texts, attr, hasClass
)
import Web.WikiCFP.Scraper.Type (Event(..), When(..))
type ErrorMsg = String
type Scraper' = Scraper Text
confRoot :: Scraper' (Either ErrorMsg [Event])
confRoot = concatSuccess $ chroots ("div" @: [hasClass "contsec"] // "table") $ eventsTable
searchRoot :: Scraper' (Either ErrorMsg [Event])
searchRoot = confRoot
concatSuccess :: (Functor m, Monad m) => m [Either e [a]] -> m (Either e [a])
concatSuccess = fmap $ fmap concat . sequence
eventsTable :: Scraper' (Either ErrorMsg [Event])
eventsTable = do
rows <- chroots "tr" (eventRow1 <|> eventRow2 <|> eventRowHeader)
case rows of
(EventRowHeader : rest) -> return $ rowsToEvents_noHeader rest
_ -> mzero
data EventRow = EventRowHeader
| EventRow1 Text Text Text
| EventRow2 Text Text Text
deriving (Eq,Ord,Show)
eventRowHeader :: Scraper' EventRow
eventRowHeader = do
tds <- texts "td"
guard $ length tds == 4
let expected_labels = map pack ["Event", "When", "Where", "Deadline"]
forM_ [0..(length expected_labels 1)] $ \i -> guard $ parsable (spacedText $ expected_labels !! i) $ tds !! i
return EventRowHeader
eventRow1 :: Scraper' EventRow
eventRow1 = do
(sname, url) <- chroot ("td" @: ["rowspan" @= "2"]) $ ((,) <$> text "a" <*> attr "href" "a")
lname <- text ("td" @: ["colspan" @= "3"])
return $ EventRow1 sname url lname
eventRow2 :: Scraper' EventRow
eventRow2 = do
tds <- texts "td"
guard $ length tds == 3
return $ EventRow2 (tds !! 0) (tds !! 1) (tds !! 2)
rowsToEvents_noHeader :: [EventRow] -> Either ErrorMsg [Event]
rowsToEvents_noHeader [] = return []
rowsToEvents_noHeader ((EventRow1 sn url ln) : (EventRow2 when wher dl) : rest) =
(:) <$> createEvent sn url ln when wher dl <*> rowsToEvents_noHeader rest
rowsToEvents_noHeader rows = Left ("Error while parsing rows: " ++ show rows)
urlBase :: Text
urlBase = pack "http://wikicfp.com"
createEvent :: Text -> Text -> Text -> Text -> Text -> Text -> Either ErrorMsg Event
createEvent sname url lname when wher dl = do
when' <- parseWhen when
wher' <- parseWhere wher
dl' <- parseDeadlines dl
return Event { eventShortName = sname,
eventURL = urlBase <> url,
eventLongName = lname,
eventWhen = when',
eventWhere = wher',
eventDeadlines = dl'
}
parsable :: Parser a -> Text -> Bool
parsable p t = either (const False) (const True) $ parseOnly p t
spacedText :: Text -> Parser Text
spacedText expected = skipSpace *> string expected <* skipSpace <* endOfInput
spacedText' :: String -> Parser Text
spacedText' = spacedText . pack
string' :: String -> Parser Text
string' = string . pack
parseWhen :: Text -> Either ErrorMsg (Maybe When)
parseWhen = parseOnly (parserWhen <* endOfInput) where
parserWhen = (spacedText' "N/A" *> pure Nothing)
<|> (Just <$> parserJustWhen)
parserJustWhen = When
<$> (parserDay <* skipSpace <* (char '-') <* skipSpace)
<*> (parserDay <* skipSpace)
parseWhere :: Text -> Either ErrorMsg (Maybe Text)
parseWhere = parseOnly (parserWhere <* endOfInput) where
parserWhere = (spacedText' "N/A" *> pure Nothing) <|> (Just <$> takeText)
parseDeadlines :: Text -> Either ErrorMsg [Day]
parseDeadlines input = sort <$> parseOnly (parserDeadlines <* endOfInput) input where
parserDeadlines = strictParserDeadlines <|> (takeText *> return [])
strictParserDeadlines = do
primary <- parserDay <* skipSpace
msecondary <- optional $ (char '(' *> skipSpace *> parserDay <* skipSpace <* char ')')
return $ maybe [primary] (: [primary]) msecondary
parserDay :: Parser Day
parserDay = impl where
impl = do
m <- parserMonth <* skipSpace
d <- decimal <* (optional $ char ',') <* skipSpace
y <- decimal
return $ fromGregorian y m d
parserMonth = (string' "Jan" *> pure 1)
<|> (string' "Feb" *> pure 2)
<|> (string' "Mar" *> pure 3)
<|> (string' "Apr" *> pure 4)
<|> (string' "May" *> pure 5)
<|> (string' "Jun" *> pure 6)
<|> (string' "Jul" *> pure 7)
<|> (string' "Aug" *> pure 8)
<|> (string' "Sep" *> pure 9)
<|> (string' "Oct" *> pure 10)
<|> (string' "Nov" *> pure 11)
<|> (string' "Dec" *> pure 12)