----------------------------------------------------------------------------- -- | -- Module : Extract -- Copyright : (C) Peter Robinson 2010-2013 -- License : GPL-2 -- -- Maintainer : Peter Robinson -- Stability : experimental -- Portability : portable -- -- Functions for retrieving data from the mathematics genealogy project. -- ----------------------------------------------------------------------------- module Extract where import Network.HTTP import Text.HTML.TagSoup import Data.Char(isSpace) import Data.Text.Lazy(Text) import Data.Text.Lazy.Encoding import Control.Exception import qualified Data.Text.Lazy as T import qualified Data.ByteString.Lazy.Char8 as B import Control.Applicative import Safe(headMay,tailMay) import Entry openURL :: String -> IO Text openURL x = return . T.pack =<< getResponseBody =<< simpleHTTP (getRequest x) getTags :: Text -> IO [Tag Text] getTags url = parseTags <$> openURL (T.unpack url) parseEntry :: [Tag Text] -> Maybe Entry parseEntry tags = do sc <- scientist tags return $ Entry sc (graduationInfos tags) (advisors tags) scientist :: [Tag Text] -> Maybe Text scientist tags = do let t = removeClutter <$> do r <- headMay (sections (~== "

[GraduationInfo] graduationInfos tags = let suffixes = sections (~== "") tags in map (graduationInfo tags) suffixes where graduationInfo _ suffix = let getTag i = notEmptyMaybeTagText $ removeClutter $ suffix !! i degree = getTag 1 univ = getTag 3 year = getTag 5 diss = getTag 18 in matchLen suffix degree univ year diss where matchLen s degree univ year diss | length s >= 19 = GraduationInfo degree univ year diss | length s >= 6 = GraduationInfo degree univ year Nothing | length s >= 4 = GraduationInfo degree univ Nothing Nothing | length s >= 2 = GraduationInfo degree Nothing Nothing Nothing | otherwise = throw $ AssertionFailed "Error - Could not parse downloaded data!" advisors :: [Tag Text] -> [(Text,Text)] advisors tags = let offset = sections (~== "

") tags offset2 = sections (~== "

") tags getTag :: Int -> [Tag Text] -> Maybe Text getTag _ [] = Nothing getTag i suff = let tag = removeClutter $ flip (!!) i $ suff in case tag of TagText t -> maybeAdv $ notEmptyMaybeTagText $ TagText (decodeUtf8 $ B.pack $ T.unpack t) _ -> Nothing getLink :: Int -> [Tag Text] -> Maybe Text getLink _ [] = Nothing getLink i suff = maybeLink $ flip (!!) (i-1) suff in if null offset2 then concatMap (\off -> catMaybes' $ [(getTag 1 off, getLink 1 off)]) offset else concatMap (\off2 -> catMaybes' $ [(getTag i off2, getLink i off2) | i <- [3, 8]]) offset2 where -- TODO: rewrite catMaybes' :: [(Maybe a,Maybe b)] -> [(a,b)] catMaybes' [] = [] catMaybes' ((Just a,Just l):xs) = (a,l) : catMaybes' xs catMaybes' ((Just _,Nothing):xs)= catMaybes' xs catMaybes' ((Nothing,_):xs) = catMaybes' xs maybeAdv adv@(Just t) | T.unpack t == "Advisor: Unknown" = Nothing | otherwise = adv maybeAdv a = a maybeLink t | isTagOpenName (T.pack "a") t = Just $ fromAttrib (T.pack "href") t | otherwise = Nothing notEmptyMaybeTagText :: Tag Text -> Maybe Text notEmptyMaybeTagText (TagText t) | all isSpace (T.unpack t) = Nothing | T.null t = Nothing | otherwise = Just t notEmptyMaybeTagText _ = Nothing -- | Removes preceding, trailing and multiple inter-word spaces: removeClutter :: Tag Text -> Tag Text removeClutter (TagText s) = TagText $ removeInterSpaces (T.strip s) where removeInterSpaces :: Text -> Text removeInterSpaces t = let (first,rest) = T.breakOn (T.pack " ") t in if T.null first then rest else if T.null rest then first else removeInterSpaces $ first `T.append` T.tail rest removeClutter t = t