----------------------------------------------------------------------------- -- | -- Module : Extract -- Copyright : (C) Peter Robinson 2010-2012 -- License : GPL-2 -- -- Maintainer : Peter Robinson -- Stability : experimental -- Portability : portable -- -- Functions for retrieving data from the mathematics genealogy project. -- ----------------------------------------------------------------------------- module Extract where import Network.HTTP import Text.HTML.TagSoup import Control.Monad import Control.Exception import Data.Char(isSpace) import Data.Maybe import Data.Text.Lazy(Text) import Data.Text.Lazy.Encoding import qualified Data.Text.Lazy as T import qualified Data.ByteString.Lazy.Char8 as B import Control.Applicative import Entry openURL x = return . T.pack =<< getResponseBody =<< simpleHTTP (getRequest x) getTags :: Text -> IO [Tag Text] getTags url = parseTags <$> openURL (T.unpack url) parseEntry :: [Tag Text] -> Entry parseEntry tags = Entry (scientist tags) (graduationInfo tags) (advisors tags) scientist :: [Tag Text] -> Text scientist tags = let TagText sc = removeClutter $ head $ tail $ head $ sections (~== "

Maybe GraduationInfo graduationInfo tags = let offset = head $ sections (~== "") tags getTag i = notEmptyMaybeTagText $ removeClutter $ offset !! i degree = getTag 1 univ = getTag 3 year = getTag 5 diss = getTag 18 in matchLen offset degree univ year diss where matchLen offset degree univ year diss | length offset >= 19 = Just $ GraduationInfo degree univ year diss | length offset >= 6 = Just $ GraduationInfo degree univ year Nothing | length offset >= 4 = Just $ GraduationInfo degree univ Nothing Nothing | length offset >= 2 = Just $ GraduationInfo degree Nothing Nothing Nothing | otherwise = Nothing advisors :: [Tag Text] -> [(Text,Text)] advisors tags = let offset = sections (~== "

") tags offset2 = sections (~== "

") tags getTag :: Int -> [[Tag Text]] -> Maybe Text getTag _ [] = Nothing getTag i off = let tag = removeClutter $ flip (!!) i $ head off in case tag of TagText t -> maybeAdv $ notEmptyMaybeTagText $ TagText (decodeUtf8 $ B.pack $ T.unpack t) _ -> Nothing getLink :: Int -> [[Tag Text]] -> Maybe Text getLink _ [] = Nothing getLink i off = maybeLink $ flip (!!) (i-1) $ head off in (catMaybes' (if null offset2 then [(getTag 1 offset, getLink 1 offset)] else [(getTag i offset2, getLink i offset2) | i <- [3, 8]])) -- if null offset2 -- then catMaybes' [(getTag 1 offset,getLink 1 offset)] -- else catMaybes' [ (getTag i offset2,getLink i offset2) | i <- [3,8] ] where -- TODO: rewrite catMaybes' :: [(Maybe a,Maybe b)] -> [(a,b)] catMaybes' [] = [] catMaybes' ((Just a,Just l):xs) = (a,l) : catMaybes' xs catMaybes' ((Just a,Nothing):xs)= catMaybes' xs catMaybes' ((Nothing,_):xs) = catMaybes' xs maybeAdv adv@(Just t) | T.unpack t == "Advisor: Unknown" = Nothing | otherwise = adv maybeAdv a = a maybeLink t | isTagOpenName (T.pack "a") t = Just $ fromAttrib (T.pack "href") t | otherwise = Nothing notEmptyMaybeTagText :: Tag Text -> Maybe Text notEmptyMaybeTagText (TagText t) | all isSpace (T.unpack t) = Nothing | T.null t = Nothing | otherwise = Just t notEmptyMaybeTagText _ = Nothing -- | Removes preceding, trailing and multiple inter-word spaces: removeClutter :: Tag Text -> Tag Text removeClutter (TagText s) = TagText $ removeInterSpaces (T.strip s) where removeInterSpaces :: Text -> Text removeInterSpaces t = let (first,rest) = T.breakOn (T.pack " ") t in if T.null first then rest else if T.null rest then first else removeInterSpaces $ first `T.append` T.tail rest removeClutter t = t