module Language.English.EtymologyOnline where
import Control.Exception
import Control.Monad
import Data.ByteString (ByteString)
import qualified Data.ByteString as Bytes
import Data.ByteString.UTF8 (toString)
import Data.Maybe
import Data.Text.ICU.Convert
import Data.Typeable
import Network.Curl
import System.Random
import Text.XML.Light
data EOException
= CannotParsePageCountForLetter Char
| CannotDownloadPageCountForLetter Char
| CannotParsePageEntries Char Int (Maybe Element)
| CannotDownloadPageEntries Char Int
| NoEntryInPage Char Int
deriving (Typeable,Show)
instance Exception EOException
getRandomEntry :: IO (String,String)
getRandomEntry = do
converter <- open "windows-1252" Nothing
letter <- randomRIO ('a','z')
pages <- getletterpages converter letter
pagenumber <- randomRIO (1,pages)
entries <- getletterentries converter letter pagenumber
index <- randomRIO (1,length entries)
maybe (throw (NoEntryInPage letter pagenumber))
return
(lookup index (zip [1..] entries))
getletterpages :: Converter -> Char -> IO Int
getletterpages converter l = withCurlDo $ do
(code,bytes) <- curlGetString_ url []
case code of
CurlOK -> do
let doc = parseEOXML converter (bytes :: ByteString)
maybe (return 0)
(return . length)
(getlinks doc)
_ -> throw (CannotDownloadPageCountForLetter l)
where url = "http://www.etymonline.com/index.php?l=" ++ [l]
getletterentries :: Converter -> Char -> Int -> IO [(String,String)]
getletterentries converter l pn = withCurlDo $ do
(code,bytes) <- curlGetString_ url []
case code of
CurlOK -> do
let doc = parseEOXML converter (bytes :: ByteString)
maybe (throw (CannotParsePageEntries l pn doc)) return (getentries doc)
_ -> throw (CannotDownloadPageEntries l pn)
where url = "http://www.etymonline.com/index.php?l=" ++ [l] ++ "&p=" ++ show (pn 1)
getentries :: Maybe Element -> Maybe [(String, String)]
getentries doc = do
el <- doc
let dts = grab "dt" el
dds = grab "dd" el
return (zip dts dds)
where grab key = map (trim . allContent)
. findElements (qname key xhtml)
getlinks :: Maybe Element -> Maybe [String]
getlinks doc = do
el <- doc
paging <- filterElement paginglist el
ul <- findChild (qname "ul" xhtml) paging
return (paginglinks ul)
paginglist :: Element -> Bool
paginglist el = findAttr (qname "class" Nothing) el == Just "paging"
paginglinks :: Element -> [String]
paginglinks = mapMaybe (findAttr (qname "href" Nothing))
. getels "a"
where getels key = findElements (qname key xhtml)
allContent :: Element -> String
allContent (Element _ _ contents _) = concatMap switch contents where
switch el =
case el of
Elem e -> allContent e
CRef _ -> ""
Text (CData _ d _) -> d
parseEOXML :: Converter -> ByteString -> Maybe Element
parseEOXML converter =
parseXMLDoc . toUnicode converter
xhtml :: Maybe String
xhtml = Just "http://www.w3.org/1999/xhtml"
qname :: String -> Maybe String -> QName
qname q uri = blank_name { qName = q, qURI = uri }
trim :: String -> String
trim = unwords . words