{-# LANGUAGE DeriveDataTypeable #-}
{-# LANGUAGE OverloadedStrings #-}

-- | An API for the Online Etymology Dictionary at http://etymonline.com/.
--   The encoding is ISO-8859-1/latin1 so needs to use uconv.

module Language.English.EtymologyOnline where

import           Control.Exception
import           Control.Monad
import           Data.ByteString       (ByteString)
import qualified Data.ByteString       as Bytes
import           Data.ByteString.UTF8  (toString)
import           Data.Maybe
import           Data.Text.ICU.Convert
import           Data.Typeable
import           Network.Curl
import           System.Random
import           Text.XML.Light

-- | An EO exception.
data EOException
  = CannotParsePageCountForLetter Char
  | CannotDownloadPageCountForLetter Char
  | CannotParsePageEntries Char Int (Maybe Element)
  | CannotDownloadPageEntries Char Int
  | NoEntryInPage Char Int
  deriving (Typeable,Show)
instance Exception EOException

-- | Get a random Etymology Online entry.
getRandomEntry :: IO (String,String)
getRandomEntry = do
  converter <- open "windows-1252" Nothing
  letter <- randomRIO ('a','z')
  pages <- getletterpages converter letter
  pagenumber <- randomRIO (1,pages)
  entries <- getletterentries converter letter pagenumber
  index <- randomRIO (1,length entries)
  maybe (throw (NoEntryInPage letter pagenumber))
        return
        (lookup index (zip [1..] entries))

-- | Get the number of pages for a letter.
getletterpages :: Converter -> Char -> IO Int
getletterpages converter l = withCurlDo $ do
  (code,bytes) <- curlGetString_ url []
  case code of
    CurlOK -> do
      let doc = parseEOXML converter (bytes :: ByteString)
      maybe (return 0) -- Some pages don't have pagination at all.
            (return . length)
            (getlinks doc)
    _ -> throw (CannotDownloadPageCountForLetter l)
  where url = "http://www.etymonline.com/index.php?l=" ++ [l]

-- | Get the entries for a letter and page number.
getletterentries :: Converter -> Char -> Int -> IO [(String,String)]
getletterentries converter l pn = withCurlDo $ do
  (code,bytes) <- curlGetString_ url [] 
  case code of
    CurlOK -> do
      let doc = parseEOXML converter (bytes :: ByteString)
      maybe (throw (CannotParsePageEntries l pn doc)) return (getentries doc)
    _ -> throw (CannotDownloadPageEntries l pn)
  where url = "http://www.etymonline.com/index.php?l=" ++ [l] ++ "&p=" ++ show (pn - 1)

-- Get the entries given the page.
getentries :: Maybe Element -> Maybe [(String, String)]
getentries doc = do
  el <- doc
  let dts = grab "dt" el
      dds = grab "dd" el
  return (zip dts dds)
    
  where grab key = map (trim . allContent)
                 . findElements (qname key xhtml)

-- Get the pagination links from the page.
getlinks :: Maybe Element -> Maybe [String]
getlinks doc = do
  el <- doc
  paging <- filterElement paginglist el
  ul <- findChild (qname "ul" xhtml) paging
  return (paginglinks ul)

-- Find the element with the pagination.
paginglist :: Element -> Bool
paginglist el = findAttr (qname "class" Nothing) el == Just "paging"

-- Get the pagination links.
paginglinks :: Element -> [String]
paginglinks = mapMaybe (findAttr (qname "href" Nothing))
            . getels "a"
              
  where getels key = findElements (qname key xhtml)

-- Get all text content of an element.
allContent :: Element -> String
allContent (Element _ _ contents _) = concatMap switch contents where
  switch el =
    case el of
      Elem e -> allContent e
      CRef _  -> ""
      Text (CData _ d _) -> d

-- Parse an ISO-8859-1 document to an XML tree.
parseEOXML :: Converter -> ByteString -> Maybe Element
parseEOXML converter =
  parseXMLDoc . toUnicode converter

-- Some utils.
      
xhtml :: Maybe String
xhtml = Just "http://www.w3.org/1999/xhtml"

qname :: String -> Maybe String -> QName
qname q uri = blank_name { qName = q, qURI = uri }

trim :: String -> String
trim = unwords . words