{-# LANGUAGE OverloadedStrings #-} {-# LANGUAGE FlexibleInstances #-} {-# LANGUAGE DeriveGeneric #-} {-# LANGUAGE DeriveAnyClass #-} {-# OPTIONS_HADDOCK hide #-} -- | Defines the interfaces for Pusan National University spell check service. module Language.Hanspell.PnuSpellChecker ( PnuSpellChecker , spellCheckByPnu , pnuSpellCheckerMaxWords ) where import qualified Data.ByteString.UTF8 as BU import qualified Data.ByteString.Lazy.UTF8 as BLU import Data.List import Data.List.Split import Network.HTTP.Types.Status import Text.Regex import Debug.Trace import Control.Monad.Trans.Maybe import Control.Monad.Trans.Class import Network.HTTP.Client import Network.HTTP.Client.TLS import Data.Aeson import GHC.Generics import Language.Hanspell.Typo import Language.Hanspell.Decoder -- | Defines a class for 'spellCheckByPnu' function overloading. class Monad m => PnuSpellChecker m where -- | Requests spell check to PNU server, parses the responses, -- and returns @m [Typo]@. @spellCheckByPnu@ has two return types. -- One is @MaybeT IO [Typo]@, and the other is @IO [Typo]@. -- -- @ -- import Language.Hanspell -- -- example = do -- let sentence = "위에계신분, 잘들리세요?" -- typos <- spellCheckByPnu sentence -- mapM_ (putStrLn . typoToStringWithStyle False) typos -- @ -- -- The expected output is: -- -- @ -- 위에계신분 -> 위에 계신 분 -- '계신 분'으로 띄어 씁니다. -- -- 잘들리세요 -> 잘 들리세요 -- 부사는 뒤의 말과 띄어 써야 합니다. -- (예) -- 곧돌아오마 (x)-> 곧 돌아오마 (o) -- 부디건강해라(x) -> 부디 건강해라(o) -- 어서오십시오(x) -> 어서 오십시오(o) -- ... -- @ spellCheckByPnu :: String -> m [Typo] -- | Obssesive version returning @MaybeT IO [Typo]@. instance PnuSpellChecker (MaybeT IO) where spellCheckByPnu text = htmlToTypos <$> requestToPnu text -- | Bold version returning @IO [Typo]@. instance PnuSpellChecker IO where spellCheckByPnu text = do maybe <- runMaybeT $ htmlToTypos <$> requestToPnu text case maybe of Nothing -> return [] Just typos -> return typos -- | Maximum words count for a 'spellCheckByPnu' request. pnuSpellCheckerMaxWords :: Int pnuSpellCheckerMaxWords = 295 -- 'spellCheckByPnu' prints the message below when the HTTP status code is -- not 200. Mainly due to timeout. pnuConnectError :: String pnuConnectError = "-- 한스펠 오류: 부산대 서버의 접속 오류로 일부 문장 교정에 실패했습니다." -- 'spellCheckByPnu' prints the message below when the response is not of -- spell checker. Mainly due to the changes of service URL. invalidResponseFromPnu :: String invalidResponseFromPnu = "-- 한스펠 오류: 부산대 서비스가 유효하지 않은 양식을 반환했습니다. (" ++ pnuSpellCheckUrl ++ ")" -- Pnu spell checker URL. -- -- Try `curl -H "Content-Type: application/x-www-form-urlencoded" \ -- -X POST http://speller.cs.pusan.ac.kr/results -d \ -- --data-urlencode "text1=안녕 하세요"`. pnuSpellCheckUrl :: String pnuSpellCheckUrl = "http://speller.cs.pusan.ac.kr/results" -- For convenience gsub :: String -> String -> String -> String gsub from to text = subRegex (mkRegex from) text to -- Requests spell check to the server, check the responses, and returns it. -- When the status code is not 200, or the response is not of spell checker, -- traces error message, and returns Nothing. requestToPnu :: String -> MaybeT IO String requestToPnu text = if null (words text) then return "" else do -- Walkaround for PNU server's weired logic let text' = intercalate "\r\n" . splitOn "\n" $ text manager <- lift $ newManager tlsManagerSettings let pair = [("text1",BU.fromString text')] initialRequest <- lift $ parseRequest pnuSpellCheckUrl let request = (urlEncodedBody pair initialRequest) { method = "POST" } response <- lift $ httpLbs request manager let errCode = statusCode (responseStatus response) let pnuResponseInfix = "한국어 맞춤법/문법 검사기" if errCode == 200 then let body = BLU.toString (responseBody response) in if pnuResponseInfix `isInfixOf` body then return body else trace invalidResponseFromPnu (MaybeT $ return Nothing) else trace (pnuConnectError ++ " ("++ show errCode ++ ")") (MaybeT $ return Nothing) -- PNU response format for a Typo. data PnuTypo = PnuTypo { help :: String , errorIdx :: Int , correctMethod :: Int , start :: Int , end :: Int , orgStr :: String , candWord :: String } deriving (Show, Generic, ToJSON, FromJSON) -- PNU response format for Typos. data PnuTypos = PnuTypos { str :: String , errInfo :: [PnuTypo] , idx :: Int } deriving (Show, Generic, ToJSON, FromJSON) -- Parses the response HTML to [Typo]. htmlToTypos :: String -> [Typo] htmlToTypos body = case matchRegex (mkRegex "^\tdata = (.*);") body of Nothing -> [] Just [jsonText] -> map pnuTypoToTypo pnuTypos where Just pnuTyopsList = decode . BLU.fromString $ jsonText pnuTypos = mconcat . map errInfo $ pnuTyopsList -- Converts PnuTypo (response JSON) to Typo. pnuTypoToTypo :: PnuTypo -> Typo pnuTypoToTypo pnuTypo = Typo { errorType = "" , token = decodeEntity . orgStr $ pnuTypo , suggestions = splitOn "|" . decodeEntity $ suggestions' , context = "" , info = decodeEntity . gsub "\n\\(예\\) " "(예)\n" . gsub " *
*" "\n" . (++ "\n") . help $ pnuTypo } where suggestions' = if null . candWord $ pnuTypo then orgStr pnuTypo else candWord pnuTypo