{-# LANGUAGE OverloadedStrings, RecordWildCards, ScopedTypeVariables, FlexibleContexts #-} 
module SphinxEscape where
import Control.Applicative
import Data.Functor.Identity (Identity )
import Text.Parsec hiding (many, (<|>)) 
import Data.Char
import Data.List
 

-- Main function
escapeSphinxQueryString :: String -> String
escapeSphinxQueryString s = intercalate " " . map expressionToString . parseQuery $ s


-- Just a simplified syntax tree. Besides this, all other input has its
-- non-alphanumeric characters stripped, including double and single quotes and
-- parentheses

data Expression = 
        TagFieldSearch String 
      | Literal String
      | Phrase String
      | AndOrExpr Conj Expression Expression 
  deriving Show

data Conj = And | Or
  deriving Show

parseQuery :: String -> [Expression]
parseQuery  inp =
  case Text.Parsec.parse (many expression) "" inp of
    Left x -> error $ "parser failed: " ++ show x
    Right xs -> xs

-- escapes expression to string to pass to sphinx
expressionToString :: Expression -> String
expressionToString (TagFieldSearch s) = "@tag_list" ++ escapeString s
expressionToString (Literal s) = escapeString s
expressionToString (Phrase s) = "\"" ++ s ++ "\"" -- no need to escape the contents
expressionToString (AndOrExpr c a b) = 
    let a' = expressionToString a 
        b' = expressionToString b
        c' = conjToString c 
    -- if either a' or b' is just whitespace, just choose one or the other
    in case (all isSpace a', all isSpace b') of
        (True, False) -> b'
        (False, True) -> a'
        (False, False) -> a' ++ c' ++ b'
        _  -> ""

conjToString :: Conj -> String
conjToString And = " & "
conjToString Or = " | "

-- removes all non-alphanumerics from literal strings that could be parsed
-- mistakenly as Sphinx Extended Query operators
escapeString :: String -> String
escapeString s = map (stripAlphaNum) s

stripAlphaNum :: Char -> Char
stripAlphaNum s | isAlphaNum s = s
                | otherwise = ' '


type Parser' = ParsecT String () Identity 

-- | can be literal or tag field or nothing, followed an expression
topLevelExpression :: Parser' [Expression]
topLevelExpression = do
    a <- option [] ((:[]) <$> (tagField <|> literal))
    xs <- many expression
    return $ a ++ xs


expression :: Parser' Expression
expression = (try andOrExpr) <|> try tagField <|> try phrase <|> literal 

tagField :: Parser' Expression
tagField = do
   char '@'
   string "tag_list" <|> string "(tag_list)"
   s <- manyTill anyChar (try literalStop)
   return $ TagFieldSearch s


andOrExpr :: Parser' Expression
andOrExpr = do 
    a <- (try tagField <|> try phrase <|> literal)
    x <- try conjExpr
    b <- expression  -- recursion
    return $ AndOrExpr x a b

conjExpr :: Parser' Conj
conjExpr = andExpr <|> orExpr

andExpr :: Parser' Conj
andExpr = mkConjExpr ["and", "AND", "&"] And

orExpr :: Parser' Conj
orExpr = mkConjExpr ["or", "OR", "|"] Or


mkConjExpr :: [String] -> Conj -> Parser' Conj
mkConjExpr xs t = 
    try (many1 space >> choice (map (string . (++" ")) xs))
    >> return t

phrase :: Parser' Expression
phrase = do
    _ <- char '"'
    xs <- manyTill anyChar (char '"')
    return . Phrase $ xs

literalStop :: Parser' ()
literalStop = (choice [ 
    lookAhead (tagField >> return ()) 
  , lookAhead (conjExpr >> return ())
  , lookAhead (phrase >> return ())
  , eof
  ])
  <?> "literalStop"

literal :: Parser' Expression
literal = do
    a <- anyChar
    notFollowedBy literalStop
    xs <- manyTill anyChar (try literalStop)
    return . Literal $ a:xs