{-# LANGUAGE OverloadedStrings, RecordWildCards, ScopedTypeVariables, FlexibleContexts #-} 
module SphinxEscape where
import Control.Applicative
import Data.Char
import Data.Functor.Identity (Identity)
import Data.List
import Data.String.Utils (strip)
import Text.Parsec hiding (many, (<|>)) 
 
-- | Extract tag and author filters and prepare resulting
--   query string for submission to Sphinx.
transformQuery :: String                        -- ^ Original query string
               -> ([String], [String], String)  -- ^ tag names, author names, query
transformQuery q = (ts', as', q')
  where 
    (ts, as, qs) = extractFilters $ parseFilters q
    (ts', as')   = formatFilters ts as
    q'           = formatQuery . parseQueryNoFilters $ formatQueryNoEscaping qs

extractFilters :: [Expression] -> ([Expression], [Expression], [Expression])
extractFilters es = (ts, as, q')
  where
    (ts, q)  = partition isTagFilter es
    (as, q') = partition isAuthorFilter q

formatFilters :: [Expression] -> [Expression] -> ([String], [String])
formatFilters ts as = (map tagNameFromExpression ts, map authorNameFromExpression as)

formatQueryWith :: (Expression -> String) -> [Expression] -> String
formatQueryWith f = strip . intercalate " " . map (strip . f)

-- Format query expressions without escaping special characters.
-- This allows a second pass to recognize boolean operators
-- as special characters or words.
formatQueryNoEscaping :: [Expression] -> String
formatQueryNoEscaping = formatQueryWith toStringNoEscaping

-- Format query expressions with escaping of special characters.
formatQuery :: [Expression] -> String
formatQuery = formatQueryWith toString

-- Just a simplified syntax tree. Besides this, all other input has its
-- non-alphanumeric characters stripped, including double and single quotes and
-- parentheses

data Expression = 
        TagFilter String
      | AuthorFilter String
      | Literal String
      | Phrase String
      | AndOrExpr Conj Expression Expression
  deriving Show

data Conj = And | Or deriving Show

toStringNoEscaping :: Expression -> String
toStringNoEscaping (TagFilter s)     = "tag:" ++ maybeQuote s
toStringNoEscaping (AuthorFilter s)  = "author:" ++ maybeQuote s
toStringNoEscaping (Literal s)       = s
toStringNoEscaping (Phrase s)        = quote s -- no need to escape the contents
toStringNoEscaping (AndOrExpr c a b) =
   let a' = toStringNoEscaping a
       b' = toStringNoEscaping b
       c' = conjToString c
   -- if either a' or b' is just whitespace, just choose one or the other
   in case (all isSpace a', all isSpace b') of
       (True, False)  -> b'
       (False, True)  -> a'
       (False, False) -> a' ++ c' ++ b'
       _  -> ""

-- escapes expression to string to pass to sphinx
toString :: Expression -> String
toString (TagFilter s)     = "tag:" ++ maybeQuote (escapeString s)
toString (AuthorFilter s)  = "author:" ++ maybeQuote (escapeString s)
toString (Literal s)       = escapeString s
toString (Phrase s)        = quote s -- no need to escape the contents
toString (AndOrExpr c a b) =
   let a' = toString a
       b' = toString b
       c' = conjToString c
   -- if either a' or b' is just whitespace, just choose one or the other
   in case (all isSpace a', all isSpace b') of
       (True, False)  -> b'
       (False, True)  -> a'
       (False, False) -> a' ++ c' ++ b'
       _  -> ""

quote :: String -> String
quote s = "\"" ++ s ++ "\""

maybeQuote :: String -> String
maybeQuote s = if any isSpace s then quote s else s

conjToString :: Conj -> String
conjToString And = " & "
conjToString Or  = " | "

-- removes all non-alphanumerics from literal strings that could be parsed
-- mistakenly as Sphinx Extended Query operators
escapeString :: String -> String
escapeString = map stripAlphaNum

stripAlphaNum :: Char -> Char
stripAlphaNum s | isAlphaNum s = s
                | otherwise    = ' '


-----------------------------------------------------------------------
-- Parse filters

type Parser' = ParsecT String () Identity 

parseFilters :: String -> [Expression]
parseFilters inp =
  case Text.Parsec.parse (many filtersAndLiterals) "" inp of
    Left x   -> error $ "parser failed: " ++ show x
    Right xs -> xs

filtersAndLiterals :: Parser' Expression
filtersAndLiterals = try tagFilter <|> try authorFilter <|> try phrase <|> literal 

tagFilter :: Parser' Expression
tagFilter = do
   try (string "tag:") <|> try (string "@(tag_list)") <|> string "@tag_list"
   many space
   x <- (try phrase <|> literal)
   let
     s = case x of
       Phrase p  -> p
       Literal l -> l
       otherwise -> "" -- will never be returned (parse error)
   return $ TagFilter s

authorFilter :: Parser' Expression
authorFilter = do
   string "author:"
   many space
   x <- (try phrase <|> literal)
   let
     s = case x of
       Phrase p  -> p
       Literal l -> l
       otherwise -> "" -- will never be returned (parse error)
   return $ AuthorFilter s

phrase :: Parser' Expression
phrase = do
    Phrase <$>
      (between (char '"') (char '"') (many tagChar))
  where tagChar = 
              char '\\' *> (char '"')
          <|> satisfy (`notElem` ("\"\\" :: String))

--     char '"'
--     xs <- manyTill anyChar (char '"')
--     return . Phrase $ xs

-- Copied from http://book.realworldhaskell.org/read/using-parsec.html
-- p_string :: CharParser () String
-- p_string = between (char '\"') (char '\"') (many jchar)
--     where jchar = char '\\' *> (p_escape <|> p_unicode)
--               <|> satisfy (`notElem` "\"\\")
-- 

-- Copied from http://book.realworldhaskell.org/read/using-parsec.html
-- p_string :: CharParser () String
-- p_string = between (char '\"') (char '\"') (many jchar)
--     where jchar = char '\\' *> (p_escape <|> p_unicode)
--               <|> satisfy (`notElem` "\"\\")
-- 
literalStop :: Parser' ()
literalStop = (choice [ 
    lookAhead (tagFilter >> return ()) 
  , lookAhead (authorFilter >> return ()) 
  , lookAhead (phrase >> return ())
  , (space >> return ())
  , eof
  ])
  <?> "literalStop"

literal :: Parser' Expression
literal = do
    a  <- anyChar
    xs <- manyTill anyChar (try literalStop)
    return . Literal $ a:xs

-----------------------------------------------------------------------
-- Parse query string after tag and author filters have been removed.

parseQueryNoFilters :: String -> [Expression]
parseQueryNoFilters inp =
  case Text.Parsec.parse (many expressionNoFilters) "" inp of
    Left x   -> error $ "parser failed: " ++ show x
    Right xs -> xs

expressionNoFilters :: Parser' Expression
expressionNoFilters = try andOrExpr <|> try phrase <|> literalNoFilters

andOrExpr :: Parser' Expression
andOrExpr = do 
   a <- (try phrase <|> literalNoFilters)
   x <- try conjExpr
   b <- expressionNoFilters  -- recursion
   return $ AndOrExpr x a b

conjExpr :: Parser' Conj
conjExpr = andExpr <|> orExpr

andExpr :: Parser' Conj
andExpr = mkConjExpr ["and", "AND", "&"] And

orExpr :: Parser' Conj
orExpr = mkConjExpr ["or", "OR", "|"] Or

mkConjExpr :: [String] -> Conj -> Parser' Conj
mkConjExpr xs t = 
    try (many1 space >> choice (map (string . (++" ")) xs))
    >> return t

literalStopNoFilters :: Parser' ()
literalStopNoFilters = (choice [
    lookAhead (conjExpr >> return ())
  , lookAhead (phrase >> return ())
  , (space >> return ())
  , eof
  ])
  <?> "literalStopNoFilters'"

literalNoFilters :: Parser' Expression
literalNoFilters = do
    a  <- anyChar
    xs <- manyTill anyChar (try literalStopNoFilters)
    return . Literal $ a:xs

-----------------------------------------------------------------------
-- Helper functions

isTagFilter :: Expression -> Bool
isTagFilter (TagFilter _) = True
isTagFilter _             = False

tagNameFromExpression :: Expression -> String
tagNameFromExpression (TagFilter t) = t
tagNameFromExpression _             = error "tagNameFromExpression: not tag"

isAuthorFilter :: Expression -> Bool
isAuthorFilter (AuthorFilter _) = True
isAuthorFilter _                = False

authorNameFromExpression :: Expression -> String
authorNameFromExpression (AuthorFilter t) = t
authorNameFromExpression _                = error "authorNameFromExpression: not author"