module Data.Text.Fuzzy.Tokenize ( TokenizeSpec
                                , IsToken(..)
                                , tokenize
                                , esc
                                , addEmptyFields
                                , emptyFields
                                , nn
                                , sq
                                , sqq
                                , noslits
                                , sl
                                , sr
                                , uw
                                , delims
                                , comment
                                , punct
                                , indent
                                , itabstops
                                , keywords
                                , eol
                                ) where
import Prelude hiding (init)
import Control.Applicative
import Data.Map (Map)
import Data.Maybe (fromMaybe)
import Data.Monoid()
import Data.Set (Set)
import Data.Text (Text)
import qualified Data.List as List
import qualified Data.Map as Map
import qualified Data.Set as Set
import qualified Data.Text as Text
import Control.Monad.RWS
data TokenizeSpec = TokenizeSpec { tsAtoms          :: Set Text
                                 , tsStringQQ       :: Maybe Bool
                                 , tsStringQ        :: Maybe Bool
                                 , tsNoSlits        :: Maybe Bool
                                 , tsLineComment    :: Map Char Text
                                 , tsDelims         :: Set Char
                                 , tsEol            :: Maybe Bool
                                 , tsStripLeft      :: Maybe Bool
                                 , tsStripRight     :: Maybe Bool
                                 , tsUW             :: Maybe Bool
                                 , tsNotNormalize   :: Maybe Bool
                                 , tsEsc            :: Maybe Bool
                                 , tsAddEmptyFields :: Maybe Bool
                                 , tsPunct          :: Set Char
                                 , tsIndent         :: Maybe Bool
                                 , tsItabStops      :: Maybe Int
                                 , tsKeywords       :: Set Text
                                 }
                    deriving (Eq,Ord,Show)
instance Semigroup TokenizeSpec where
  (<>) a b = TokenizeSpec { tsAtoms       = tsAtoms b <> tsAtoms a
                          , tsStringQQ    = tsStringQQ b <|> tsStringQQ a
                          , tsStringQ     = tsStringQ b  <|> tsStringQ a
                          , tsNoSlits     = tsNoSlits b <|> tsNoSlits a
                          , tsLineComment = tsLineComment b <> tsLineComment a
                          , tsDelims      = tsDelims b <> tsDelims a
                          , tsEol         = tsEol b <|> tsEol a
                          , tsStripLeft   = tsStripLeft b <|> tsStripLeft a
                          , tsStripRight  = tsStripRight b <|> tsStripRight a
                          , tsUW          = tsUW b <|> tsUW a
                          , tsNotNormalize = tsNotNormalize b <|> tsNotNormalize a
                          , tsEsc         = tsEsc b <|> tsEsc a
                          , tsAddEmptyFields = tsAddEmptyFields b <|> tsAddEmptyFields a
                          , tsPunct = tsPunct b <> tsPunct a
                          , tsIndent = tsIndent b <|> tsIndent a
                          , tsItabStops = tsItabStops b <|> tsItabStops a
                          , tsKeywords = tsKeywords b <> tsKeywords a
                          }
instance Monoid TokenizeSpec where
  mempty = TokenizeSpec { tsAtoms = mempty
                        , tsStringQQ = Nothing
                        , tsStringQ  = Nothing
                        , tsNoSlits = Nothing
                        , tsLineComment = mempty
                        , tsDelims = mempty
                        , tsEol = Nothing
                        , tsStripLeft = Nothing
                        , tsStripRight = Nothing
                        , tsUW = Nothing
                        , tsNotNormalize = Nothing
                        , tsEsc = Nothing
                        , tsAddEmptyFields = Nothing
                        , tsPunct = mempty
                        , tsIndent = Nothing
                        , tsItabStops = Nothing
                        , tsKeywords = mempty
                        }
justTrue :: Maybe Bool -> Bool
justTrue (Just True) = True
justTrue _ = False
eol :: TokenizeSpec
eol = mempty { tsEol = pure True }
esc :: TokenizeSpec
esc = mempty { tsEsc = pure True }
addEmptyFields :: TokenizeSpec
addEmptyFields = mempty { tsAddEmptyFields = pure True }
emptyFields :: TokenizeSpec
emptyFields = addEmptyFields
nn :: TokenizeSpec
nn = mempty { tsNotNormalize = pure True }
sq :: TokenizeSpec
sq = mempty { tsStringQ = pure True }
sqq :: TokenizeSpec
sqq = mempty { tsStringQQ = pure True }
noslits :: TokenizeSpec
noslits = mempty { tsNoSlits = pure True }
delims :: String -> TokenizeSpec
delims s = mempty { tsDelims = Set.fromList s }
sl :: TokenizeSpec
sl = mempty { tsStripLeft = pure True }
sr :: TokenizeSpec
sr = mempty { tsStripRight = pure True }
uw :: TokenizeSpec
uw = mempty { tsUW = pure True }
comment :: Text -> TokenizeSpec
comment s = mempty { tsLineComment = cmt }
  where
    cmt = case Text.uncons s of
            Just (p,su) -> Map.singleton p su
            Nothing     -> mempty
punct :: Text -> TokenizeSpec
punct s = mempty { tsPunct = Set.fromList (Text.unpack s) }
keywords :: [Text] -> TokenizeSpec
keywords s = mempty { tsKeywords = Set.fromList s }
indent :: TokenizeSpec
indent = mempty { tsIndent = Just True }
itabstops :: Int -> TokenizeSpec
itabstops n = mempty { tsIndent = Just True, tsItabStops = pure n }
newtype TokenizeM w a = TokenizeM (RWS TokenizeSpec w () a)
                        deriving( Applicative
                                , Functor
                                , MonadReader TokenizeSpec
                                , MonadWriter w
                                , MonadState  ()
                                , Monad
                                )
data Token = TChar Char
           | TSChar Char
           | TPunct Char
           | TText Text
           | TSLit Text
           | TKeyword Text
           | TEmpty
           | TDelim
           | TIndent Int
           | TEol
           deriving (Eq,Ord,Show)
class IsToken a where
  
  mkChar   :: Char -> a
  
  mkSChar  :: Char -> a
  
  mkPunct  :: Char -> a
  
  mkText   :: Text -> a
  
  mkStrLit :: Text -> a
  
  mkKeyword :: Text -> a
  
  mkEmpty  :: a
  
  mkDelim  :: a
  mkDelim = mkEmpty
  
  mkIndent :: Int -> a
  mkIndent = const mkEmpty
  
  mkEol :: a
  mkEol = mkEmpty
instance IsToken (Maybe Text) where
  mkChar = pure . Text.singleton
  mkSChar = pure . Text.singleton
  mkPunct = pure . Text.singleton
  mkText = pure
  mkStrLit = pure
  mkKeyword = pure
  mkEmpty = Nothing
instance IsToken Text where
  mkChar   = Text.singleton
  mkSChar  = Text.singleton
  mkPunct  = Text.singleton
  mkText   = id
  mkStrLit = id
  mkKeyword = id
  mkEmpty  = ""
tokenize :: IsToken a => TokenizeSpec -> Text -> [a]
tokenize s t = map tr t1
  where
    t1 = tokenize' s t
    tr (TChar c) = mkChar c
    tr (TSChar c) = mkSChar c
    tr (TText c) = mkText c
    tr (TSLit c) = mkStrLit c
    tr (TKeyword c) = mkKeyword c
    tr TEmpty  = mkEmpty
    tr (TPunct c) = mkPunct c
    tr TDelim  = mkDelim
    tr (TIndent n) = mkIndent n
    tr TEol = mkEol
execTokenizeM :: TokenizeM [Token] a -> TokenizeSpec -> [Token]
execTokenizeM (TokenizeM m) spec =
  let (_,w) = execRWS m spec () in norm w
  where norm x | justTrue (tsNotNormalize spec) = x
               | otherwise = normalize spec x
tokenize' :: TokenizeSpec -> Text -> [Token]
tokenize' spec txt = execTokenizeM (root' txt) spec
  where
    r = spec
    noIndent = not doIndent
    doIndent = justTrue (tsIndent r)
    eolOk = justTrue (tsEol r)
    root' x = scanIndent x >>= root
    root ts = do
      case Text.uncons ts of
        Nothing           -> pure ()
        Just ('\n', rest) | doIndent                  -> raiseEol >> root' rest
        Just (c, rest)    | Set.member c (tsDelims r) -> tell [TDelim]  >> root rest
        Just ('\'', rest) | justTrue (tsStringQ r)    -> scanQ '\'' rest
        Just ('"', rest)  | justTrue (tsStringQQ r)   -> scanQ '"' rest
        Just (c, rest)    | Map.member c (tsLineComment r) -> scanComment (c,rest)
        Just (c, rest)    | Set.member c (tsPunct r)  -> tell [TPunct c] >> root rest
        Just (c, rest)    | otherwise                 -> tell [TChar c] >> root rest
    raiseEol | eolOk = tell [TEol]
             | otherwise = pure ()
    expandSpace ' '  = 1
    expandSpace '\t' = (fromMaybe 8 (tsItabStops r))
    expandSpace _    = 0
    scanIndent x | noIndent = pure x
                 | otherwise = do
      let (ss,as) = Text.span (\c -> c == ' ' || c == '\t') x
      tell [ TIndent (sum (map expandSpace (Text.unpack ss))) ]
      pure as
    scanComment (c,rest) = do
      suff <- Map.lookup c <$> asks tsLineComment
      case suff of
        Just t | Text.isPrefixOf t rest -> do
           root $ Text.dropWhile ('\n' /=) rest
        _  -> tell [TChar c] >> root rest
    scanQ q ts = do
      case Text.uncons ts of
        Nothing           -> root ts
        Just ('\\', rest) | justTrue (tsEsc r) -> unesc (scanQ q) rest
                          | otherwise          -> tell [tsChar '\\'] >> scanQ q rest
        Just (c, rest) | c ==  q   -> root rest
                       | otherwise -> tell [tsChar c] >> scanQ q rest
    unesc f ts =
      case Text.uncons ts of
        Nothing -> f ts
        Just ('"', rs)  -> tell [tsChar '"' ]  >> f rs
        Just ('\'', rs) -> tell [tsChar '\''] >> f rs
        Just ('\\', rs) -> tell [tsChar '\\'] >> f rs
        Just ('t', rs)  -> tell [tsChar '\t'] >> f rs
        Just ('n', rs)  -> tell [tsChar '\n'] >> f rs
        Just ('r', rs)  -> tell [tsChar '\r'] >> f rs
        Just ('a', rs)  -> tell [tsChar '\a'] >> f rs
        Just ('b', rs)  -> tell [tsChar '\b'] >> f rs
        Just ('f', rs)  -> tell [tsChar '\f'] >> f rs
        Just ('v', rs)  -> tell [tsChar '\v'] >> f rs
        Just (_, rs)    -> f rs
    tsChar c | justTrue (tsNoSlits spec) = TChar c
             | otherwise = TSChar c
newtype NormStats = NormStats { nstatBeforeDelim :: Int }
normalize :: TokenizeSpec -> [Token] -> [Token]
normalize spec tokens = snd $ execRWS (go tokens) () init
  where
    go [] = addEmptyField
    go s@(TIndent _ : _) = do
      let (iis, rest') = List.span isIndent s
      tell [TIndent (sum [k | TIndent k <- iis])]
      go rest'
    go (TChar c0 : cs) = do
      let (n,ns) = List.span isTChar cs
      succStat
      let chunk = eatSpaces $ Text.pack (c0 : [ c | TChar c <- n])
      let kw = Set.member chunk (tsKeywords spec)
      tell [ if kw then TKeyword chunk else TText chunk ]
      go ns
    go (TSChar x : xs) = do
      let (n,ns) = List.span isTSChar xs
      succStat
      tell [ TSLit $ Text.pack (x : [ c | TSChar c <- n]) ]
      go ns
    go (TDelim : xs) = do
      addEmptyField
      pruneStat
      go xs
    go (TPunct c : xs) = do
      tell [ TPunct c ]
      succStat
      go xs
    go (x:xs) = tell [x] >> go xs
    succStat = do
      modify (\x -> x { nstatBeforeDelim = succ (nstatBeforeDelim x)})
    pruneStat = do
      modify (\x -> x { nstatBeforeDelim = 0 } )
    addEmptyField = do
      ns <- gets nstatBeforeDelim
      when  (ns == 0 && justTrue (tsAddEmptyFields spec) ) $ do
        tell [ TEmpty ]
    isTChar (TChar _) = True
    isTChar _         = False
    isTSChar (TSChar _) = True
    isTSChar _          = False
    isIndent (TIndent _) = True
    isIndent _           = False
    init = NormStats { nstatBeforeDelim = 0 }
    eatSpaces s | sboth  = Text.strip s
                | sLonly = Text.stripStart s
                | sRonly = Text.stripEnd s
                | sWU    = (Text.unwords . Text.words) s
                | otherwise = s
      where sboth  = justTrue (tsStripLeft spec) && justTrue (tsStripRight spec)
            sLonly = justTrue (tsStripLeft spec) && not (justTrue (tsStripRight spec))
            sRonly = not (justTrue (tsStripLeft spec)) && justTrue (tsStripRight spec)
            sWU    = justTrue (tsUW spec)