-- | An alphabet, where each character is a short piece of @Text@.

module NLP.Alphabet.MultiChar where

import           Control.DeepSeq (NFData(..))
import           Data.Function (on)
import           Data.Hashable
import           Data.Interned
import           Data.Interned.Internal (getCache)
import           Data.Stringable
import           Data.String (IsString)
import qualified Data.ByteString.Short as BS
import qualified Data.ByteString.Short.Internal as BS
import qualified Data.String as S
import qualified Data.Text as T
import qualified Data.Text.Encoding as T
import           GHC.Generics
import qualified Data.HashMap.Strict as HM
import qualified Data.Array as A
import           Data.Typeable (Typeable)
import           Data.Data (Data)



-- * 'MultiChar's capture UTF characters that are encoded using one or more
-- symbols.

-- | Interns a 'MultiChar' character.

internMultiChar :: MultiChar -> MultiChar
internMultiChar = uninternMultiChar . intern
{-# Inline internMultiChar #-}

-- | Wrap a short bytestring. Read and Show instances behave like for normal
-- strings.

newtype MultiChar = MultiChar { getMultiChar :: T.Text }
  deriving (Eq,Ord,Generic,Data,Typeable)

instance Show MultiChar where
  showsPrec p (MultiChar mc) r = showsPrec p (toString mc) r
  {-# Inline showsPrec #-}

instance Read MultiChar where
  readsPrec p str = [ (MultiChar x, y) | (x,y) <- readsPrec p str ]
  {-# Inline readsPrec #-}

instance Hashable MultiChar

instance IsString MultiChar where
  fromString = MultiChar . S.fromString
  {-# Inline fromString #-}

instance Stringable MultiChar where
  toString   = T.unpack . getMultiChar
  fromString = MultiChar . T.pack
  length     = T.length . getMultiChar
  fromText   = MultiChar
  toText     = getMultiChar
  {-# Inline toString   #-}
  {-# Inline fromString #-}
  {-# Inline length     #-}
  {-# Inline fromText   #-}
  {-# Inline toText     #-}

instance NFData MultiChar where
  rnf = rnf . getMultiChar
  {-# Inline rnf #-}



-- * Interned

-- | Interned 'MultiChar'.
--
-- TODO Check 'Ord' instance. We @compare `on` uninternMultiChar@.

data InternedMultiChar = InternedMultiChar
  { internedMultiCharId :: {-# UNPACK #-} !Id
  , uninternMultiChar   :: {-# UNPACK #-} !MultiChar
  }
  deriving (Generic,Data,Typeable)

instance IsString InternedMultiChar where
  fromString = intern . S.fromString
  {-# Inline fromString #-}

instance Eq InternedMultiChar where
  (==) = (==) `on` internedMultiCharId
  {-# Inline (==) #-}

instance Ord InternedMultiChar where
  compare = compare `on` uninternMultiChar -- internedMultiCharId
  {-# Inline compare #-}

instance Read InternedMultiChar where
  readsPrec p str = [ (intern x, y) | (x,y) <- readsPrec p str ]
  {-# Inline readsPrec #-}

instance Show InternedMultiChar where
  showsPrec d (InternedMultiChar _ mc) = showsPrec d mc
  {-# Inline showsPrec #-}

instance Hashable InternedMultiChar where
  hashWithSalt salt = hashWithSalt salt . internedMultiCharId
  hash              = hash . internedMultiCharId
  {-# Inline hashWithSalt #-}
  {-# Inline hash         #-}

instance Interned InternedMultiChar where
  type Uninterned InternedMultiChar = MultiChar
  newtype Description InternedMultiChar = DMC MultiChar deriving (Eq,Hashable)
  describe = DMC . MultiChar . T.copy . getMultiChar -- @DMC@ alone is type-correct. With 'T.copy' we make sure not to keep long @Text@s. TODO benchmark!
  identify = InternedMultiChar
  cache    = imcCache
  {-# Inline describe #-}
  {-# Inline identify #-}
  {-# Inline cache    #-}

imcCache :: Cache InternedMultiChar
imcCache = mkCache
{-# NOINLINE imcCache #-}

instance Stringable InternedMultiChar where
  toString   = toString . uninternMultiChar
  fromString = intern . fromString
  length     = Data.Stringable.length . uninternMultiChar
  toText     = toText . uninternMultiChar
  fromText   = intern . fromText
  {-# Inline toString   #-}
  {-# Inline fromString #-}
  {-# Inline length     #-}
  {-# Inline toText     #-}
  {-# Inline fromText   #-}

instance NFData InternedMultiChar where
  rnf (InternedMultiChar i c) = rnf i `seq` rnf c
  {-# Inline rnf #-}