module Linguistics.Word where
import Control.Applicative
import Control.DeepSeq
import Data.ByteString (ByteString)
import Data.Interned
import Data.Interned.ByteString
import qualified Data.Attoparsec.ByteString as AB
import qualified Data.Attoparsec.ByteString.Char8 as AB hiding (takeWhile1)
import qualified Data.Attoparsec.ByteString.Lazy as ABL
import qualified Data.ByteString as B
import qualified Data.ByteString.Lazy as BL hiding (unpack)
import qualified Data.ByteString.Lazy.Char8 as BL hiding (readFile)
import qualified Data.ByteString.Short as S
import qualified Data.Vector.Unboxed as VU
import Prelude hiding (Word)
import Data.Stringable
import NLP.Text.BTI
data Word = Word
{ wordID :: !Int
, wordClass :: !BTI
, wordLang :: !BTI
, wordLength :: !Int
, wordWord :: !(VU.Vector BTI)
}
deriving (Show,Eq,Ord)
instance NFData Word where
rnf !(Word {}) = ()
parseWord :: BL.ByteString -> Word
parseWord w = case ABL.eitherResult (ABL.parse go w) of
Left err -> error err
Right p -> force p
where
go = Word <$> AB.decimal
<* AB.many1 AB.space
<*> (wW <$> wrd)
<*> (wW <$> wrd)
<*> AB.decimal
<* AB.many1 AB.space
<*> ((VU.fromList . map wW) <$> (AB.takeWhile1 (not . AB.isHorizontalSpace) `AB.sepBy` AB.space))
wrd = AB.takeWhile1 (not . AB.isHorizontalSpace) <* AB.space
wW = fromByteString
addWordDelims :: Word -> Word
addWordDelims w
| VU.length ww >= 2 && VU.head ww == "^" && VU.last ww == "$" = w
| otherwise = w { wordWord = "^" `VU.cons` wordWord w `VU.snoc` "$" }
where ww = wordWord w
removeWordDelims :: Word -> Word
removeWordDelims w
| VU.length ww >= 2 && VU.head ww == "^" && VU.last ww == "$" = w { wordWord = VU.init . VU.tail $ wordWord w }
| otherwise = w
where ww = wordWord w