{-# LANGUAGE OverloadedStrings #-}

module NLP.Punkt where

import qualified Data.Text as Text
import Data.Text (Text)
import Data.Maybe (fromMaybe, mapMaybe)
import Data.HashMap.Strict (HashMap)
import Data.Char (isLower, isAlpha, isSpace)
import qualified Data.HashMap.Strict as Map
import qualified Data.List as List
import Control.Applicative ((<$>), (<*>), (<|>))
import qualified Control.Monad.Reader as Reader

import NLP.Punkt.Match (re_split_pos, word_seps)

-- | Carries various orthographic statistics for a particular textual type.
data OrthoFreq = OrthoFreq {
    freq_lower :: Int,
    -- ^ number of lowercase occurrences
    freq_upper :: Int,
    -- ^ number of uppercase occurrences
    freq_first_lower :: Int,
    -- ^ number of lowercase occurrences in the first position of a sentence
    freq_internal_upper :: Int,
    -- ^ number of uppercase occurrences strictly internal to a sentence
    freq_after_ender :: Int
    -- ^ number of occurrences directly following a sentence-ending token
    }
    deriving Show

-- | Represents the training data, collected from a corpus, that Punkt
-- requires.
data PunktData = PunktData {
    type_count :: HashMap Text Int,
    -- ^ Occurrences of each textual type, case-insensitive. Used during
    -- Punkt's type-based stage. Types seen with a trailing period are counted
    -- separately under their dotted form, and the key @"."@ accumulates the
    -- total number of trailing periods.
    ortho_count :: HashMap Text OrthoFreq,
    -- ^ Dictionary of orthographic data for each textual type.
    collocations :: HashMap (Text, Text) Int,
    -- ^ Occurrences of each pair of adjacent word types, case-insensitive.
    total_enders :: Int,
    -- ^ Number of sentence-ending tokens observed in the training stream.
    total_toks :: Int
    -- ^ Total number of word tokens in the corpus.
    }
    deriving Show
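
-- A model is built once per corpus with 'build_punkt_data' and then
-- consulted through the 'Punkt' reader monad via 'runPunkt'.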

-- | A lexical item in the token stream. For 'Word', the 'Bool' flag records
-- whether the word carried a trailing period.
data Entity a = Word a Bool | Punct a | ParaStart | Ellipsis | Dash
    deriving (Eq, Show)

data Token = Token {
    offset :: Int,
    toklen :: Int,
    entity :: Entity Text,
    sentend :: Bool,
    abbrev :: Bool
    }
    deriving Show

type Punkt = Reader.Reader PunktData

norm :: Text -> Text
norm = Text.toLower

is_initial :: Token -> Bool
is_initial (Token {entity=Word w True}) =
    Text.length w == 1 && isAlpha (Text.head w)
is_initial _ = False

is_word :: Token -> Bool
is_word tok = case entity tok of { Word _ _ -> True; _ -> False; }

-- | Dunning log likelihood modified by Kiss/Strunk
strunk_log :: Double -> Double -> Double -> Double -> Double
strunk_log a b ab n = -2 * (null_hyp - alt_hyp)
    where
    null_hyp = ab * log p1 + (a - ab) * log (1 - p1)
    alt_hyp = ab * log p2 + (a - ab) * log (1 - p2)
    (p1, p2) = (b / n, 0.99)
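
-- The score is @-2 * (log L(null) - log L(alt))@, where both hypotheses are
-- binomial log likelihoods of seeing @ab@ joint events out of @a@: the null
-- puts p = b/n, while the alternative fixes p = 0.99 (the Kiss/Strunk
-- modification).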

-- | Dunning's original log likelihood
dunning_log :: Double -> Double -> Double -> Double -> Double
dunning_log a b ab n | b == 0 || ab == 0 = 0
                     | otherwise = -2 * (s1 + s2 - s3 - s4)
    where
    (p0, p1, p2) = (b / n, ab / a, (b - ab) / (n - a))
    s1 = ab * log p0 + (a - ab) * log (1 - p0)
    s2 = (b - ab) * log p0 + (n - a - b + ab) * log (1 - p0)
    s3 = if a == ab then 0 else ab * log p1 + (a - ab) * log (1 - p1)
    s4 = if b == ab then 0 else
        (b - ab) * log p2 + (n - a - b + ab) * log (1 - p2)
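
-- Here p0 = b/n is the rate under independence, p1 = ab/a the rate inside
-- the context of interest, and p2 = (b-ab)/(n-a) the rate outside it; the
-- guards on @s3@ and @s4@ avoid evaluating @log 0@ for degenerate counts.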

ask_type_count :: Punkt (HashMap Text Int)
ask_type_count = Reader.asks type_count

ask_total_toks :: Num a => Punkt a
ask_total_toks = Reader.asks (fromIntegral . total_toks)

ask_total_enders :: Num a => Punkt a
ask_total_enders = Reader.asks (fromIntegral . total_enders)

ask_ortho :: Text -> Punkt OrthoFreq
ask_ortho w_ = Map.lookupDefault (OrthoFreq 0 0 0 0 0) (norm w_)
               <$> Reader.asks ortho_count

ask_colloc :: Text -> Text -> Punkt Double
ask_colloc w0_ w1_ =
    fromIntegral . Map.lookupDefault 0 (norm w0_, norm w1_)
    <$> Reader.asks collocations

-- | Occurrences of a textual type, strictly ignoring trailing period.
-- @c(w, ~.)@. Case-insensitive.
freq :: Text -> Punkt Double
freq w_ = fromIntegral . Map.lookupDefault 0 w <$> ask_type_count
    where w = norm w_

-- | Occurrences of a textual type with trailing period. @c(w, .)@.
-- Case-insensitive.
freq_snoc_dot :: Text -> Punkt Double
freq_snoc_dot w_ = freq wdot where wdot = w_ `Text.snoc` '.'
-- ('freq' re-normalizes wdot, a potential slowdown unless GHC can see that
-- norm "." == ".")

-- | @c(w) == c(w, .) + c(w, ~.)@. Case-insensitive.
freq_type :: Text -> Punkt Double
freq_type w_ = (+) <$> freq w_ <*> freq_snoc_dot w_

dlen :: Text -> Double
dlen = fromIntegral . Text.length

-- | Returns the log likelihood that (w_ `snoc` '.') is an abbreviation.
-- Case-insensitive.
prob_abbr :: Text -> Punkt Double
prob_abbr w_ = compensate =<< strunk_log <$> freq_type w_ <*> freq "."
                                         <*> freq_snoc_dot w_ <*> ask_total_toks
    where
    compensate loglike = do
        p <- freq w_  -- c(w, ~.): occurrences of the bare type
        let f_penalty = 1 / dlen (Text.filter (/= '.') w_) ** p
        return $ loglike * f_len * f_periods * f_penalty
    f_len = 1 / exp (dlen $ Text.filter (/= '.') w_)
    f_periods = 1 + dlen (Text.filter (== '.') w_)
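
-- For instance, with @w_ = "u.s.a"@ (the tokenizer strips the trailing
-- period first), the period-free length is 3, so @f_len = 1 / exp 3@ and
-- @f_periods = 1 + 2 = 3@; @f_penalty@ further discounts types that also
-- occur without a trailing period.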

-- | Decides, from the capitalization of @w@ and its orthographic history,
-- whether a sentence boundary immediately precedes @w@. Case-insensitive.
decide_ortho :: Text -> Punkt (Maybe Bool)
decide_ortho w_ = decide' w_ <$> ask_ortho w_
    where
    decide' w_ wortho
        | title && ever_lower && never_title_internal = Just True
        | lower && (ever_title || never_lower_start) = Just False
        | otherwise = Nothing
        where
        lower = isLower $ Text.head w_
        title = not lower
        ever_lower = freq_lower wortho > 0
        never_title_internal = freq_internal_upper wortho == 0
        ever_title = freq_upper wortho > 0
        never_lower_start = freq_first_lower wortho == 0

-- | Special orthographic heuristic for the token that follows a possible
-- initial. Case-insensitive.
decide_initial_ortho :: Text -> Punkt (Maybe Bool)
decide_initial_ortho w_ = do
    neverlower <- (== 0) . freq_lower <$> ask_ortho w_
    orthosays <- decide_ortho w_
    return $ orthosays <|> if neverlower then Just False else Nothing
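
-- If the token after a possible initial never occurs lowercase anywhere in
-- the corpus, the period is assumed to belong to the initial (@Just False@)
-- rather than to end a sentence.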

-- | Log likelihood that @w@ is a frequent sentence starter. Case-insensitive.
prob_starter :: Text -> Punkt Double
prob_starter w_ = dunning_log <$> ask_total_enders <*> freq_type w_
                              <*> fafterend <*> ask_total_toks
    where fafterend = fromIntegral . freq_after_ender <$> ask_ortho w_

-- | Computes the collocational likelihood of @w@ and @x@. Case-insensitive.
prob_colloc :: Text -> Text -> Punkt Double
prob_colloc w_ x_ = dunning_log <$> freq_type w_ <*> freq_type x_
                                <*> ask_colloc w_ x_ <*> ask_total_toks
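
-- In 'classify_by_next', this score is compared against 7.88 and the
-- 'prob_starter' score against 30; these appear to be the Kiss/Strunk
-- thresholds (7.88 being the chi-square critical value at p = 0.005 with
-- one degree of freedom).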

-- | Builds a dictionary of textual type frequencies from a stream of tokens.
build_type_count :: [Token] -> HashMap Text Int
build_type_count = List.foldl' update initcount
    where
    initcount = Map.singleton "." 0

    update ctr (Token {entity=(Word w per)})
        | per = Map.adjust (+ 1) "." ctr_
        | otherwise = ctr_
        where
        ctr_ = Map.insertWith (+) wnorm 1 ctr
        wnorm = norm $ if per then w `Text.snoc` '.' else w
    update ctr _ = ctr

    -- TODO: catch possible abbreviations wrapped in hyphenated and apostrophe
    -- forms in lexer
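
-- For a word token "etc" carrying a trailing period, both the dotted type
-- "etc." and the global @"."@ counter are incremented; without the period,
-- only "etc" is counted.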

build_ortho_count :: [Token] -> HashMap Text OrthoFreq
build_ortho_count toks = List.foldl' update Map.empty $
                            zip (dummy : toks) toks
    where
    dummy = Token 0 0 (Word " " False) True False
    -- hack: add dummy to process first token

    update :: HashMap Text OrthoFreq -> (Token, Token) -> HashMap Text OrthoFreq
    update ctr (prev, Token {entity=(Word w _)}) = Map.insert wnorm wortho ctr
        where
        upd (OrthoFreq a b c d e) a' b' c' d' e' =
            OrthoFreq (a |+ a') (b |+ b') (c |+ c') (d |+ d') (e |+ e')
            where int |+ bool = if bool then int + 1 else int

        wortho = upd z lower (not lower) (first && lower)
                       (internal && not lower) first
        z = Map.lookupDefault (OrthoFreq 0 0 0 0 0) wnorm ctr
        wnorm = norm w
        lower = isLower $ Text.head w
        first = sentend prev && not (is_initial prev)
        internal = not (sentend prev) && not (abbrev prev)
                   && not (is_initial prev)
    update ctr _ = ctr
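
-- A token counts as sentence-@first@ when the previous token ends a sentence
-- and is not itself a possible initial; it counts as sentence-@internal@
-- when the previous token is neither an ender, an abbreviation, nor an
-- initial.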

build_collocs :: [Token] -> HashMap (Text, Text) Int
build_collocs toks = List.foldl' update Map.empty $ zip toks (drop 1 toks)
    where
    update ctr (Token {entity=(Word u _)}, Token {entity=(Word v _)}) =
        Map.insertWith (+) (norm u, norm v) 1 ctr
    update ctr _ = ctr
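
-- For a stream of word tokens "foo", "bar", "baz" this yields
-- {("foo","bar") -> 1, ("bar","baz") -> 1}; a non-word entity between two
-- words breaks the chain, since both members of a pair must be 'Word's.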

to_tokens :: Text -> [Token]
to_tokens corpus = mapMaybe (either tok_word add_delim) $
                        re_split_pos word_seps corpus
    where
    tok_word (w, pos)
        | trim == "" = Nothing
        | otherwise = Just $ Token pos (len trim) (Word s period) False False
        where
        trim = Text.dropAround (`elem` (",:()[]{}“”‘’\"\'" :: String)) w
        period = Text.last trim == '.'
        s = if period then Text.init trim else trim

    add_delim (delim, pos)
        | d `elem` ("—-" :: String) = Just $ Token pos (len delim) Dash False False
        | d `elem` (".…" :: String) = Just $ Token pos (len delim) Ellipsis False False
        | d `elem` (";!?" :: String) = Just $ Token pos (len delim) (Punct delim) True False
        | otherwise = Nothing
        where d = Text.head delim

    len = Text.length
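
-- As an example of the trimming logic, the local @tok_word ("(hello.", 0)@
-- yields @Just (Token 0 6 (Word "hello" True) False False)@: the leading
-- parenthesis is stripped, the trailing period is recorded in the 'Word'
-- flag, and the token length is that of the trimmed text.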

build_punkt_data :: [Token] -> PunktData
build_punkt_data toks = PunktData typecnt orthocnt collocs nender totes
    where
    typecnt = build_type_count toks
    temppunkt = PunktData typecnt Map.empty Map.empty 0 (length toks)
    -- provisional model: the type-based pass only needs 'type_count' and a
    -- rough token total
    refined = runPunkt temppunkt $ mapM classify_by_type toks
    orthocnt = build_ortho_count refined
    collocs = build_collocs refined
    nender = length . filter (sentend . fst) $ zip (dummy : refined) refined
    dummy = Token 0 0 (Word " " False) True False
    totes = length $ filter is_word toks

classify_by_type :: Token -> Punkt Token
classify_by_type tok@(Token {entity=(Word w True)}) = do
    p <- prob_abbr w
    return $ tok { abbrev = p >= 0.3, sentend = p < 0.3}
classify_by_type tok = return tok
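
-- The 0.3 cutoff appears to be the abbreviation threshold from Kiss and
-- Strunk: scores at or above it mark the token as an abbreviation, anything
-- below as a sentence ender.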

classify_by_next :: Token -> Token -> Punkt Token
classify_by_next this (Token _ _ (Word next _) _ _)
    | is_initial this = do
        let Word thisinitial _ = entity this
        colo <- prob_colloc thisinitial next
        startnext <- prob_starter next
        orthonext <- decide_initial_ortho next
        return $ if (colo >= 7.88 && startnext < 30) || orthonext == Just False
            then this { abbrev = True, sentend = False}
            else this  -- never reclassify as sentend
    | entity this == Ellipsis || abbrev this = do
        ortho_says <- decide_ortho next
        prob_says <- prob_starter next
        return $ case ortho_says of
            Nothing -> this { sentend = prob_says >= 30 }
            Just bool -> this { sentend = bool }
classify_by_next this _ = return this

classify_punkt :: Text -> [Token]
classify_punkt corpus = runPunkt (build_punkt_data toks) $ do
    abbrd <- mapM classify_by_type toks
    final <- Reader.zipWithM classify_by_next abbrd (drop 1 abbrd)
    return $ final ++ [last abbrd]  -- final token has no successor; keep its
                                    -- type-based classification
    where toks = to_tokens corpus

find_breaks :: Text -> [(Int, Int)]
find_breaks corpus = slices_from endpairs 0
    where
    pairs_of xs = zip xs $ drop 1 xs
    endpairs = filter (sentend . fst) . pairs_of $ classify_punkt corpus

    -- TODO: make this less convoluted
    slices_from [] n = [(n, Text.length corpus)]
    slices_from ((endtok, nexttok) : pairs) n =
        (n, endpos + end) : slices_from pairs (endpos + n')
        where
        endpos = offset endtok + toklen endtok
        -- @end@ and @n'@ are offsets relative to @endpos@; when no whitespace
        -- follows the ender, fall back to breaking immediately after it
        (end, n') = fromMaybe (0, 1) . match_spaces $
            substring corpus endpos (offset nexttok)

substring :: Text -> Int -> Int -> Text
substring c s e = Text.take (e - s) $ Text.drop s c
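-- e.g. @substring "hello world" 6 11 == "world"@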

match_spaces :: Text -> Maybe (Int, Int)
match_spaces w = Text.findIndex isSpace w >>= \p ->
    case Text.break notSpace (Text.drop p w) of
        (spaces, _) -> Just (p, Text.length spaces + p)
    where notSpace = not . isSpace
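-- e.g. @match_spaces "ab  cd" == Just (2, 4)@: the first space is at index 2
-- and the whitespace run ends just before index 4.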

-- | Main export of the entire package. Splits a corpus into its constituent
-- sentences.
split_sentences :: Text -> [Text]
split_sentences corpus = map (uncurry $ substring corpus) slices
    where slices = find_breaks corpus

-- | @runPunkt data computation@ runs @computation@ using @data@ collected from
-- a corpus using 'build_punkt_data'.
runPunkt :: PunktData -> Punkt a -> a
runPunkt = flip Reader.runReader
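
-- A sketch of a custom query against a trained model (here @corpus@ stands
-- for any 'Text' document):
--
-- @
-- let model = build_punkt_data (to_tokens corpus)
-- in runPunkt model ((,) <$> prob_abbr "etc" <*> prob_starter "the")
-- @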