-- Copyright (C) 2009 Eric Kow <eric.kow@gmail.com>
--
-- Permission is hereby granted, free of charge, to any person
-- obtaining a copy of this software and associated documentation
-- files (the "Software"), to deal in the Software without
-- restriction, including without limitation the rights to use, copy,
-- modify, merge, publish, distribute, sublicense, and/or sell copies
-- of the Software, and to permit persons to whom the Software is
-- furnished to do so, subject to the following conditions:
--
-- The above copyright notice and this permission notice shall be
-- included in all copies or substantial portions of the Software.
--
-- THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
-- EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
-- MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
-- NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
-- BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
-- ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
-- CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-- SOFTWARE.

module NLP.FullStop ( segment, testSuite ) where

import Data.Char
import Data.List
import Data.List.Split

import Test.HUnit
import Test.Framework
import Test.Framework.Providers.HUnit
import Test.Framework.Providers.QuickCheck2

-- ------------------------------------------------------------
--
-- ------------------------------------------------------------

testSuite :: Test.Framework.Test
testSuite =
 testGroup "NLP.FullStop"
  [ testGroup "basic sanity checking"
      [ testProperty "concat (segment s) == s, modulo whitespace" prop_segment_concat
      ]
  , testGroup "segmentation"
     [ testCaseSegments "simple"  ["Foo.", "Bar."]   "Foo. Bar."
     , testCaseSegments "condense"  ["What?!", "Yeah"]   "What?! Yeah"
     , testCaseSegments "URLs"    ["Check out http://www.example.com.", "OK?"]
                                   "Check out http://www.example.com. OK?"
     , testCaseNoSplit "titles"    "Mr. Doe, Mrs. Durand and Dr. Singh"
     , testCaseNoSplit "initials"  "E. Y. Kow"
     , testCaseNoSplit "numbers"   "version 2.3.99.2"
     -- TODO: what's a good way of dealing with ellipsis?
     -- TODO: He said "Foo." Bar (tricky because Foo. "Bar" is legit)
     -- TODO: Very likely to be cases where it's just plain ambiguous
     ]
  ]

testCaseNoSplit :: String -> String -> Test.Framework.Test
testCaseNoSplit d x = testCaseSegments d [x] x

testCaseSegments :: String -> [String] -> String -> Test.Framework.Test
testCaseSegments d xs x = testCase d $ assertEqual "" xs (segment x)

-- TODO: perhaps create a newtype that skews the random generation of tests
-- towards things that look more like text (but not too much, because we still
-- want to make sure we're covering edge-cases)
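--
-- A rough sketch of what that newtype might look like (hypothetical and
-- not wired into the suite; it would also need an import of
-- Test.QuickCheck for 'Arbitrary', 'frequency', 'listOf', etc.):
--
-- > newtype TextLike = TextLike String deriving Show
-- >
-- > instance Arbitrary TextLike where
-- >   arbitrary = fmap (TextLike . concat) (listOf chunk)
-- >     where
-- >       chunk = frequency
-- >         [ (6, listOf1 (elements ['a'..'z']))  -- word-like runs
-- >         , (2, return " ")                     -- spaces
-- >         , (1, elements [".", "?", "!"])       -- sentence terminators
-- >         , (1, arbitrary)                      -- keep some arbitrary noise
-- >         ]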

prop_segment_concat :: String -> Bool
prop_segment_concat s =
   noWhite s == concatMap noWhite (segment s)
 where
   noWhite = filter (not . isSpace)

-- ------------------------------------------------------------
--
-- ------------------------------------------------------------

-- | 'segment' @s@ splits @s@ into a list of sentences.
--
--   It looks for punctuation characters that indicate the end of a
--   sentence, and tries to ignore uses of punctuation which do not
--   correspond to sentence boundaries (titles, initials, numbers).
--
--   It's a good idea to view the source code of this module,
--   especially the test suite.
--
--   This sort of task is inherently ambiguous, so don't expect an
--   exact segmenter.
--
--   It may be a good idea to consult the literature on how to do
--   segmentation properly, and perhaps to implement something which
--   returns the N most probable segmentations instead.
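--
--   For example (these mirror the test cases in 'testSuite' above):
--
--   >>> segment "Foo. Bar."
--   ["Foo.","Bar."]
--
--   >>> segment "Mr. Doe, Mrs. Durand and Dr. Singh"
--   ["Mr. Doe, Mrs. Durand and Dr. Singh"]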
segment :: String -> [String]
segment = map (dropWhile isSpace) . squish
        . ( split
          . condense       -- merge runs of terminators, e.g. "huh?!"
          . dropFinalBlank -- no trailing empty chunk after a final terminator
          . keepDelimsR    -- keep terminators attached to the preceding chunk
          $ oneOf stopPunctuation
          )
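
-- For instance, "What?! Yeah" comes out of the split stage above as
-- ["What?!", " Yeah"] (the adjacent '?' and '!' are condensed into a
-- single delimiter); 'squish' and the whitespace trimming then yield
-- ["What?!", "Yeah"], as in the "condense" test case.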

stopPunctuation :: [Char]
stopPunctuation = [ '.', '?', '!' ]

titles :: [String]
titles = [ "Mr.", "Mrs.", "Dr." ]

-- ------------------------------------------------------------
-- putting some pieces back together
-- ------------------------------------------------------------
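
-- For instance, the split stage above breaks "E. Y. Kow" into
-- ["E.", " Y.", " Kow"]; the 'looksLikeAnInitial' pass below glues
-- those chunks back together, so the "initials" test case stays in
-- one piece.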

squish :: [String] -> [String]
squish = squishBy (\_ y -> not (startsWithSpace y))
       . squishBy (\x _ -> looksLikeAnInitial x)
       . squishBy (\x _ -> any (`isSuffixOf` x) titles)
       . squishBy (\x y -> endsWithDigit x  && startsWithDigit y)
 where
  looksLikeAnInitial [_,'.'] = True
  looksLikeAnInitial _ = False
  --
  startsW f [] = False
  startsW f (x:_) = f x
  --
  startsWithDigit = startsW isDigit
  startsWithSpace = startsW isSpace
  --
  endsWithDigit xs =
    case reverse xs of
     ('.':x:_) -> isDigit x
     _ -> False

squishBy :: ([a] -> [a] -> Bool) -> [[a]] -> [[a]]
squishBy f = map concat . groupBy f
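
-- Note that 'Data.List.groupBy' compares each candidate element against
-- the first element of the current group (not against its immediate
-- predecessor), which is worth keeping in mind when reading the
-- predicates passed to 'squishBy' above.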