-- Copyright (C) 2009 Eric Kow -- -- Permission is hereby granted, free of charge, to any person -- obtaining a copy of this software and associated documentation -- files (the "Software"), to deal in the Software without -- restriction, including without limitation the rights to use, copy, -- modify, merge, publish, distribute, sublicense, and/or sell copies -- of the Software, and to permit persons to whom the Software is -- furnished to do so, subject to the following conditions: -- -- The above copyright notice and this permission notice shall be -- included in all copies or substantial portions of the Software. -- -- THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -- EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF -- MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -- NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS -- BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN -- ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN -- CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -- SOFTWARE. module NLP.FullStop ( segment, testSuite ) where import Data.Char import Data.List import Data.List.Split import Test.HUnit import Test.Framework import Test.Framework.Providers.HUnit import Test.Framework.Providers.QuickCheck2 -- ------------------------------------------------------------ -- -- ------------------------------------------------------------ testSuite :: Test.Framework.Test testSuite = testGroup "NLP.FullStop" [ testGroup "basic sanity checking" [ testProperty "concat (segment s) == id s, modulo whitespace" prop_segment_concat ] , testGroup "segmentation" [ testCaseSegments "simple" ["Foo.", "Bar."] "Foo. Bar." , testCaseSegments "condense" ["What?!", "Yeah"] "What?! Yeah" , testCaseSegments "URLs" ["Check out http://www.example.com.", "OK?"] "Check out http://www.example.com. OK?" , testCaseNoSplit "titles" "Mr. Doe, Mrs. Durand and Dr. Singh" , testCaseNoSplit "initials" "E. Y. Kow" , testCaseNoSplit "numbers" "version 2.3.99.2" -- TODO: what's a good way of dealing with ellipsis? -- TODO: He said "Foo." Bar (tricky because Foo. "Bar" is legit) -- TODO: Very likely to be cases where it's just plain ambiguous ] ] testCaseNoSplit d x = testCaseSegments d [x] x testCaseSegments d xs x = testCase d $ assertEqual "" xs (segment x) -- TODO: perhaps create a newtype that skews the random generation of tests -- towards things that look more like text (but not too much, because we still -- want to make sure we're covering edge-cases) prop_segment_concat s = noWhite s == concatMap noWhite (segment s) where noWhite = filter (not . isSpace) -- ------------------------------------------------------------ -- -- ------------------------------------------------------------ -- | 'segment' @s@ splits @s@ into a list of sentences. -- -- It looks for punctuation characters that indicate an -- end-of-sentence and tries to ignore some uses of -- puncuation which do not correspond to ends of sentences -- -- It's a good idea to view the source code to this module, -- especially the test suite. -- -- I imagine this sort of task is actually ambiguous and that -- you actually won't be able to write an exact segmenter. -- -- It may be a good idea to go see the literature on how to do -- segmentation right, maybe implement something which returns -- the N most probable segmentations instead. segment :: String -> [String] segment = map (dropWhile isSpace) . squish . ( split . condense -- "huh?!" . dropFinalBlank -- strings that end with terminator . keepDelimsR -- we want to preserve terminators $ oneOf stopPunctuation ) stopPunctuation :: [Char] stopPunctuation = [ '.', '?', '!' ] titles :: [String] titles = [ "Mr.", "Mrs.", "Dr." ] -- ------------------------------------------------------------ -- putting some pieces back together -- ------------------------------------------------------------ squish = squishBy (\_ y -> not (startsWithSpace y)) . squishBy (\x _ -> looksLikeAnInitial x) . squishBy (\x _ -> any (`isSuffixOf` x) titles) . squishBy (\x y -> endsWithDigit x && startsWithDigit y) where looksLikeAnInitial [_,'.'] = True looksLikeAnInitial _ = False -- startsW f [] = False startsW f (x:_) = f x -- startsWithDigit = startsW isDigit startsWithSpace = startsW isSpace -- endsWithDigit xs = case reverse xs of ('.':x:_) -> isDigit x _ -> False squishBy f = map concat . groupBy f