-- Copyright (C) 2009 Eric Kow <eric.kow@gmail.com>
--
-- Permission is hereby granted, free of charge, to any person
-- obtaining a copy of this software and associated documentation
-- files (the "Software"), to deal in the Software without
-- restriction, including without limitation the rights to use, copy,
-- modify, merge, publish, distribute, sublicense, and/or sell copies
-- of the Software, and to permit persons to whom the Software is
-- furnished to do so, subject to the following conditions:
--
-- The above copyright notice and this permission notice shall be
-- included in all copies or substantial portions of the Software.
--
-- THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
-- EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
-- MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
-- NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
-- BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
-- ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
-- CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-- SOFTWARE.

-- | Naive sentence segmentation: split text on sentence-final
--   punctuation, then re-join fragments whose punctuation does not
--   actually end a sentence (titles, abbreviations, initials,
--   decimal numbers).
module NLP.FullStop
  ( segment
  , testSuite
  ) where

import Data.Char
import Data.List
import Data.List.Split
import Test.HUnit
import Test.Framework
import Test.Framework.Providers.HUnit
import Test.Framework.Providers.QuickCheck2

-- ------------------------------------------------------------
-- test suite
-- ------------------------------------------------------------

-- | The module's test suite: one sanity property plus a set of
--   example-based segmentation cases (including the known tricky
--   cases: titles, abbreviations, initials and version numbers).
testSuite :: Test.Framework.Test
testSuite =
  testGroup "NLP.FullStop"
    [ testGroup "basic sanity checking"
        [ testProperty "concat (segment s) == id s, modulo whitespace"
            prop_segment_concat
        ]
    , testGroup "segmentation"
        [ testCaseSegments "simple"   ["Foo.", "Bar."]   "Foo. Bar."
        , testCaseSegments "condense" ["What?!", "Yeah"] "What?! Yeah"
        , testCaseSegments "URLs"
            ["Check out http://www.example.com.", "OK?"]
            "Check out http://www.example.com. OK?"
        , testCaseNoSplit "titles"
            "Mr. Doe, Mrs. Durand, St. Orolo and Dr. Singh"
        , testCaseNoSplit "abbreviations"
            "e.g., or eg., i.e. or ie. should not be split"
        , testCaseSegments "abbreviations 2"
            ["No lie.", "Honestly"] "No lie. Honestly"
        , testCaseNoSplit "initials"   "E. Y. Kow"
        , testCaseNoSplit "initials 2" "Hello, E. Y. Kow"
        , testCaseSegments "initials counter"
            [ "E. Y. Kow.", "Hello" ] "E. Y. Kow. Hello"
        , testCaseNoSplit "numbers" "version 2.3.99.2"
        -- TODO: what's a good way of dealing with ellipsis (...)?
        -- TODO: He said "Foo." Bar (tricky because Foo. "Bar" is legit)
        -- TODO: Very likely to be cases where it's just plain ambiguous
        ]
    ]

-- | Expect the input string to come back as a single segment.
testCaseNoSplit :: String -> String -> Test.Framework.Test
testCaseNoSplit d x = testCaseSegments d [x] x

-- | Expect @segment x@ to produce exactly the given list of segments.
testCaseSegments :: String -> [String] -> String -> Test.Framework.Test
testCaseSegments d xs x = testCase d $ assertEqual "" xs (segment x)

-- TODO: perhaps create a newtype that skews the random generation of tests
-- towards things that look more like text (but not too much, because we still
-- want to make sure we're covering edge-cases)

-- | Segmentation never loses or invents non-whitespace characters.
prop_segment_concat :: String -> Bool
prop_segment_concat s =
  noWhite s == concatMap noWhite (segment s)
 where
  noWhite = filter (not . isSpace)

-- ------------------------------------------------------------
-- segmentation
-- ------------------------------------------------------------

-- | 'segment' @s@ splits @s@ into a list of sentences.
--
-- It looks for punctuation characters that indicate an
-- end-of-sentence and tries to ignore some uses of
-- punctuation which do not correspond to ends of sentences
--
-- It's a good idea to view the source code to this module,
-- especially the test suite.
--
-- I imagine this sort of task is actually ambiguous and that
-- you actually won't be able to write an exact segmenter.
--
-- It may be a good idea to go see the literature on how to do
-- segmentation right, maybe implement something which returns
-- the N most probable segmentations instead.
segment :: String -> [String]
segment = map (dropWhile isSpace) . squish . breakup

-- | Helper function to segment: split on stop punctuation, keeping
--   each terminator attached to the end of its preceding fragment.
breakup :: String -> [String]
breakup = split . condense       -- "huh?!"
                . dropFinalBlank -- strings that end with terminator
                . keepDelimsR    -- we want to preserve terminators
                $ oneOf stopPunctuation

-- | Characters treated as potential end-of-sentence markers.
stopPunctuation :: [Char]
stopPunctuation = [ '.', '?', '!' ]

-- ------------------------------------------------------------
-- putting some pieces back together
-- ------------------------------------------------------------

-- | Re-join adjacent fragments whose boundary punctuation does not
--   really end a sentence: known abbreviations, a terminator not
--   followed by whitespace, personal initials, titles, and decimal
--   numbers.  Each 'squishBy' pass handles one of these cases.
squish :: [String] -> [String]
squish = squishBy (\x _ -> any (`isWordSuffixOf` x) abbreviations)
       . squishBy (\_ y -> not (startsWithSpace y))
       . squishBy (\x _ -> looksLikeAnInitial (dropWhile isSpace x))
       . squishBy (\x _ -> any (`isWordSuffixOf` x) titles)
       . squishBy (\x y -> endsWithDigit x && startsWithDigit y)
 where
  -- a lone capital letter followed by '.', e.g. the "E." of "E. Y. Kow"
  looksLikeAnInitial x =
    case reverse x of
      ('.':i:[])  -> isUpper i
      ('.':i:s:_) -> isSpace s && isUpper i
      _           -> False
  --
  startsW _ []    = False
  startsW f (c:_) = f c
  --
  startsWithDigit = startsW isDigit
  startsWithSpace = startsW isSpace
  -- terminators are kept at the end of each fragment, so "ends with
  -- digit" means the fragment's final characters are "<digit>."
  endsWithDigit xs =
    case reverse xs of
      ('.':c:_) -> isDigit c
      _         -> False

-- | This is *not* (map concat . groupBy f) because the latter
--   just checks equality on the first element of each group.
--   We, on the other hand, want to check by the nearest neighbour.
squishBy :: (String -> String -> Bool) -> [String] -> [String]
squishBy _  []     = []
squishBy eq (x:xs) = map concat (helper [] x xs)
 where
  -- accumulate fragments (in reverse) while the predicate says the
  -- current fragment should be glued to its right-hand neighbour
  helper acc x0 []      = [assemble acc x0]
  helper acc x0 (x1:ys)
    | x0 `eq` x1 = helper (x0:acc) x1 ys
    | otherwise  = assemble acc x0 : helper [] x1 ys
  assemble acc x0 = reverse (x0 : acc)

-- | Honorifics whose trailing '.' never ends a sentence.
titles :: [String]
titles = [ "Mr.", "Mrs.", "Dr.", "St." ]

-- | Common abbreviations whose trailing '.' never ends a sentence.
abbreviations :: [String]
abbreviations = [ "cf.", "eg.", "ie.", "i.e.", "e.g." ]

-- | @x `isWordSuffixOf` y@ is 'True' when @x@ is a suffix of @y@ and
--   the match begins at a word boundary, i.e. the character just
--   before the matched suffix (if any) is not a letter.
isWordSuffixOf :: String -> String -> Bool
x `isWordSuffixOf` y
  | x `isSuffixOf` y =
      case drop (length x) (reverse y) of
        []    -> True               -- x == y
        (z:_) -> not (isAlpha z)    -- char preceding the suffix in y
  | otherwise = False