-- | Test suite for "NLP.FullStop": example-based segmentation cases plus a
-- QuickCheck sanity property for 'segment'.
module Tests.NLP.FullStop ( suite ) where

import Data.Char ( isSpace )

import Test.HUnit
import Test.Tasty
import Test.Tasty.HUnit
import Test.Tasty.QuickCheck

import NLP.FullStop

-- ------------------------------------------------------------
--
-- ------------------------------------------------------------

-- | All tests for 'segment', grouped into a sanity-check property and a set
-- of hand-written segmentation examples.
suite :: Test.Tasty.TestTree
suite =
  testGroup "NLP.FullStop"
    [ testGroup "basic sanity checking"
        [ testProperty "concat (segment s) == id s, modulo whitespace" prop_segment_concat
        ]
    , testGroup "segmentation"
        [ testCaseSegments "simple" ["Foo.", "Bar."] "Foo. Bar."
        , testCaseSegments "condense" ["What?!", "Yeah"] "What?! Yeah"
        , testCaseSegments "URLs" ["Check out http://www.example.com.", "OK?"]
            "Check out http://www.example.com. OK?"
        , testCaseNoSplit "titles" "Mr. Doe, Mrs. Durand, St. Orolo and Dr. Singh"
        , testCaseNoSplit "abbreviations" "e.g., or eg., i.e. or ie. should not be split"
        , testCaseSegments "abbreviations 2" ["No lie.", "Honestly"] "No lie. Honestly"
        , testCaseNoSplit "initials" "E. Y. Kow"
        , testCaseNoSplit "initials 2" "Hello, E. Y. Kow"
        , testCaseSegments "initials counter" ["E. Y. Kow.", "Hello"] "E. Y. Kow. Hello"
        , testCaseNoSplit "numbers" "version 2.3.99.2"
        -- TODO: what's a good way of dealing with ellipsis (...)?
        -- TODO: He said "Foo." Bar (tricky because Foo. "Bar" is legit)
        -- TODO: Very likely to be cases where it's just plain ambiguous
        ]
    ]

-- | Test case asserting that 'segment' leaves the input as a single segment,
-- i.e. that @segment x == [x]@.
testCaseNoSplit
  :: TestName  -- ^ test case name
  -> String    -- ^ input that must not be split
  -> TestTree
testCaseNoSplit d x = testCaseSegments d [x] x

-- | Test case asserting that 'segment' applied to the input yields exactly
-- the expected list of segments.
testCaseSegments
  :: TestName  -- ^ test case name
  -> [String]  -- ^ expected segments
  -> String    -- ^ input handed to 'segment'
  -> TestTree
testCaseSegments d xs x =
  -- Both Test.HUnit and Test.Tasty.HUnit are imported unqualified; spelling
  -- out the module keeps this reference unambiguous whether or not the two
  -- modules share a single 'assertEqual' (NOTE(review): believed to differ
  -- from tasty-hunit 0.10 onwards -- confirm against the pinned version).
  testCase d $ Test.Tasty.HUnit.assertEqual "" xs (segment x)

-- TODO: perhaps create a newtype that skews the random generation of tests
-- towards things that look more like text (but not too much, because we still
-- want to make sure we're covering edge-cases)

-- | Segmenting must neither drop nor invent non-whitespace characters: the
-- input with all whitespace removed equals the concatenation of every
-- segment with its whitespace removed.
prop_segment_concat :: String -> Bool
prop_segment_concat s =
  noWhite s == concatMap noWhite (segment s)
  where
    noWhite = filter (not . isSpace)