{-# LANGUAGE UnicodeSyntax #-} -- | Example usage: -- -- >>> dat <- loadData' -- >>> head $ guess dat "this is a teststring" -- ("en",0.49421052631578954) -- >>> take 2 $ guess dat "dette er en teststreng" -- [("no",0.5703030303030303),("da",0.5096969696969698)] -- >>> head $ guess dat "lorem ipsum dolor sit amet" -- ("la",0.34199999999999997) module Language.Guess where import Control.Applicative ((<$>)) import Data.Char import Data.Function (on) import qualified Data.Map as M import Data.Map (Map) import Data.Maybe (fromMaybe) #if MIN_VERSION_base(4,3,0) import Data.Tuple (swap) #else import Data.Tuple.HT (swap) #endif import Data.List (sort, sortBy) import qualified Data.ByteString.Char8 as BS import Data.Serialize import Paths_language_guess type Trigram = (Char, Char, Char) type Frequency = Int type Rank = Int type Language = String threshold ∷ Int threshold = 300 -- | Load a cerealized file. loadData ∷ FilePath → IO (Map Language (Map Trigram Rank)) loadData f = (\(Right x) → x) . decode <$> BS.readFile f -- | Load the default cerealized file. loadData' ∷ IO (Map Language (Map Trigram Rank)) loadData' = loadData =<< getDataFileName "lang.dat" -- | Guess the language of a string. guess ∷ Map Language (Map Trigram Rank) → String → [(Language, Double)] guess langData = sortBy (flip compare `on` swap) . M.toList . f . rank . parse where f x = M.map (flip distance x) langData -- | Calculate distance between ranked trigram sets. -- Cavnar & Trenkle (1994) distance ∷ Map Trigram Rank → Map Trigram Rank → Double distance x y = norm $ M.foldrWithKey f 0 y where f k n m = m + fromMaybe threshold (abs . (n-) <$> M.lookup k x) norm z = 1 - fromIntegral z / fromIntegral (M.size y) / fromIntegral (M.size x) -- | Convert a set of trigram frequencies to ranks. -- Maximum of 'threshold', uses alphabetical sort to break ties. rank ∷ Map Trigram Frequency → M.Map Trigram Rank rank = M.fromList . flip zip [1..] . map snd . take threshold . sortBy c . map swap . M.toList where (r, k) `c` (r', k') = if r == r' then compare k k' else compare r' r -- | Make a trigram frequency map out of a string. parse ∷ String → Map (Char, Char, Char) Frequency parse x = go M.empty $ clean (' ':x) where go m (x:y:z:xs) = go (M.alter f (x,y,z) m) (y:z:xs) go m _ = m f Nothing = Just 1 f (Just a) = Just (a+1) -- | Clean a string, removing punctiation, lowering cases, and collapsing -- adjacent spaces. clean ∷ String → String clean (x:y:xs) | isWhite x && isWhite y = clean (' ':xs) | isPunctuation y || isNumber y = clean (x:' ':xs) | isUpper y = clean (x:toLower y:xs) | otherwise = x:clean (y:xs) where isWhite x = isSpace x || isSeparator x clean (' ':[]) = " " clean (x:[]) = x:" " clean _ = ""