import HCL import Data.Char (isAlpha, isSpace) import Text.Regex (matchRegex, mkRegex) import Network.Browser (browse, setErrHandler, setOutHandler, request, defaultGETRequest) import Network.HTTP (urlEncode, rspBody) import Network.URI (parseURI) import Data.List (intersperse, unfoldr, isSuffixOf, isPrefixOf) import Numeric (readDec) import Data.Foldable (foldl') import Control.Monad (liftM) import Text.Printf (printf) main = execReq $ do reqIO $ putStrLn "Calculate n-grams using Google. Quit by hitting enter at any prompt." reqForever $ do phrase <- reqLift (words . map clean) $ prompt "Please enter a starting phrase: " reqResp alternatives <- reqLift phrasesAndWords $ prompt "Enter alternative words separated by spaces. Phrases can enclosed in quotes (e.g., \"...\"): " reqResp count <- reqUntil (return . (> 1)) $ prompt "What kind of n-gram would you like to analyze (> 1)? " reqInt ngs <- reqIO $ getProbabilities (ngrams count phrase alternatives) reqIO $ mapM_ (putStrLn . formatNGram) ngs reqIO $ mapM_ (\ng -> putStrLn ("Probability of phrase '" ++ mkPhrase ng ++ "': " ++ printf "%.6g" (totalProbability ng * 100) ++ "%")) (recoverPhrases (length alternatives) ngs) where clean c | isAlpha c = c | otherwise = ' ' -- Recover original phrase from ngram list. mkPhrase :: [NGram] -> String mkPhrase p = unwords $ map (head . ngPhrase) (init p) ++ (ngPhrase . last) p -- Converts a string to a list of words, but keeps words in quotes -- together. Nested quotes are not supported. phrasesAndWords :: String -> [String] phrasesAndWords = let startsWith w c = [c] `isPrefixOf` w endsWith w c = [c] `isSuffixOf` w buildPhrases [] = Nothing -- A word starts with a '"', now look for ending '"' buildPhrases (w:ws) | w `startsWith` '"' = let (p, rest) = break (`endsWith` '"') (w:ws) in case () of () | null p -> Just ((init . tail) w, ws) -- single word in quotes | null rest -> Just ((tail . unwords) p, rest) -- No ending quote -- Some amount of words in the phrase and some not | otherwise -> Just ((init . tail) (unwords (p ++ [head rest])), tail rest) | otherwise = Just (w, ws) in -- Use of words makes sure any valid quotes appear at beginning or -- end of a word. unfoldr buildPhrases . words -- ^ Stores an n-gram. That is, a structure that tells us -- the probability of the last word in ngPhrase occurrring, given -- the occurences of the previous words in the phrase and the -- occurrences of the entire phrase. data NGram = NGram { ngPhrase :: [String] -- The n-gram itself. The last word is , numeratorCnt :: Integer -- number of hits for phrase w\/o last word. , denomCnt :: Integer -- ^ number of hits for phrase w\/o last word. , probability :: Double -- ^ Probability that the last word in ngPhrase should be there, based on numeratorCnt and denomCnt. } deriving Show -- ^ Generalized ngrams. Takes a phrase and a list of possible -- ending words. Returns all n-grams for them. The behavior -- is undefined if the length requested is longer than -- the phrase given. ngrams :: Int -> [String] -> [String] -> [[String]] ngrams nsize phrase suffixes = let phraseGrams = gatherNGrams phrase altGrams = gatherAlts (drop (length phrase - (nsize - 1)) phrase) in phraseGrams ++ altGrams where -- Gathers the strings given into groups of the -- the length given. gatherNGrams :: [String] -> [[String]] gatherNGrams = unfoldr group where group p | null $ drop (max 0 (nsize - 1)) p = Nothing | otherwise = Just (take nsize p , drop 1 p) -- Takes a list of words and a list of alternate -- suffixes. Returns a list of lists, where -- each inner list is the original word list plus one -- of the alternate endings. gatherAlts :: [String] -> [[String]] gatherAlts prefix = zipWith addSuffix (repeat prefix) suffixes where addSuffix a b = a ++ [b] -- ^ Turn an n-gram into a nice string. formatNGram :: NGram -> String formatNGram (NGram ph n d p) = unwords ph ++ ": " ++ show n ++ " / " ++ show d ++ " = " ++ printf "%.6g" (p * 100) ++ "%" -- ^ Multiplies the probabilities of all n-grams together. Indicates the -- probability of the entire phrase. totalProbability :: [NGram] -> Double totalProbability ngs = foldl' (*) 1 (map probability ngs) -- ^ Recovers all possible phrases from the NGrams given. -- The first argument is the number of alternative endings that -- were given originally. recoverPhrases :: Int -> [NGram] -> [[NGram]] recoverPhrases numAlts ngs = let initial = take (ngLength - numAlts) ngs rest = drop (ngLength - numAlts) ngs ngLength = length ngs in map (\r -> initial ++ [r]) rest -- ^ Given a list of possible phrases (broken into n-sized chunks), -- calculate the ngram associated with each chunk. getProbabilities :: [[String]] -> IO [NGram] getProbabilities phrase = mapM getProbability phrase where getProbability phrase = do denomCount <- liftM (max 1) $ resultCount (init phrase) numCount <- liftM (max 1) $ resultCount phrase return $ NGram phrase numCount denomCount (fromIntegral numCount / fromIntegral denomCount) -- ^ Determine how many times the phrase given appears on the web, according -- to Google's estimate. resultCount :: [String] -> IO Integer resultCount terms = let makeQuery terms = "http://www.google.com/search?q=" ++ (urlEncode $ unwords (map ("+" ++) terms)) in case parseURI (makeQuery terms) of Just uri -> do browse $ do setErrHandler (const (return())) setOutHandler (const (return())) (_, result) <- request $ defaultGETRequest uri case matchRegex (mkRegex "of about ([[:digit:],]*)") (rspBody result) of Nothing -> return 0 Just strs -> return $ fst $ head $ readDec (filter (/=',') (head strs)) _ -> error $ "Could not make uri from the terms " ++ show terms