{- | Parser implemented using the Parsec library for reading from files in the format used by SVMLight. Currently assumes your file is a text file in Unix format. The extra characters in Windows text files confuse it. Copyright (C) 2011 Sean Holden. sbh11\@cl.cam.ac.uk. -} {- This file is part of HasGP. HasGP is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. HasGP is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with HasGP. If not, see . -} module HasGP.Parsers.SvmLight ( analyse, getMatrixExamplesFromFileC ) where import Text.ParserCombinators.Parsec import qualified IO import Data.List import qualified Numeric.LinearAlgebra as M type FullTarget = (Int,Double,Bool) type FullFeature = ((Int,String),Double) type FullExample = (FullTarget,[FullFeature],String) -- | Classes are denoted by plus or minus 1 or 0 oneZero :: Parser Int oneZero = do n <- oneOf "10" return (read [n]) -- | Targets either correspond to classes or to doubles for regression -- problems. classTarget :: Parser Int classTarget = do n <- oneZero return n <|> do char '+' n <- oneZero return n <|> do char '-' n <- oneZero return (-n) -- | Feature values are doubles. positiveDouble :: Parser Double positiveDouble = do char '.' s <- many1 digit return (read("0." ++ s)) <|> try ( do s1 <- many1 digit char '.' s2 <- many digit return (read(s1 ++ "." ++ if (s2=="") then "0" else s2)) ) <|> do s1 <- many1 digit return (read s1) -- | We may or may not want to automatically convert an integer, -- depending on the circumstances. In particular targets for -- regression problems can be doubles, so we dont want to read a -- class as a double. positiveDoubleNotInt :: Parser Double positiveDoubleNotInt = do char '.' s <- many1 digit return (read("0." ++ s)) <|> do s1 <- many1 digit char '.' s2 <- many digit return (read(s1 ++ "." ++ if (s2=="") then "0" else s2)) signedDouble :: Parser Double signedDouble = do n <- positiveDouble return n <|> do char '+' n <- positiveDouble return n <|> do char '-' n <- positiveDouble return (-n) signedDoubleNotInt :: Parser Double signedDoubleNotInt = do n <- positiveDoubleNotInt return n <|> do char '+' n <- positiveDoubleNotInt return n <|> do char '-' n <- positiveDoubleNotInt return (-n) -- | A target is -1, +1, 0 or a floating point number. The -- Boolean is True for one of the first three and false -- for the fourth. A look at Joachims' examples suggests -- the + is optional. target :: Parser FullTarget target = try ( do n <- signedDoubleNotInt return (0, n, False) ) <|> do n <- classTarget return (n, 0.0, True) -- | We need integers to number the features. integer :: Parser Int integer = do n <- many1 digit return (read n) <|> do char '+' n <- many1 digit return (read n) <|> do char '-' n <-many1 digit return (read ('-':n)) -- | A feature in this format can in fact be either numbered or -- labelled as "qid". feature :: Parser (Int, String) feature = do s <- string "qid" return (0,"qid") <|> do i <- integer return (i,"") -- | Most of a line is taken up with : separated features and their values. featureValuePair :: Parser FullFeature featureValuePair = do f <- feature char ':' v <- signedDouble return (f,v) -- | This is needed so that we can read pretty much anything we like from -- comments. generalLetter :: Parser Char generalLetter = alphaNum <|> oneOf ('"':'\\':"[]{};:'@#~/?.>,<|`¬!£$%^&*()-_=+") -- | The format describes comments in two ways. At the start of the file they -- are ignored, but after a line they need to be read. stringToLineEnd :: Parser [String] stringToLineEnd = do words <- sepEndBy (many1 generalLetter) (many (char ' ')) newline return words -- | At the end of a line, a # followed by text needs to be read. -- I assume you only take the string up to the end of the line. info :: Parser String info = do char '#' spaces s <- stringToLineEnd return (drop 1 (foldl (++) [] (map (' ':) s))) <|> do newline return "" -- | This reads a single line denoting a single example. line :: Parser FullExample line = do many (char ' ') t <- target many1 (char ' ') fvp <- sepEndBy (featureValuePair) (many (char ' ')) s <- info return (t, fvp, s) -- | This reads a file, ignoring comments at the beginning. file:: Parser [FullExample] file = do many info l <- many1 line spaces eof return l -- | A bunch of basic functions for extracting interesting things from -- the output of the parser in a format -- that's likely to be a bit easier to use. split1 :: (a,b,c) -> a split1 (a,_,_) = a split2 :: (a,b,c) -> b split2 (_,b,_) = b split3 :: (a,b,c) -> c split3 (_,_,c) = c fullExamplesSeparate :: [FullExample] -> ([FullTarget],[[FullFeature]],[String]) fullExamplesSeparate l = unzip3 l classificationProblem :: [FullTarget] -> Bool classificationProblem l = all (\x -> (split3 x) && (((split1 x)==1) || ((split1 x)==(-1)))) l getClassificationTargets :: [FullTarget] -> [Int] getClassificationTargets [] = [] getClassificationTargets ((c,_,_):rest) = c:(getClassificationTargets rest) regressionProblem :: [FullTarget] -> Bool regressionProblem l = all (\x -> (not $ split3 x)) l getRegressionTargets :: [FullTarget] -> [Double] getRegressionTargets [] = [] getRegressionTargets ((_,r,_):rest) = r:(getRegressionTargets rest) noQid :: [[FullFeature]] -> Bool noQid l = all no l where no l2 = all (\x -> ((snd $ fst x)/="quid")) l2 getExampleRange:: [FullExample] -> (Int,Int) getExampleRange [] = (0,0) getExampleRange x = ((minimum result), (maximum result)) where y = split2 $ fullExamplesSeparate x -- just the attribute vectors z = map (map (fst . fst)) y -- attribute numbers from vectors result = foldl (++) [] z -- | Now sort numbers and attributes at the same time -- so that the numbers are ascending. comp :: (Int,Double) -> (Int,Double) -> Ordering comp (a,b) (c,d) | (a > c) = LT | (a < c) = GT | otherwise = error "Numbers should not be equal." sortExamples :: [Int] -> [Double] -> [(Int, Double)] sortExamples numbers values = sortBy comp (zip numbers values) insertZeros :: (Int, Int) -> ([Int], [Double]) -> [Double] insertZeros (min_num, max_num) (i, d) = iz (unzip (sortExamples i d)) [] min_num max_num where iz ([], []) r m n = if (n == (m-1)) then r else iz ([], []) (0:r) m (n-1) iz ([], _) _ _ _ = error "insertZeros: arguments need to have equal lengths" iz (_, []) _ _ _ = error "insertZeros: arguments need to have equal lengths" iz ((h1:t1), (h2:t2)) r m n = if (h1 == n) then iz (t1, t2) (h2:r) m (n-1) else iz (h1:t1, h2:t2) (0:r) m (n-1) -- | Get the attribute vectors as a list of lists. -- Care required here as we need to insert 0 where there is no attribute. getExamples :: [FullExample] -> [[Double]] getExamples [] = [] getExamples x = map (insertZeros (getExampleRange x)) (zip numbers values) where y = split2 $ fullExamplesSeparate x -- attribute vectors numbers = map (map (fst . fst)) y -- attribute numbers from vectors values = map (map snd) y -- attribute values from the vectors -- | Does a matrix of Doubles make sense: that is, are all the rows the -- same length? dimensionsCorrect :: [[Double]] -> Bool dimensionsCorrect [] = True dimensionsCorrect [r] = True dimensionsCorrect (h:t) = all (==(length h)) (map length t) -- | Find the dimensions of a matrix represented as a list of lists of Doubles. dimensions :: [[a]] -> (Int, Int) dimensions [] = (0,0) dimensions m@(h:t) = (length m, length h) -- | Parse a file in SvmLight format and print some information about it. analyse :: String -> IO () analyse file_name = do x <- parseFromFile file file_name case x of Left error -> IO.putStrLn $ show error Right result -> do IO.putStrLn ("-------------------------------------------") IO.putStrLn ("Analysing SVMLight input file: " ++ file_name) IO.putStrLn ("-------------------------------------------") IO.putStrLn ("Classification problem: " ++ (if (classificationProblem r1) then "Yes" else "No")) IO.putStrLn ("Regression Problem: " ++ (if (regressionProblem r1) then "Yes" else "No")) IO.putStrLn ("First five classification target values: " ++ (show $ take 5 (getClassificationTargets $ r1))) IO.putStrLn ("First five regression target values: " ++ (show $ take 5 (getRegressionTargets $ r1))) IO.putStrLn ("-------------------------------------------") IO.putStrLn ("Maximum attribute number: " ++ (show $ snd $ r2)) IO.putStrLn ("Minimum attribute number: " ++ (show $ fst $ r2)) IO.putStrLn ("-------------------------------------------") IO.putStrLn ("Maximum attribute value: " ++ (show $ maximum $ map maximum r3)) IO.putStrLn ("Minimum attribute value: " ++ (show $ minimum $ map minimum r3)) IO.putStrLn ("qid present: " ++ (if (noQid $ r4) then "No" else "Yes")) IO.putStrLn ("Matrix correctly sized: " ++ (if (dimensionsCorrect r3) then "Yes" else "No")) IO.putStrLn ("Number of examples: " ++ (show $ fst $ dimensions r3)) IO.putStrLn ("Number of attributes: " ++ (show $ snd $ dimensions r3)) IO.putStrLn ("Beginning of first five examples: ") IO.putStrLn (show $ map (take 5) (take 5 r3)) IO.putStrLn ("------------------------------------------") where r = fullExamplesSeparate result r1 = split1 r r2 = getExampleRange result r3 = getExamples result r4 = split2 r getExamplesFromFile :: String -> IO (Either ParseError [FullExample]) getExamplesFromFile string = do x <- parseFromFile file string return x fullExampleToMatrixC::[FullExample] -> (M.Matrix Double, M.Vector Double) fullExampleToMatrixC fullExamples | classificationProblem targets = if (dimensionsCorrect attributes) then (M.fromRows attributes', M.fromList $ map convert intTargets) else error "Dimension not correct for making a matrix" | otherwise = error "This is not a classification problem." where targets = split1 $ fullExamplesSeparate fullExamples attributes = getExamples fullExamples attributes' = map M.fromList attributes intTargets = getClassificationTargets targets convert x | x==1 = 1.0 | x==(-1) = (-1.0) | otherwise = error "Unexpected class label." -- | Read examples from a file in SvmLight format and produce a corresponding -- matrix and vector, for a classification problem. Includes checks -- that all examples have the same number of attributes, and that the file -- does in fact correspond to a classification problem. getMatrixExamplesFromFileC::String -> IO (M.Matrix Double, M.Vector Double) getMatrixExamplesFromFileC fileName = do contents <- getExamplesFromFile fileName case contents of Left error' -> error $ show error' Right result -> return $ fullExampleToMatrixC result