src/HasGP/Parsers/SvmLight.hs

{- | Parser implemented using the Parsec library for reading from files in the 
     format used by SVMLight.

     Currently assumes your file is a text file in Unix format. The extra 
     characters in Windows text files confuse it.
     
     Copyright (C) 2011 Sean Holden. sbh11\@cl.cam.ac.uk.
-}
{- This file is part of HasGP.

   HasGP is free software: you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation, either version 3 of the License, or
   (at your option) any later version.

   HasGP is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with HasGP.  If not, see <http://www.gnu.org/licenses/>.
-}
module HasGP.Parsers.SvmLight 
   (
     analyse,
     getMatrixExamplesFromFileC
   ) where

import Text.ParserCombinators.Parsec
import qualified IO
import Data.List

import qualified Numeric.LinearAlgebra as M

type FullTarget = (Int,Double,Bool)
type FullFeature = ((Int,String),Double)
type FullExample = (FullTarget,[FullFeature],String)

-- | Classes are denoted by plus or minus 1 or 0
oneZero :: Parser Int
oneZero = do
  n <- oneOf "10"
  return (read [n])

-- | Targets either correspond to classes or to doubles for regression 
--   problems. 
classTarget :: Parser Int
classTarget = 
    do 
      n <- oneZero
      return n 
    <|> do 
      char '+'
      n <- oneZero
      return n
    <|> do 
      char '-' 
      n <- oneZero
      return (-n)

-- | Feature values are doubles. 
positiveDouble :: Parser Double
positiveDouble = 
    do 
      char '.'
      s <- many1 digit
      return (read("0." ++ s))
    <|> try ( do 
              s1 <- many1 digit
              char '.'
              s2 <- many digit
              return (read(s1 ++ "." ++ if (s2=="") then "0" else s2))
            )
            <|> do 
                s1 <- many1 digit
                return (read s1)
    
-- | We may or may not want to automatically convert an integer,
--   depending on the circumstances. In particular targets for 
--   regression problems can be doubles, so we dont want to read a 
--   class as a double.
positiveDoubleNotInt :: Parser Double
positiveDoubleNotInt = 
    do 
      char '.'
      s <- many1 digit
      return (read("0." ++ s))      
    <|> do 
      s1 <- many1 digit
      char '.'
      s2 <- many digit
      return (read(s1 ++ "." ++ if (s2=="") then "0" else s2))

signedDouble :: Parser Double
signedDouble = 
    do 
      n <- positiveDouble
      return n 
    <|> do 
      char '+'
      n <- positiveDouble
      return n
    <|> do 
      char '-' 
      n <- positiveDouble
      return (-n)

signedDoubleNotInt :: Parser Double
signedDoubleNotInt = 
    do 
      n <- positiveDoubleNotInt
      return n 
    <|> do 
      char '+'
      n <- positiveDoubleNotInt
      return n
    <|> do 
      char '-' 
      n <- positiveDoubleNotInt
      return (-n)

-- | A target is -1, +1, 0 or a floating point number. The
--   Boolean is True for one of the first three and false
--   for the fourth. A look at Joachims' examples suggests
--   the + is optional.
target :: Parser FullTarget              
target = try ( 
              do                                
                n <- signedDoubleNotInt
                return (0, n, False)        
             )
         <|> do 
           n <- classTarget
           return (n, 0.0, True)

-- | We need integers to number the features.
integer :: Parser Int
integer = 
    do 
      n <- many1 digit
      return (read n)
    <|> do 
      char '+'
      n <- many1 digit
      return (read n)
    <|> do 
      char '-'
      n <-many1 digit
      return (read ('-':n))

-- | A feature in this format can in fact be either numbered or 
--   labelled as "qid".
feature :: Parser (Int, String)
feature = 
    do 
      s <- string "qid"
      return (0,"qid")
    <|> do 
      i <- integer
      return (i,"")

-- | Most of a line is taken up with : separated features and their values.
featureValuePair :: Parser FullFeature
featureValuePair = 
    do 
      f <- feature
      char ':'
      v <- signedDouble
      return (f,v)

-- | This is needed so that we can read pretty much anything we like from 
--   comments.
generalLetter :: Parser Char
generalLetter = alphaNum 
                 <|> oneOf ('"':'\\':"[]{};:'@#~/?.>,<|`¬!£$%^&*()-_=+")

-- | The format describes comments in two ways. At the start of the file they 
--   are ignored, but after a line they need to be read.
stringToLineEnd :: Parser [String]
stringToLineEnd = 
    do 
      words <- sepEndBy (many1 generalLetter) (many (char ' '))
      newline
      return words

-- | At the end of a line, a # followed by text needs to be read.          
--   I assume you only take the string up to the end of the line.            
info :: Parser String
info = 
    do 
      char '#'
      spaces
      s <- stringToLineEnd
      return (drop 1 (foldl (++) [] (map (' ':) s)))
    <|> do 
      newline
      return ""

-- | This reads a single line denoting a single example.
line :: Parser FullExample
line = 
    do  
      many (char ' ')
      t <- target
      many1 (char ' ')
      fvp <- sepEndBy (featureValuePair) (many (char ' '))
      s <- info
      return (t, fvp, s)

-- | This reads a file, ignoring comments at the beginning.
file:: Parser [FullExample]
file = 
    do 
      many info
      l <- many1 line
      spaces
      eof
      return l

-- | A bunch of basic functions for extracting interesting things from
--   the output of the parser in a format
--   that's likely to be a bit easier to use.
split1 :: (a,b,c) -> a
split1 (a,_,_) = a

split2 :: (a,b,c) -> b
split2 (_,b,_) = b

split3 :: (a,b,c) -> c
split3 (_,_,c) = c

fullExamplesSeparate :: [FullExample] 
                       -> ([FullTarget],[[FullFeature]],[String])
fullExamplesSeparate l = unzip3 l

classificationProblem :: [FullTarget] -> Bool
classificationProblem l = 
    all (\x -> (split3 x) && (((split1 x)==1) || ((split1 x)==(-1)))) l

getClassificationTargets :: [FullTarget] -> [Int]
getClassificationTargets [] = []
getClassificationTargets ((c,_,_):rest) 
    = c:(getClassificationTargets rest)

regressionProblem :: [FullTarget] -> Bool
regressionProblem l = all (\x -> (not $ split3 x)) l

getRegressionTargets :: [FullTarget] -> [Double]
getRegressionTargets [] = []
getRegressionTargets ((_,r,_):rest) 
    = r:(getRegressionTargets rest)

noQid :: [[FullFeature]] -> Bool
noQid l = all no l
    where no l2 = all (\x -> ((snd $ fst x)/="quid")) l2

getExampleRange:: [FullExample] -> (Int,Int)
getExampleRange [] = (0,0)
getExampleRange x 
    = ((minimum result), (maximum result))
      where y = split2 $ fullExamplesSeparate x -- just the attribute vectors
            z = map (map (fst . fst)) y -- attribute numbers from vectors
            result = foldl (++) [] z

-- | Now sort numbers and attributes at the same time 
--   so that the numbers are ascending.
comp :: (Int,Double) -> (Int,Double) -> Ordering
comp (a,b) (c,d) 
    | (a > c) = LT
    | (a < c) = GT
    | otherwise = error "Numbers should not be equal."

sortExamples :: [Int] -> [Double] -> [(Int, Double)]  
sortExamples numbers values = sortBy comp (zip numbers values) 
      
insertZeros :: (Int, Int) -> ([Int], [Double]) -> [Double]
insertZeros (min_num, max_num) (i, d) 
    = iz (unzip (sortExamples i d)) [] min_num max_num
      where 
        iz ([], []) r m n           = if (n == (m-1))
                                      then r
                                      else iz ([], []) (0:r) m (n-1)
        iz ([], _) _ _ _            
            = error "insertZeros: arguments need to have equal lengths"
        iz (_, []) _ _ _            
            = error "insertZeros: arguments need to have equal lengths"
        iz ((h1:t1), (h2:t2)) r m n = if (h1 == n) 
                                      then iz (t1, t2) (h2:r) m (n-1)
                                      else iz (h1:t1, h2:t2) (0:r) m (n-1)

-- | Get the attribute vectors as a list of lists.
--   Care required here as we need to insert 0 where there is no attribute.
getExamples :: [FullExample] -> [[Double]]
getExamples [] = []
getExamples x = map (insertZeros (getExampleRange x)) (zip numbers values) 
    where 
      y = split2 $ 
          fullExamplesSeparate x        -- attribute vectors
      numbers = map (map (fst . fst)) y -- attribute numbers from vectors
      values = map (map snd) y          -- attribute values from the vectors

-- | Does a matrix of Doubles make sense: that is, are all the rows the 
--   same length?
dimensionsCorrect :: [[Double]] -> Bool
dimensionsCorrect [] = True
dimensionsCorrect [r] = True
dimensionsCorrect (h:t) = all (==(length h)) (map length t)

-- | Find the dimensions of a matrix represented as a list of lists of Doubles.
dimensions :: [[a]] -> (Int, Int)
dimensions [] = (0,0)
dimensions m@(h:t) = (length m, length h)
          
-- | Parse a file in SvmLight format and print some information about it.   
analyse :: String -> IO ()
analyse file_name = 
    do
      x <- parseFromFile file file_name
      case x of
        Left error -> IO.putStrLn $ show error
        Right result -> do
               IO.putStrLn ("-------------------------------------------")
               IO.putStrLn ("Analysing SVMLight input file: " ++ file_name)
               IO.putStrLn ("-------------------------------------------")
               IO.putStrLn ("Classification problem: " 
                            ++ (if (classificationProblem r1) 
                                then "Yes" 
                                else "No"))
               IO.putStrLn ("Regression Problem:     " 
                            ++ (if (regressionProblem r1) 
                                then "Yes" 
                                else "No"))
               IO.putStrLn ("First five classification target values: " 
                            ++ (show $ 
                                take 5 (getClassificationTargets $ r1)))
               IO.putStrLn ("First five regression target values:     " 
                            ++ (show $ take 5 (getRegressionTargets $ r1)))
               IO.putStrLn ("-------------------------------------------")
               IO.putStrLn ("Maximum attribute number: " ++ (show $ snd $ r2))
               IO.putStrLn ("Minimum attribute number: " ++ (show $ fst $ r2))
               IO.putStrLn ("-------------------------------------------")
               IO.putStrLn ("Maximum attribute value:  " 
                            ++ (show $ maximum $ map maximum r3))
               IO.putStrLn ("Minimum attribute value:  " 
                            ++ (show $ minimum $ map minimum r3))
               IO.putStrLn ("qid present:              " 
                            ++ (if (noQid $ r4) 
                                then "No" 
                                else "Yes"))
               IO.putStrLn ("Matrix correctly sized:   " 
                            ++ (if (dimensionsCorrect r3) 
                                then "Yes" 
                                else "No"))
               IO.putStrLn ("Number of examples:       " 
                            ++ (show $ fst $ dimensions r3))
               IO.putStrLn ("Number of attributes:     " 
                            ++ (show $ snd $ dimensions r3))
               IO.putStrLn ("Beginning of first five examples: ") 
               IO.putStrLn (show $ map (take 5) (take 5 r3))
               IO.putStrLn ("------------------------------------------")
                   where 
                     r = fullExamplesSeparate result
                     r1 = split1 r
                     r2 = getExampleRange result
                     r3 = getExamples result
                     r4 = split2 r

getExamplesFromFile :: String -> IO (Either ParseError [FullExample])
getExamplesFromFile string = 
    do 
      x <- parseFromFile file string
      return x

fullExampleToMatrixC::[FullExample] -> (M.Matrix Double, M.Vector Double)
fullExampleToMatrixC fullExamples 
    | classificationProblem targets = 
        if (dimensionsCorrect attributes)
        then (M.fromRows attributes', 
              M.fromList $ map convert intTargets)
        else error "Dimension not correct for making a matrix"
    | otherwise = error "This is not a classification problem."
    where 
      targets = split1 $ fullExamplesSeparate fullExamples
      attributes = getExamples fullExamples
      attributes' = map M.fromList attributes
      intTargets = getClassificationTargets targets
      convert x 
          | x==1 = 1.0
          | x==(-1) = (-1.0)
          | otherwise = error "Unexpected class label."

-- | Read examples from a file in SvmLight format and produce a corresponding 
--   matrix and vector, for a classification problem. Includes checks 
--   that all examples have the same number of attributes, and that the file 
--   does in fact correspond to a classification problem.
getMatrixExamplesFromFileC::String -> IO (M.Matrix Double, M.Vector Double)
getMatrixExamplesFromFileC fileName = 
    do
      contents <- getExamplesFromFile fileName
      case contents of
        Left error' -> error $ show error'
        Right result -> return $ fullExampleToMatrixC result