module HasGP.Parsers.SvmLight
(
analyse,
getMatrixExamplesFromFileC
) where
import Text.ParserCombinators.Parsec
import qualified IO
import Data.List
import qualified Numeric.LinearAlgebra as M
type FullTarget = (Int,Double,Bool)
type FullFeature = ((Int,String),Double)
type FullExample = (FullTarget,[FullFeature],String)
oneZero :: Parser Int
oneZero = do
n <- oneOf "10"
return (read [n])
classTarget :: Parser Int
classTarget =
do
n <- oneZero
return n
<|> do
char '+'
n <- oneZero
return n
<|> do
char '-'
n <- oneZero
return (n)
positiveDouble :: Parser Double
positiveDouble =
do
char '.'
s <- many1 digit
return (read("0." ++ s))
<|> try ( do
s1 <- many1 digit
char '.'
s2 <- many digit
return (read(s1 ++ "." ++ if (s2=="") then "0" else s2))
)
<|> do
s1 <- many1 digit
return (read s1)
positiveDoubleNotInt :: Parser Double
positiveDoubleNotInt =
do
char '.'
s <- many1 digit
return (read("0." ++ s))
<|> do
s1 <- many1 digit
char '.'
s2 <- many digit
return (read(s1 ++ "." ++ if (s2=="") then "0" else s2))
signedDouble :: Parser Double
signedDouble =
do
n <- positiveDouble
return n
<|> do
char '+'
n <- positiveDouble
return n
<|> do
char '-'
n <- positiveDouble
return (n)
signedDoubleNotInt :: Parser Double
signedDoubleNotInt =
do
n <- positiveDoubleNotInt
return n
<|> do
char '+'
n <- positiveDoubleNotInt
return n
<|> do
char '-'
n <- positiveDoubleNotInt
return (n)
target :: Parser FullTarget
target = try (
do
n <- signedDoubleNotInt
return (0, n, False)
)
<|> do
n <- classTarget
return (n, 0.0, True)
integer :: Parser Int
integer =
do
n <- many1 digit
return (read n)
<|> do
char '+'
n <- many1 digit
return (read n)
<|> do
char '-'
n <-many1 digit
return (read ('-':n))
feature :: Parser (Int, String)
feature =
do
s <- string "qid"
return (0,"qid")
<|> do
i <- integer
return (i,"")
featureValuePair :: Parser FullFeature
featureValuePair =
do
f <- feature
char ':'
v <- signedDouble
return (f,v)
generalLetter :: Parser Char
generalLetter = alphaNum
<|> oneOf ('"':'\\':"[]{};:'@#~/?.>,<|`¬!£$%^&*()-_=+")
stringToLineEnd :: Parser [String]
stringToLineEnd =
do
words <- sepEndBy (many1 generalLetter) (many (char ' '))
newline
return words
info :: Parser String
info =
do
char '#'
spaces
s <- stringToLineEnd
return (drop 1 (foldl (++) [] (map (' ':) s)))
<|> do
newline
return ""
line :: Parser FullExample
line =
do
many (char ' ')
t <- target
many1 (char ' ')
fvp <- sepEndBy (featureValuePair) (many (char ' '))
s <- info
return (t, fvp, s)
file:: Parser [FullExample]
file =
do
many info
l <- many1 line
spaces
eof
return l
split1 :: (a,b,c) -> a
split1 (a,_,_) = a
split2 :: (a,b,c) -> b
split2 (_,b,_) = b
split3 :: (a,b,c) -> c
split3 (_,_,c) = c
fullExamplesSeparate :: [FullExample]
-> ([FullTarget],[[FullFeature]],[String])
fullExamplesSeparate l = unzip3 l
classificationProblem :: [FullTarget] -> Bool
classificationProblem l =
all (\x -> (split3 x) && (((split1 x)==1) || ((split1 x)==(1)))) l
getClassificationTargets :: [FullTarget] -> [Int]
getClassificationTargets [] = []
getClassificationTargets ((c,_,_):rest)
= c:(getClassificationTargets rest)
regressionProblem :: [FullTarget] -> Bool
regressionProblem l = all (\x -> (not $ split3 x)) l
getRegressionTargets :: [FullTarget] -> [Double]
getRegressionTargets [] = []
getRegressionTargets ((_,r,_):rest)
= r:(getRegressionTargets rest)
noQid :: [[FullFeature]] -> Bool
noQid l = all no l
where no l2 = all (\x -> ((snd $ fst x)/="quid")) l2
getExampleRange:: [FullExample] -> (Int,Int)
getExampleRange [] = (0,0)
getExampleRange x
= ((minimum result), (maximum result))
where y = split2 $ fullExamplesSeparate x
z = map (map (fst . fst)) y
result = foldl (++) [] z
comp :: (Int,Double) -> (Int,Double) -> Ordering
comp (a,b) (c,d)
| (a > c) = LT
| (a < c) = GT
| otherwise = error "Numbers should not be equal."
sortExamples :: [Int] -> [Double] -> [(Int, Double)]
sortExamples numbers values = sortBy comp (zip numbers values)
insertZeros :: (Int, Int) -> ([Int], [Double]) -> [Double]
insertZeros (min_num, max_num) (i, d)
= iz (unzip (sortExamples i d)) [] min_num max_num
where
iz ([], []) r m n = if (n == (m1))
then r
else iz ([], []) (0:r) m (n1)
iz ([], _) _ _ _
= error "insertZeros: arguments need to have equal lengths"
iz (_, []) _ _ _
= error "insertZeros: arguments need to have equal lengths"
iz ((h1:t1), (h2:t2)) r m n = if (h1 == n)
then iz (t1, t2) (h2:r) m (n1)
else iz (h1:t1, h2:t2) (0:r) m (n1)
getExamples :: [FullExample] -> [[Double]]
getExamples [] = []
getExamples x = map (insertZeros (getExampleRange x)) (zip numbers values)
where
y = split2 $
fullExamplesSeparate x
numbers = map (map (fst . fst)) y
values = map (map snd) y
dimensionsCorrect :: [[Double]] -> Bool
dimensionsCorrect [] = True
dimensionsCorrect [r] = True
dimensionsCorrect (h:t) = all (==(length h)) (map length t)
dimensions :: [[a]] -> (Int, Int)
dimensions [] = (0,0)
dimensions m@(h:t) = (length m, length h)
analyse :: String -> IO ()
analyse file_name =
do
x <- parseFromFile file file_name
case x of
Left error -> IO.putStrLn $ show error
Right result -> do
IO.putStrLn ("-------------------------------------------")
IO.putStrLn ("Analysing SVMLight input file: " ++ file_name)
IO.putStrLn ("-------------------------------------------")
IO.putStrLn ("Classification problem: "
++ (if (classificationProblem r1)
then "Yes"
else "No"))
IO.putStrLn ("Regression Problem: "
++ (if (regressionProblem r1)
then "Yes"
else "No"))
IO.putStrLn ("First five classification target values: "
++ (show $
take 5 (getClassificationTargets $ r1)))
IO.putStrLn ("First five regression target values: "
++ (show $ take 5 (getRegressionTargets $ r1)))
IO.putStrLn ("-------------------------------------------")
IO.putStrLn ("Maximum attribute number: " ++ (show $ snd $ r2))
IO.putStrLn ("Minimum attribute number: " ++ (show $ fst $ r2))
IO.putStrLn ("-------------------------------------------")
IO.putStrLn ("Maximum attribute value: "
++ (show $ maximum $ map maximum r3))
IO.putStrLn ("Minimum attribute value: "
++ (show $ minimum $ map minimum r3))
IO.putStrLn ("qid present: "
++ (if (noQid $ r4)
then "No"
else "Yes"))
IO.putStrLn ("Matrix correctly sized: "
++ (if (dimensionsCorrect r3)
then "Yes"
else "No"))
IO.putStrLn ("Number of examples: "
++ (show $ fst $ dimensions r3))
IO.putStrLn ("Number of attributes: "
++ (show $ snd $ dimensions r3))
IO.putStrLn ("Beginning of first five examples: ")
IO.putStrLn (show $ map (take 5) (take 5 r3))
IO.putStrLn ("------------------------------------------")
where
r = fullExamplesSeparate result
r1 = split1 r
r2 = getExampleRange result
r3 = getExamples result
r4 = split2 r
getExamplesFromFile :: String -> IO (Either ParseError [FullExample])
getExamplesFromFile string =
do
x <- parseFromFile file string
return x
fullExampleToMatrixC::[FullExample] -> (M.Matrix Double, M.Vector Double)
fullExampleToMatrixC fullExamples
| classificationProblem targets =
if (dimensionsCorrect attributes)
then (M.fromRows attributes',
M.fromList $ map convert intTargets)
else error "Dimension not correct for making a matrix"
| otherwise = error "This is not a classification problem."
where
targets = split1 $ fullExamplesSeparate fullExamples
attributes = getExamples fullExamples
attributes' = map M.fromList attributes
intTargets = getClassificationTargets targets
convert x
| x==1 = 1.0
| x==(1) = (1.0)
| otherwise = error "Unexpected class label."
getMatrixExamplesFromFileC::String -> IO (M.Matrix Double, M.Vector Double)
getMatrixExamplesFromFileC fileName =
do
contents <- getExamplesFromFile fileName
case contents of
Left error' -> error $ show error'
Right result -> return $ fullExampleToMatrixC result