module Language.Haskell.HsColour.Classify
( TokenType(..)
, tokenise
) where
import Char
import List
-- Lex Haskell source code into an annotated token stream, without
-- discarding any characters or layout.
tokenise :: String -> [(TokenType,String)]
tokenise = map (\s-> (classify s,s)) . glue . chunk
-- Basic Haskell lexing, except we keep whitespace.
chunk :: String -> [String]
chunk [] = []
chunk ('\r':s) = chunk s -- get rid of DOS newline stuff
chunk ('\n':s) = "\n": chunk s
chunk (c:s) | isLinearSpace c
= (c:ss): chunk rest where (ss,rest) = span isLinearSpace s
chunk ('{':'-':s) = let (com,s') = nestcomment 0 s
in ('{':'-':com) : chunk s'
chunk s = case Prelude.lex s of
[] -> [head s]: chunk (tail s) -- e.g. inside comment
((tok@('-':'-':_),rest):_)
| all (=='-') tok -> (tok++com): chunk s'
where (com,s') = eolcomment rest
((tok,rest):_) -> tok: chunk rest
isLinearSpace c = c `elem` " \t" -- " \t\xa0"
-- Glue sequences of tokens into more useful blobs
--glue (q:".":n:rest) | Char.isUpper (head q) -- qualified names
-- = glue ((q++"."++n): rest)
glue ("`":rest) = -- `varid` -> varop
case glue rest of
(qn:"`":rest) -> ("`"++qn++"`"): glue rest
_ -> ("`": rest)
glue (s:ss) | all (=='-') s && length s >=2 -- eol comment
= (s++concat c): glue rest
where (c,rest) = break ('\n'`elem`) ss
--glue ("{":"-":ss) = ("{-"++c): glue rest -- nested comment
-- where (c,rest) = nestcomment 0 ss
glue (s:ss) = s: glue ss
glue [] = []
-- Deal with comments.
nestcomment :: Int -> String -> (String,String)
nestcomment n ('{':'-':ss) | n>=0 = (("{-"++cs),rm)
where (cs,rm) = nestcomment (n+1) ss
nestcomment n ('-':'}':ss) | n>0 = (("-}"++cs),rm)
where (cs,rm) = nestcomment (n-1) ss
nestcomment n ('-':'}':ss) | n==0 = ("-}",ss)
nestcomment n (s:ss) | n>=0 = ((s:cs),rm)
where (cs,rm) = nestcomment n ss
nestcomment n [] = ([],[])
eolcomment :: String -> (String,String)
eolcomment s@('\n':_) = ([], s)
eolcomment (c:s) = (c:cs, s') where (cs,s') = eolcomment s
eolcomment [] = ([],[])
-- Classify tokens
data TokenType =
Space | Keyword | Keyglyph | Layout | Comment | Conid | Varid |
Conop | Varop | String | Char | Number | Error
deriving (Eq,Show)
classify :: String -> TokenType
classify s@(h:_)
| isSpace h = Space
| all (=='-') s = Comment
| "--" `isPrefixOf` s
&& any isSpace s = Comment -- not fully correct
| "{-" `isPrefixOf` s = Comment
| s `elem` keywords = Keyword
| s `elem` keyglyphs = Keyglyph
| s `elem` layoutchars = Layout
| isUpper h = Conid
| isLower h = Varid
| h `elem` symbols = Varop
| h==':' = Conop
| h=='`' = Varop
| h=='"' = String
| h=='\'' = Char
| isDigit h = Number
| otherwise = Error
-- Haskell keywords
keywords =
["case","class","data","default","deriving","do","else"
,"if","import","in","infix","infixl","infixr","instance","let","module"
,"newtype","of","then","type","where","_","foreign","ccall","as"]
keyglyphs =
["..","::","=","\\","|","<-","->","@","~","=>","[","]"]
layoutchars =
map (:[]) ";{}(),"
symbols =
"!#$%&*+./<=>?@\\^|-~"