{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE DeriveDataTypeable #-}
{-# LANGUAGE BangPatterns #-}
module Commonmark.Tokens
( Tok(..)
, TokType(..)
, SourcePos
, tokenize
, untokenize
) where
import Unicode.Char (isAlphaNum)
import Unicode.Char.General.Compat (isSpace)
import Data.Text (Text)
import qualified Data.Text as T
import Data.Data (Data, Typeable)
import Text.Parsec.Pos
import Data.Text.Normalize (normalize, NormalizationMode(NFC))
data Tok = Tok { Tok -> TokType
tokType :: !TokType
, Tok -> SourcePos
tokPos :: !SourcePos
, Tok -> Text
tokContents :: {-# UNPACK #-} !Text
}
deriving (Int -> Tok -> ShowS
[Tok] -> ShowS
Tok -> String
forall a.
(Int -> a -> ShowS) -> (a -> String) -> ([a] -> ShowS) -> Show a
showList :: [Tok] -> ShowS
$cshowList :: [Tok] -> ShowS
show :: Tok -> String
$cshow :: Tok -> String
showsPrec :: Int -> Tok -> ShowS
$cshowsPrec :: Int -> Tok -> ShowS
Show, Tok -> Tok -> Bool
forall a. (a -> a -> Bool) -> (a -> a -> Bool) -> Eq a
/= :: Tok -> Tok -> Bool
$c/= :: Tok -> Tok -> Bool
== :: Tok -> Tok -> Bool
$c== :: Tok -> Tok -> Bool
Eq, Typeable Tok
Tok -> DataType
Tok -> Constr
(forall b. Data b => b -> b) -> Tok -> Tok
forall a.
Typeable a
-> (forall (c :: * -> *).
(forall d b. Data d => c (d -> b) -> d -> c b)
-> (forall g. g -> c g) -> a -> c a)
-> (forall (c :: * -> *).
(forall b r. Data b => c (b -> r) -> c r)
-> (forall r. r -> c r) -> Constr -> c a)
-> (a -> Constr)
-> (a -> DataType)
-> (forall (t :: * -> *) (c :: * -> *).
Typeable t =>
(forall d. Data d => c (t d)) -> Maybe (c a))
-> (forall (t :: * -> * -> *) (c :: * -> *).
Typeable t =>
(forall d e. (Data d, Data e) => c (t d e)) -> Maybe (c a))
-> ((forall b. Data b => b -> b) -> a -> a)
-> (forall r r'.
(r -> r' -> r) -> r -> (forall d. Data d => d -> r') -> a -> r)
-> (forall r r'.
(r' -> r -> r) -> r -> (forall d. Data d => d -> r') -> a -> r)
-> (forall u. (forall d. Data d => d -> u) -> a -> [u])
-> (forall u. Int -> (forall d. Data d => d -> u) -> a -> u)
-> (forall (m :: * -> *).
Monad m =>
(forall d. Data d => d -> m d) -> a -> m a)
-> (forall (m :: * -> *).
MonadPlus m =>
(forall d. Data d => d -> m d) -> a -> m a)
-> (forall (m :: * -> *).
MonadPlus m =>
(forall d. Data d => d -> m d) -> a -> m a)
-> Data a
forall u. Int -> (forall d. Data d => d -> u) -> Tok -> u
forall u. (forall d. Data d => d -> u) -> Tok -> [u]
forall r r'.
(r -> r' -> r) -> r -> (forall d. Data d => d -> r') -> Tok -> r
forall r r'.
(r' -> r -> r) -> r -> (forall d. Data d => d -> r') -> Tok -> r
forall (m :: * -> *).
Monad m =>
(forall d. Data d => d -> m d) -> Tok -> m Tok
forall (m :: * -> *).
MonadPlus m =>
(forall d. Data d => d -> m d) -> Tok -> m Tok
forall (c :: * -> *).
(forall b r. Data b => c (b -> r) -> c r)
-> (forall r. r -> c r) -> Constr -> c Tok
forall (c :: * -> *).
(forall d b. Data d => c (d -> b) -> d -> c b)
-> (forall g. g -> c g) -> Tok -> c Tok
forall (t :: * -> *) (c :: * -> *).
Typeable t =>
(forall d. Data d => c (t d)) -> Maybe (c Tok)
forall (t :: * -> * -> *) (c :: * -> *).
Typeable t =>
(forall d e. (Data d, Data e) => c (t d e)) -> Maybe (c Tok)
gmapMo :: forall (m :: * -> *).
MonadPlus m =>
(forall d. Data d => d -> m d) -> Tok -> m Tok
$cgmapMo :: forall (m :: * -> *).
MonadPlus m =>
(forall d. Data d => d -> m d) -> Tok -> m Tok
gmapMp :: forall (m :: * -> *).
MonadPlus m =>
(forall d. Data d => d -> m d) -> Tok -> m Tok
$cgmapMp :: forall (m :: * -> *).
MonadPlus m =>
(forall d. Data d => d -> m d) -> Tok -> m Tok
gmapM :: forall (m :: * -> *).
Monad m =>
(forall d. Data d => d -> m d) -> Tok -> m Tok
$cgmapM :: forall (m :: * -> *).
Monad m =>
(forall d. Data d => d -> m d) -> Tok -> m Tok
gmapQi :: forall u. Int -> (forall d. Data d => d -> u) -> Tok -> u
$cgmapQi :: forall u. Int -> (forall d. Data d => d -> u) -> Tok -> u
gmapQ :: forall u. (forall d. Data d => d -> u) -> Tok -> [u]
$cgmapQ :: forall u. (forall d. Data d => d -> u) -> Tok -> [u]
gmapQr :: forall r r'.
(r' -> r -> r) -> r -> (forall d. Data d => d -> r') -> Tok -> r
$cgmapQr :: forall r r'.
(r' -> r -> r) -> r -> (forall d. Data d => d -> r') -> Tok -> r
gmapQl :: forall r r'.
(r -> r' -> r) -> r -> (forall d. Data d => d -> r') -> Tok -> r
$cgmapQl :: forall r r'.
(r -> r' -> r) -> r -> (forall d. Data d => d -> r') -> Tok -> r
gmapT :: (forall b. Data b => b -> b) -> Tok -> Tok
$cgmapT :: (forall b. Data b => b -> b) -> Tok -> Tok
dataCast2 :: forall (t :: * -> * -> *) (c :: * -> *).
Typeable t =>
(forall d e. (Data d, Data e) => c (t d e)) -> Maybe (c Tok)
$cdataCast2 :: forall (t :: * -> * -> *) (c :: * -> *).
Typeable t =>
(forall d e. (Data d, Data e) => c (t d e)) -> Maybe (c Tok)
dataCast1 :: forall (t :: * -> *) (c :: * -> *).
Typeable t =>
(forall d. Data d => c (t d)) -> Maybe (c Tok)
$cdataCast1 :: forall (t :: * -> *) (c :: * -> *).
Typeable t =>
(forall d. Data d => c (t d)) -> Maybe (c Tok)
dataTypeOf :: Tok -> DataType
$cdataTypeOf :: Tok -> DataType
toConstr :: Tok -> Constr
$ctoConstr :: Tok -> Constr
gunfold :: forall (c :: * -> *).
(forall b r. Data b => c (b -> r) -> c r)
-> (forall r. r -> c r) -> Constr -> c Tok
$cgunfold :: forall (c :: * -> *).
(forall b r. Data b => c (b -> r) -> c r)
-> (forall r. r -> c r) -> Constr -> c Tok
gfoldl :: forall (c :: * -> *).
(forall d b. Data d => c (d -> b) -> d -> c b)
-> (forall g. g -> c g) -> Tok -> c Tok
$cgfoldl :: forall (c :: * -> *).
(forall d b. Data d => c (d -> b) -> d -> c b)
-> (forall g. g -> c g) -> Tok -> c Tok
Data, Typeable)
data TokType =
Spaces
| UnicodeSpace
| LineEnd
| WordChars
| Symbol {-# UNPACK #-} !Char
deriving (Int -> TokType -> ShowS
[TokType] -> ShowS
TokType -> String
forall a.
(Int -> a -> ShowS) -> (a -> String) -> ([a] -> ShowS) -> Show a
showList :: [TokType] -> ShowS
$cshowList :: [TokType] -> ShowS
show :: TokType -> String
$cshow :: TokType -> String
showsPrec :: Int -> TokType -> ShowS
$cshowsPrec :: Int -> TokType -> ShowS
Show, TokType -> TokType -> Bool
forall a. (a -> a -> Bool) -> (a -> a -> Bool) -> Eq a
/= :: TokType -> TokType -> Bool
$c/= :: TokType -> TokType -> Bool
== :: TokType -> TokType -> Bool
$c== :: TokType -> TokType -> Bool
Eq, Eq TokType
TokType -> TokType -> Bool
TokType -> TokType -> Ordering
TokType -> TokType -> TokType
forall a.
Eq a
-> (a -> a -> Ordering)
-> (a -> a -> Bool)
-> (a -> a -> Bool)
-> (a -> a -> Bool)
-> (a -> a -> Bool)
-> (a -> a -> a)
-> (a -> a -> a)
-> Ord a
min :: TokType -> TokType -> TokType
$cmin :: TokType -> TokType -> TokType
max :: TokType -> TokType -> TokType
$cmax :: TokType -> TokType -> TokType
>= :: TokType -> TokType -> Bool
$c>= :: TokType -> TokType -> Bool
> :: TokType -> TokType -> Bool
$c> :: TokType -> TokType -> Bool
<= :: TokType -> TokType -> Bool
$c<= :: TokType -> TokType -> Bool
< :: TokType -> TokType -> Bool
$c< :: TokType -> TokType -> Bool
compare :: TokType -> TokType -> Ordering
$ccompare :: TokType -> TokType -> Ordering
Ord, Typeable TokType
TokType -> DataType
TokType -> Constr
(forall b. Data b => b -> b) -> TokType -> TokType
forall a.
Typeable a
-> (forall (c :: * -> *).
(forall d b. Data d => c (d -> b) -> d -> c b)
-> (forall g. g -> c g) -> a -> c a)
-> (forall (c :: * -> *).
(forall b r. Data b => c (b -> r) -> c r)
-> (forall r. r -> c r) -> Constr -> c a)
-> (a -> Constr)
-> (a -> DataType)
-> (forall (t :: * -> *) (c :: * -> *).
Typeable t =>
(forall d. Data d => c (t d)) -> Maybe (c a))
-> (forall (t :: * -> * -> *) (c :: * -> *).
Typeable t =>
(forall d e. (Data d, Data e) => c (t d e)) -> Maybe (c a))
-> ((forall b. Data b => b -> b) -> a -> a)
-> (forall r r'.
(r -> r' -> r) -> r -> (forall d. Data d => d -> r') -> a -> r)
-> (forall r r'.
(r' -> r -> r) -> r -> (forall d. Data d => d -> r') -> a -> r)
-> (forall u. (forall d. Data d => d -> u) -> a -> [u])
-> (forall u. Int -> (forall d. Data d => d -> u) -> a -> u)
-> (forall (m :: * -> *).
Monad m =>
(forall d. Data d => d -> m d) -> a -> m a)
-> (forall (m :: * -> *).
MonadPlus m =>
(forall d. Data d => d -> m d) -> a -> m a)
-> (forall (m :: * -> *).
MonadPlus m =>
(forall d. Data d => d -> m d) -> a -> m a)
-> Data a
forall u. Int -> (forall d. Data d => d -> u) -> TokType -> u
forall u. (forall d. Data d => d -> u) -> TokType -> [u]
forall r r'.
(r -> r' -> r)
-> r -> (forall d. Data d => d -> r') -> TokType -> r
forall r r'.
(r' -> r -> r)
-> r -> (forall d. Data d => d -> r') -> TokType -> r
forall (m :: * -> *).
Monad m =>
(forall d. Data d => d -> m d) -> TokType -> m TokType
forall (m :: * -> *).
MonadPlus m =>
(forall d. Data d => d -> m d) -> TokType -> m TokType
forall (c :: * -> *).
(forall b r. Data b => c (b -> r) -> c r)
-> (forall r. r -> c r) -> Constr -> c TokType
forall (c :: * -> *).
(forall d b. Data d => c (d -> b) -> d -> c b)
-> (forall g. g -> c g) -> TokType -> c TokType
forall (t :: * -> *) (c :: * -> *).
Typeable t =>
(forall d. Data d => c (t d)) -> Maybe (c TokType)
forall (t :: * -> * -> *) (c :: * -> *).
Typeable t =>
(forall d e. (Data d, Data e) => c (t d e)) -> Maybe (c TokType)
gmapMo :: forall (m :: * -> *).
MonadPlus m =>
(forall d. Data d => d -> m d) -> TokType -> m TokType
$cgmapMo :: forall (m :: * -> *).
MonadPlus m =>
(forall d. Data d => d -> m d) -> TokType -> m TokType
gmapMp :: forall (m :: * -> *).
MonadPlus m =>
(forall d. Data d => d -> m d) -> TokType -> m TokType
$cgmapMp :: forall (m :: * -> *).
MonadPlus m =>
(forall d. Data d => d -> m d) -> TokType -> m TokType
gmapM :: forall (m :: * -> *).
Monad m =>
(forall d. Data d => d -> m d) -> TokType -> m TokType
$cgmapM :: forall (m :: * -> *).
Monad m =>
(forall d. Data d => d -> m d) -> TokType -> m TokType
gmapQi :: forall u. Int -> (forall d. Data d => d -> u) -> TokType -> u
$cgmapQi :: forall u. Int -> (forall d. Data d => d -> u) -> TokType -> u
gmapQ :: forall u. (forall d. Data d => d -> u) -> TokType -> [u]
$cgmapQ :: forall u. (forall d. Data d => d -> u) -> TokType -> [u]
gmapQr :: forall r r'.
(r' -> r -> r)
-> r -> (forall d. Data d => d -> r') -> TokType -> r
$cgmapQr :: forall r r'.
(r' -> r -> r)
-> r -> (forall d. Data d => d -> r') -> TokType -> r
gmapQl :: forall r r'.
(r -> r' -> r)
-> r -> (forall d. Data d => d -> r') -> TokType -> r
$cgmapQl :: forall r r'.
(r -> r' -> r)
-> r -> (forall d. Data d => d -> r') -> TokType -> r
gmapT :: (forall b. Data b => b -> b) -> TokType -> TokType
$cgmapT :: (forall b. Data b => b -> b) -> TokType -> TokType
dataCast2 :: forall (t :: * -> * -> *) (c :: * -> *).
Typeable t =>
(forall d e. (Data d, Data e) => c (t d e)) -> Maybe (c TokType)
$cdataCast2 :: forall (t :: * -> * -> *) (c :: * -> *).
Typeable t =>
(forall d e. (Data d, Data e) => c (t d e)) -> Maybe (c TokType)
dataCast1 :: forall (t :: * -> *) (c :: * -> *).
Typeable t =>
(forall d. Data d => c (t d)) -> Maybe (c TokType)
$cdataCast1 :: forall (t :: * -> *) (c :: * -> *).
Typeable t =>
(forall d. Data d => c (t d)) -> Maybe (c TokType)
dataTypeOf :: TokType -> DataType
$cdataTypeOf :: TokType -> DataType
toConstr :: TokType -> Constr
$ctoConstr :: TokType -> Constr
gunfold :: forall (c :: * -> *).
(forall b r. Data b => c (b -> r) -> c r)
-> (forall r. r -> c r) -> Constr -> c TokType
$cgunfold :: forall (c :: * -> *).
(forall b r. Data b => c (b -> r) -> c r)
-> (forall r. r -> c r) -> Constr -> c TokType
gfoldl :: forall (c :: * -> *).
(forall d b. Data d => c (d -> b) -> d -> c b)
-> (forall g. g -> c g) -> TokType -> c TokType
$cgfoldl :: forall (c :: * -> *).
(forall d b. Data d => c (d -> b) -> d -> c b)
-> (forall g. g -> c g) -> TokType -> c TokType
Data, Typeable)
tokenize :: String -> Text -> [Tok]
tokenize :: String -> Text -> [Tok]
tokenize String
name =
{-# SCC tokenize #-} SourcePos -> [Text] -> [Tok]
go (String -> SourcePos
initialPos String
name) forall b c a. (b -> c) -> (a -> b) -> a -> c
. (Char -> Char -> Bool) -> Text -> [Text]
T.groupBy Char -> Char -> Bool
f forall b c a. (b -> c) -> (a -> b) -> a -> c
. NormalizationMode -> Text -> Text
normalize NormalizationMode
NFC
where
f :: Char -> Char -> Bool
f Char
'\r' Char
'\n' = Bool
True
f Char
' ' Char
' ' = Bool
True
f Char
x Char
y = Char -> Bool
isAlphaNum Char
x Bool -> Bool -> Bool
&& Char -> Bool
isAlphaNum Char
y
go :: SourcePos -> [Text] -> [Tok]
go !SourcePos
_pos [] = []
go !SourcePos
pos (!Text
t:[Text]
ts) =
case Text -> Char
T.head Text
t of
Char
' ' -> TokType -> SourcePos -> Text -> Tok
Tok TokType
Spaces SourcePos
pos Text
t forall a. a -> [a] -> [a]
:
SourcePos -> [Text] -> [Tok]
go (SourcePos -> Int -> SourcePos
incSourceColumn SourcePos
pos (Text -> Int
T.length Text
t)) [Text]
ts
Char
'\t' -> TokType -> SourcePos -> Text -> Tok
Tok TokType
Spaces SourcePos
pos Text
t forall a. a -> [a] -> [a]
:
SourcePos -> [Text] -> [Tok]
go (SourcePos -> Int -> SourcePos
incSourceColumn SourcePos
pos
(Int
4 forall a. Num a => a -> a -> a
- (SourcePos -> Int
sourceColumn SourcePos
pos forall a. Num a => a -> a -> a
- Int
1) forall a. Integral a => a -> a -> a
`mod` Int
4)) [Text]
ts
Char
'\r' -> TokType -> SourcePos -> Text -> Tok
Tok TokType
LineEnd SourcePos
pos Text
t forall a. a -> [a] -> [a]
:
SourcePos -> [Text] -> [Tok]
go (SourcePos -> Int -> SourcePos
incSourceLine (SourcePos -> Int -> SourcePos
setSourceColumn SourcePos
pos Int
1) Int
1) [Text]
ts
Char
'\n' -> TokType -> SourcePos -> Text -> Tok
Tok TokType
LineEnd SourcePos
pos Text
t forall a. a -> [a] -> [a]
:
SourcePos -> [Text] -> [Tok]
go (SourcePos -> Int -> SourcePos
incSourceLine (SourcePos -> Int -> SourcePos
setSourceColumn SourcePos
pos Int
1) Int
1) [Text]
ts
Char
thead
| Char -> Bool
isAlphaNum Char
thead ->
TokType -> SourcePos -> Text -> Tok
Tok TokType
WordChars SourcePos
pos Text
t forall a. a -> [a] -> [a]
:
SourcePos -> [Text] -> [Tok]
go (SourcePos -> Int -> SourcePos
incSourceColumn SourcePos
pos (Text -> Int
T.length Text
t)) [Text]
ts
| Char -> Bool
isSpace Char
thead ->
TokType -> SourcePos -> Text -> Tok
Tok TokType
UnicodeSpace SourcePos
pos Text
t forall a. a -> [a] -> [a]
:
SourcePos -> [Text] -> [Tok]
go (SourcePos -> Int -> SourcePos
incSourceColumn SourcePos
pos Int
1) [Text]
ts
| Bool
otherwise ->
TokType -> SourcePos -> Text -> Tok
Tok (Char -> TokType
Symbol Char
thead) SourcePos
pos Text
t forall a. a -> [a] -> [a]
:
SourcePos -> [Text] -> [Tok]
go (SourcePos -> Int -> SourcePos
incSourceColumn SourcePos
pos Int
1) [Text]
ts
untokenize :: [Tok] -> Text
untokenize :: [Tok] -> Text
untokenize = {-# SCC untokenize #-} forall a. Monoid a => [a] -> a
mconcat forall b c a. (b -> c) -> (a -> b) -> a -> c
. forall a b. (a -> b) -> [a] -> [b]
map Tok -> Text
tokContents