{- | Parsers for strings in TOML format, including basic and literal strings both singleline and multiline. -} module Toml.Parser.String ( textP , basicStringP , literalStringP ) where import Control.Applicative (Alternative (..)) import Control.Applicative.Combinators (count, manyTill, optional) import Data.Char (chr, isControl) import Data.Semigroup ((<>)) import Data.Text (Text) import Toml.Parser.Core (Parser, anySingle, char, eol, hexDigitChar, lexeme, satisfy, space, string, tab, try, ()) import qualified Data.Text as Text {- | Parser for TOML text. Includes: 1. Basic single-line string. 2. Literal single-line string. 3. Basic multiline string. 4. Literal multiline string. -} textP :: Parser Text textP = (multilineBasicStringP "multiline basic string") <|> (multilineLiteralStringP "multiline literal string") <|> (literalStringP "literal string") <|> (basicStringP "basic string") "text" {- | Parse a non-control character (control character is a non-printing character of the Latin-1 subset of Unicode). -} nonControlCharP :: Parser Text nonControlCharP = Text.singleton <$> satisfy (not . isControl) "non-control char" -- | Parse escape sequences inside basic strings. escapeSequenceP :: Parser Text escapeSequenceP = char '\\' *> anySingle >>= \case 'b' -> pure "\b" 't' -> pure "\t" 'n' -> pure "\n" 'f' -> pure "\f" 'r' -> pure "\r" '"' -> pure "\"" '\\' -> pure "\\" 'u' -> hexUnicodeP 4 'U' -> hexUnicodeP 8 c -> fail $ "Invalid escape sequence: " <> "\\" <> [c] where hexUnicodeP :: Int -> Parser Text hexUnicodeP n = count n hexDigitChar >>= \x -> case toUnicode $ hexToInt x of Just c -> pure (Text.singleton c) Nothing -> fail $ "Invalid unicode character: \\" <> (if n == 4 then "u" else "U") <> x where hexToInt :: String -> Int hexToInt xs = read $ "0x" ++ xs toUnicode :: Int -> Maybe Char toUnicode x -- Ranges from "The Unicode Standard". -- See definition D76 in Section 3.9, Unicode Encoding Forms. | x >= 0 && x <= 0xD7FF = Just (chr x) | x >= 0xE000 && x <= 0x10FFFF = Just (chr x) | otherwise = Nothing -- | Parser for basic string in double quotes. basicStringP :: Parser Text basicStringP = lexeme $ mconcat <$> (char '"' *> charP `manyTill` char '"') where charP :: Parser Text charP = escapeSequenceP <|> nonControlCharP -- | Parser for literal string in single quotes. literalStringP :: Parser Text literalStringP = lexeme $ Text.pack <$> (char '\'' *> nonEolCharP `manyTill` char '\'') where nonEolCharP :: Parser Char nonEolCharP = satisfy (\c -> c /= '\n' && c /= '\r') -- | Generic parser for multiline string. Used in 'multilineBasicStringP' and -- 'multilineLiteralStringP'. multilineP :: Parser Text -> Parser Text -> Parser Text multilineP quotesP allowedCharP = lexeme $ mconcat <$> (quotesP *> optional eol *> allowedCharP `manyTill` quotesP) -- Parser for basic multiline string in """ quotes. multilineBasicStringP :: Parser Text multilineBasicStringP = multilineP quotesP allowedCharP where quotesP :: Parser Text quotesP = string "\"\"\"" allowedCharP :: Parser Text allowedCharP = lineEndingBackslashP <|> escapeSequenceP <|> nonControlCharP <|> eol lineEndingBackslashP :: Parser Text lineEndingBackslashP = Text.empty <$ try (char '\\' >> eol >> space) -- Parser for literal multiline string in ''' quotes. multilineLiteralStringP :: Parser Text multilineLiteralStringP = multilineP quotesP allowedCharP where quotesP :: Parser Text quotesP = string "'''" allowedCharP :: Parser Text allowedCharP = nonControlCharP <|> eol <|> Text.singleton <$> tab