module OpenTheory.Unicode.UTF8
where
import qualified OpenTheory.Natural.Bits as Bits
import qualified OpenTheory.Parser as Parser
import qualified OpenTheory.Parser.Stream as Stream
import qualified OpenTheory.Primitive.Byte as Byte
import qualified OpenTheory.Primitive.Natural as Natural
import qualified OpenTheory.Unicode as Unicode
parseAscii :: Parser.Parser Byte.Byte Natural.Natural
parseAscii =
Parser.token
(\b -> if Byte.bit b 7 then Nothing else Just (Byte.toNatural b))
isContinuationByte :: Byte.Byte -> Bool
isContinuationByte b = Byte.bit b 7 && not (Byte.bit b 6)
parseMultibyte :: Parser.Parser Byte.Byte Natural.Natural
parseMultibyte =
Parser.sequenceParser
(Parser.token
(\b ->
if Byte.bit b 6 then
if Byte.bit b 5 then
if Byte.bit b 4 then
if Byte.bit b 3 then Nothing else Just (parse4 b)
else Just (parse3 b)
else Just (parse2 b)
else Nothing))
where
parse2 b =
Parser.filterParser
(Parser.foldN addContinuationByte 0
(Byte.toNatural (Byte.and b 31))) (\n -> 128 <= n)
parse3 b =
Parser.filterParser
(Parser.foldN addContinuationByte 1
(Byte.toNatural (Byte.and b 15))) (\n -> 2048 <= n)
parse4 b =
Parser.filterParser
(Parser.foldN addContinuationByte 2
(Byte.toNatural (Byte.and b 7))) (\n -> 65536 <= n)
addContinuationByte b n =
if isContinuationByte b then
Just (Byte.toNatural (Byte.and b 63) + Natural.shiftLeft n 6)
else Nothing
parseNatural :: Parser.Parser Byte.Byte Natural.Natural
parseNatural = Parser.orelse parseAscii parseMultibyte
parseUnicode :: Parser.Parser Byte.Byte Unicode.Unicode
parseUnicode =
Parser.mapPartial parseNatural
(\n ->
if Unicode.invariant n then Just (Unicode.Unicode n) else Nothing)
parse :: Parser.Parser Byte.Byte (Either Byte.Byte Unicode.Unicode)
parse =
Parser.orelse (Parser.mapParser parseUnicode Right)
(Parser.mapParser Parser.anyToken Left)
decode :: [Byte.Byte] -> [Either Byte.Byte Unicode.Unicode]
decode b = fst (Stream.toList (Parser.parse parse (Stream.fromList b)))
encodeAscii :: Natural.Natural -> [Byte.Byte]
encodeAscii n = Byte.fromNatural n : []
encodeUnicode :: Unicode.Unicode -> [Byte.Byte]
encodeUnicode =
\c ->
let n = Unicode.unUnicode c in
if n < 128 then encodeAscii n
else if n < 2048 then encode2 n
else if n < 65536 then encode3 n
else encode4 n
where
encode2 n =
let n1 = Natural.shiftRight n 6 in
let b0 = Byte.or 192 (Byte.fromNatural n1) in
let b1 = Byte.or 128 (Byte.fromNatural (Bits.bound n 6)) in
b0 : b1 : []
encode3 n =
let n1 = Natural.shiftRight n 6 in
let n2 = Natural.shiftRight n1 6 in
let b0 = Byte.or 224 (Byte.fromNatural n2) in
let b1 = Byte.or 128 (Byte.fromNatural (Bits.bound n1 6)) in
let b2 = Byte.or 128 (Byte.fromNatural (Bits.bound n 6)) in
b0 : b1 : b2 : []
encode4 n =
let n1 = Natural.shiftRight n 6 in
let n2 = Natural.shiftRight n1 6 in
let n3 = Natural.shiftRight n2 6 in
let b0 = Byte.or 240 (Byte.fromNatural n3) in
let b1 = Byte.or 128 (Byte.fromNatural (Bits.bound n2 6)) in
let b2 = Byte.or 128 (Byte.fromNatural (Bits.bound n1 6)) in
let b3 = Byte.or 128 (Byte.fromNatural (Bits.bound n 6)) in
b0 : b1 : b2 : b3 : []
encode :: [Unicode.Unicode] -> [Byte.Byte]
encode c = concat (map encodeUnicode c)
reencodeUnicode :: Either Byte.Byte Unicode.Unicode -> [Byte.Byte]
reencodeUnicode x =
case x of
Left b -> b : []
Right c -> encodeUnicode c
reencode :: [Either Byte.Byte Unicode.Unicode] -> [Byte.Byte]
reencode c = concat (map reencodeUnicode c)