{-# LANGUAGE BangPatterns #-} {-# LANGUAGE MagicHash #-} {-# LANGUAGE TypeApplications #-} {-# LANGUAGE UnboxedSums #-} {-# LANGUAGE UnboxedTuples #-} -- | This module treats 'Bytes' data as holding text encoded in ISO-8859-1. This -- encoding can only encode codepoints strictly below @U+0100@, but this allows -- each codepoint to be placed directly into a single byte. This range consists -- of Unicode Basic Latin, Latin-1 Supplement and C0+C1 Controls, which includes -- ASCII. -- -- Strictly, ISO-8859-1 is not to be confused with ISO/IEC 8859-1 (which was the -- default encoding for webpages before HTML5). ISO/IEC 8859-1 lacks encodings -- for the C0 and C1 control characters. -- -- With HTML5, the default encoding of webpages was changed to Windows-1252, -- which is _not_ compatible with ISO-8859-1. Windows-1252 uses the C1 Control -- range (@U+0080@ -- @U+009F@) mostly to encode a variety of printable -- characters. For this encoding, see 'Data.Bytes.Text.Windows1252'. module Data.Bytes.Text.Latin1 ( toString , fromString , decodeDecWord -- * Specialized Comparisons , equals1 , equals2 , equals3 , equals4 , equals5 , equals6 , equals7 , equals8 , equals9 , equals10 , equals11 , equals12 , equals13 , equals14 , equals15 ) where import Prelude hiding (length) import Data.Bytes.Types (Bytes(..)) import Data.Char (ord,chr) import Data.Primitive (ByteArray(ByteArray)) import Data.Word (Word8) import GHC.Exts (Int(I#),Char(C#),or#,ltWord#,int2Word#) import GHC.Exts (Word(W#),Word#) import qualified Data.Primitive as PM import qualified Data.Bytes.Pure as Bytes import qualified GHC.Exts as Exts -- | Convert a 'String' consisting of only characters representable -- by ISO-8859-1. These are encoded with ISO-8859-1. Any character -- with a codepoint above @U+00FF@ is replaced by an unspecified byte. fromString :: String -> Bytes fromString = Bytes.fromByteArray . Exts.fromList . map (fromIntegral @Int @Word8 . ord) -- | Interpret a byte sequence as text encoded by ISO-8859-1. toString :: Bytes -> String {-# INLINE toString #-} toString = Bytes.foldr (\w xs -> chr (fromIntegral @Word8 @Int w) : xs) [] -- TODO presumably also fromText and fromShortText -- | Is the byte sequence, when interpreted as ISO-8859-1-encoded text, -- a singleton whose element matches the character? equals1 :: Char -> Bytes -> Bool {-# INLINE equals1 #-} equals1 !c0 (Bytes arr off len) = case len of 1 -> c0 == indexCharArray arr off _ -> False -- | Is the byte sequence, when interpreted as ISO-8859-1-encoded text, -- a doubleton whose elements match the characters? equals2 :: Char -> Char -> Bytes -> Bool equals2 !c0 !c1 (Bytes arr off len) = case len of 2 -> c0 == indexCharArray arr off && c1 == indexCharArray arr (off + 1) _ -> False -- | Is the byte sequence, when interpreted as ISO-8859-1-encoded text, -- a tripleton whose elements match the characters? equals3 :: Char -> Char -> Char -> Bytes -> Bool equals3 !c0 !c1 !c2 (Bytes arr off len) = case len of 3 -> c0 == indexCharArray arr off && c1 == indexCharArray arr (off + 1) && c2 == indexCharArray arr (off + 2) _ -> False -- | Is the byte sequence, when interpreted as ISO-8859-1-encoded text, -- a quadrupleton whose elements match the characters? equals4 :: Char -> Char -> Char -> Char -> Bytes -> Bool equals4 !c0 !c1 !c2 !c3 (Bytes arr off len) = case len of 4 -> c0 == indexCharArray arr off && c1 == indexCharArray arr (off + 1) && c2 == indexCharArray arr (off + 2) && c3 == indexCharArray arr (off + 3) _ -> False -- | Is the byte sequence, when interpreted as ISO-8859-1-encoded text, -- a quintupleton whose elements match the characters? equals5 :: Char -> Char -> Char -> Char -> Char -> Bytes -> Bool equals5 !c0 !c1 !c2 !c3 !c4 (Bytes arr off len) = case len of 5 -> c0 == indexCharArray arr off && c1 == indexCharArray arr (off + 1) && c2 == indexCharArray arr (off + 2) && c3 == indexCharArray arr (off + 3) && c4 == indexCharArray arr (off + 4) _ -> False -- | Is the byte sequence, when interpreted as ISO-8859-1-encoded text, -- a sextupleton whose elements match the characters? equals6 :: Char -> Char -> Char -> Char -> Char -> Char -> Bytes -> Bool equals6 !c0 !c1 !c2 !c3 !c4 !c5 (Bytes arr off len) = case len of 6 -> c0 == indexCharArray arr off && c1 == indexCharArray arr (off + 1) && c2 == indexCharArray arr (off + 2) && c3 == indexCharArray arr (off + 3) && c4 == indexCharArray arr (off + 4) && c5 == indexCharArray arr (off + 5) _ -> False -- | Is the byte sequence, when interpreted as ISO-8859-1-encoded text, -- a septupleton whose elements match the characters? equals7 :: Char -> Char -> Char -> Char -> Char -> Char -> Char -> Bytes -> Bool equals7 !c0 !c1 !c2 !c3 !c4 !c5 !c6 (Bytes arr off len) = case len of 7 -> c0 == indexCharArray arr off && c1 == indexCharArray arr (off + 1) && c2 == indexCharArray arr (off + 2) && c3 == indexCharArray arr (off + 3) && c4 == indexCharArray arr (off + 4) && c5 == indexCharArray arr (off + 5) && c6 == indexCharArray arr (off + 6) _ -> False -- | Is the byte sequence, when interpreted as ISO-8859-1-encoded text, -- an octupleton whose elements match the characters? equals8 :: Char -> Char -> Char -> Char -> Char -> Char -> Char -> Char -> Bytes -> Bool equals8 !c0 !c1 !c2 !c3 !c4 !c5 !c6 !c7 (Bytes arr off len) = case len of 8 -> c0 == indexCharArray arr off && c1 == indexCharArray arr (off + 1) && c2 == indexCharArray arr (off + 2) && c3 == indexCharArray arr (off + 3) && c4 == indexCharArray arr (off + 4) && c5 == indexCharArray arr (off + 5) && c6 == indexCharArray arr (off + 6) && c7 == indexCharArray arr (off + 7) _ -> False -- | Is the byte sequence, when interpreted as ISO-8859-1-encoded text, -- a 9-tuple whose elements match the characters? equals9 :: Char -> Char -> Char -> Char -> Char -> Char -> Char -> Char -> Char -> Bytes -> Bool equals9 !c0 !c1 !c2 !c3 !c4 !c5 !c6 !c7 !c8 (Bytes arr off len) = case len of 9 -> c0 == indexCharArray arr off && c1 == indexCharArray arr (off + 1) && c2 == indexCharArray arr (off + 2) && c3 == indexCharArray arr (off + 3) && c4 == indexCharArray arr (off + 4) && c5 == indexCharArray arr (off + 5) && c6 == indexCharArray arr (off + 6) && c7 == indexCharArray arr (off + 7) && c8 == indexCharArray arr (off + 8) _ -> False -- | Is the byte sequence, when interpreted as ISO-8859-1-encoded text, -- a 10-tuple whose elements match the characters? equals10 :: Char -> Char -> Char -> Char -> Char -> Char -> Char -> Char -> Char -> Char -> Bytes -> Bool equals10 !c0 !c1 !c2 !c3 !c4 !c5 !c6 !c7 !c8 !c9 (Bytes arr off len) = case len of 10 -> c0 == indexCharArray arr off && c1 == indexCharArray arr (off + 1) && c2 == indexCharArray arr (off + 2) && c3 == indexCharArray arr (off + 3) && c4 == indexCharArray arr (off + 4) && c5 == indexCharArray arr (off + 5) && c6 == indexCharArray arr (off + 6) && c7 == indexCharArray arr (off + 7) && c8 == indexCharArray arr (off + 8) && c9 == indexCharArray arr (off + 9) _ -> False -- | Is the byte sequence, when interpreted as ISO-8859-1-encoded text, -- a 11-tuple whose elements match the characters? equals11 :: Char -> Char -> Char -> Char -> Char -> Char -> Char -> Char -> Char -> Char -> Char -> Bytes -> Bool equals11 !c0 !c1 !c2 !c3 !c4 !c5 !c6 !c7 !c8 !c9 !c10 (Bytes arr off len) = case len of 11 -> c0 == indexCharArray arr off && c1 == indexCharArray arr (off + 1) && c2 == indexCharArray arr (off + 2) && c3 == indexCharArray arr (off + 3) && c4 == indexCharArray arr (off + 4) && c5 == indexCharArray arr (off + 5) && c6 == indexCharArray arr (off + 6) && c7 == indexCharArray arr (off + 7) && c8 == indexCharArray arr (off + 8) && c9 == indexCharArray arr (off + 9) && c10 == indexCharArray arr (off + 10) _ -> False -- | Is the byte sequence, when interpreted as ISO-8859-1-encoded text, -- a 12-tuple whose elements match the characters? equals12 :: Char -> Char -> Char -> Char -> Char -> Char -> Char -> Char -> Char -> Char -> Char -> Char -> Bytes -> Bool equals12 !c0 !c1 !c2 !c3 !c4 !c5 !c6 !c7 !c8 !c9 !c10 !c11 (Bytes arr off len) = case len of 12 -> c0 == indexCharArray arr off && c1 == indexCharArray arr (off + 1) && c2 == indexCharArray arr (off + 2) && c3 == indexCharArray arr (off + 3) && c4 == indexCharArray arr (off + 4) && c5 == indexCharArray arr (off + 5) && c6 == indexCharArray arr (off + 6) && c7 == indexCharArray arr (off + 7) && c8 == indexCharArray arr (off + 8) && c9 == indexCharArray arr (off + 9) && c10 == indexCharArray arr (off + 10) && c11 == indexCharArray arr (off + 11) _ -> False equals13 :: Char -> Char -> Char -> Char -> Char -> Char -> Char -> Char -> Char -> Char -> Char -> Char -> Char -> Bytes -> Bool equals13 !c0 !c1 !c2 !c3 !c4 !c5 !c6 !c7 !c8 !c9 !c10 !c11 !c12 (Bytes arr off len) = case len of 13 -> c0 == indexCharArray arr off && c1 == indexCharArray arr (off + 1) && c2 == indexCharArray arr (off + 2) && c3 == indexCharArray arr (off + 3) && c4 == indexCharArray arr (off + 4) && c5 == indexCharArray arr (off + 5) && c6 == indexCharArray arr (off + 6) && c7 == indexCharArray arr (off + 7) && c8 == indexCharArray arr (off + 8) && c9 == indexCharArray arr (off + 9) && c10 == indexCharArray arr (off + 10) && c11 == indexCharArray arr (off + 11) && c12 == indexCharArray arr (off + 12) _ -> False equals14 :: Char -> Char -> Char -> Char -> Char -> Char -> Char -> Char -> Char -> Char -> Char -> Char -> Char -> Char -> Bytes -> Bool equals14 !c0 !c1 !c2 !c3 !c4 !c5 !c6 !c7 !c8 !c9 !c10 !c11 !c12 !c13 (Bytes arr off len) = case len of 14 -> c0 == indexCharArray arr off && c1 == indexCharArray arr (off + 1) && c2 == indexCharArray arr (off + 2) && c3 == indexCharArray arr (off + 3) && c4 == indexCharArray arr (off + 4) && c5 == indexCharArray arr (off + 5) && c6 == indexCharArray arr (off + 6) && c7 == indexCharArray arr (off + 7) && c8 == indexCharArray arr (off + 8) && c9 == indexCharArray arr (off + 9) && c10 == indexCharArray arr (off + 10) && c11 == indexCharArray arr (off + 11) && c12 == indexCharArray arr (off + 12) && c13 == indexCharArray arr (off + 13) _ -> False equals15 :: Char -> Char -> Char -> Char -> Char -> Char -> Char -> Char -> Char -> Char -> Char -> Char -> Char -> Char -> Char -> Bytes -> Bool equals15 !c0 !c1 !c2 !c3 !c4 !c5 !c6 !c7 !c8 !c9 !c10 !c11 !c12 !c13 !c14 (Bytes arr off len) = case len of 15 -> c0 == indexCharArray arr off && c1 == indexCharArray arr (off + 1) && c2 == indexCharArray arr (off + 2) && c3 == indexCharArray arr (off + 3) && c4 == indexCharArray arr (off + 4) && c5 == indexCharArray arr (off + 5) && c6 == indexCharArray arr (off + 6) && c7 == indexCharArray arr (off + 7) && c8 == indexCharArray arr (off + 8) && c9 == indexCharArray arr (off + 9) && c10 == indexCharArray arr (off + 10) && c11 == indexCharArray arr (off + 11) && c12 == indexCharArray arr (off + 12) && c13 == indexCharArray arr (off + 13) && c14 == indexCharArray arr (off + 14) _ -> False indexCharArray :: ByteArray -> Int -> Char indexCharArray (ByteArray arr) (I# off) = C# (Exts.indexCharArray# arr off) -- | Decode machine-sized word from decimal representation. Returns -- Nothing on overflow. Allows any number of leading zeros. Trailing -- non-digit bytes cause Nothing to be returned. decodeDecWord :: Bytes -> Maybe Word {-# inline decodeDecWord #-} decodeDecWord !b = case decWordStart b of (# (# #) | #) -> Nothing (# | w #) -> Just (W# w) decWordStart :: Bytes -- Chunk -> (# (# #) | Word# #) {-# noinline decWordStart #-} decWordStart !chunk0 = if length chunk0 > 0 then let !w = fromIntegral @Word8 @Word (PM.indexByteArray (array chunk0) (offset chunk0)) - 48 in if w < 10 then decWordMore w (Bytes.unsafeDrop 1 chunk0) else (# (# #) | #) else (# (# #) | #) where decWordMore :: Word -- Accumulator -> Bytes -- Chunk -> (# (# #) | Word# #) decWordMore !acc !chunk = let len = length chunk in case len of 0 -> (# | unW (fromIntegral acc) #) _ -> let !w = fromIntegral @Word8 @Word (PM.indexByteArray (array chunk) (offset chunk)) - 48 in if w < 10 then let (overflow,acc') = unsignedPushBase10 acc w in if overflow then (# (# #) | #) else decWordMore acc' (Bytes.unsafeDrop 1 chunk) else (# (# #) | #) unsignedPushBase10 :: Word -> Word -> (Bool,Word) {-# inline unsignedPushBase10 #-} unsignedPushBase10 (W# a) (W# b) = let !(# ca, r0 #) = Exts.timesWord2# a 10## !r1 = Exts.plusWord# r0 b !cb = int2Word# (ltWord# r1 r0) !c = ca `or#` cb in (case c of { 0## -> False; _ -> True }, W# r1) unW :: Word -> Word# {-# inline unW #-} unW (W# w) = w