| 1 | {-# LANGUAGE BangPatterns #-} |
|---|
| 2 | {-# OPTIONS_GHC -O #-} |
|---|
| 3 | -- We always optimise this, otherwise performance of a non-optimised |
|---|
| 4 | -- compiler is severely affected |
|---|
| 5 | |
|---|
| 6 | -- ----------------------------------------------------------------------------- |
|---|
| 7 | -- |
|---|
| 8 | -- (c) The University of Glasgow, 1997-2006 |
|---|
| 9 | -- |
|---|
| 10 | -- Character encodings |
|---|
| 11 | -- |
|---|
| 12 | -- ----------------------------------------------------------------------------- |
|---|
| 13 | |
|---|
| 14 | module Encoding ( |
|---|
| 15 | -- * UTF-8 |
|---|
| 16 | utf8DecodeChar#, |
|---|
| 17 | utf8PrevChar, |
|---|
| 18 | utf8CharStart, |
|---|
| 19 | utf8DecodeChar, |
|---|
| 20 | utf8DecodeString, |
|---|
| 21 | utf8EncodeChar, |
|---|
| 22 | utf8EncodeString, |
|---|
| 23 | utf8EncodedLength, |
|---|
| 24 | countUTF8Chars, |
|---|
| 25 | |
|---|
| 26 | -- * Z-encoding |
|---|
| 27 | zEncodeString, |
|---|
| 28 | zDecodeString |
|---|
| 29 | ) where |
|---|
| 30 | |
|---|
| 31 | #include "HsVersions.h" |
|---|
| 32 | import Foreign |
|---|
| 33 | import Data.Char |
|---|
| 34 | import Numeric |
|---|
| 35 | import GHC.Ptr ( Ptr(..) ) |
|---|
| 36 | import GHC.Base |
|---|
| 37 | |
|---|
| 38 | -- ----------------------------------------------------------------------------- |
|---|
| 39 | -- UTF-8 |
|---|
| 40 | |
|---|
| 41 | -- We can't write the decoder as efficiently as we'd like without |
|---|
| 42 | -- resorting to unboxed extensions, unfortunately. I tried to write |
|---|
| 43 | -- an IO version of this function, but GHC can't eliminate boxed |
|---|
| 44 | -- results from an IO-returning function. |
|---|
| 45 | -- |
|---|
| 46 | -- We assume we can ignore overflow when parsing a multibyte character here. |
|---|
| 47 | -- To make this safe, we add extra sentinel bytes to unparsed UTF-8 sequences |
|---|
| 48 | -- before decoding them (see StringBuffer.hs). |
|---|
| 49 | |
|---|
| 50 | {-# INLINE utf8DecodeChar# #-} |
|---|
| 51 | utf8DecodeChar# :: Addr# -> (# Char#, Addr# #) |
|---|
| 52 | utf8DecodeChar# a# = |
|---|
| 53 | let !ch0 = word2Int# (indexWord8OffAddr# a# 0#) in |
|---|
| 54 | case () of |
|---|
| 55 | _ | ch0 <=# 0x7F# -> (# chr# ch0, a# `plusAddr#` 1# #) |
|---|
| 56 | |
|---|
| 57 | | ch0 >=# 0xC0# && ch0 <=# 0xDF# -> |
|---|
| 58 | let !ch1 = word2Int# (indexWord8OffAddr# a# 1#) in |
|---|
| 59 | if ch1 <# 0x80# || ch1 >=# 0xC0# then fail 1# else |
|---|
| 60 | (# chr# (((ch0 -# 0xC0#) `uncheckedIShiftL#` 6#) +# |
|---|
| 61 | (ch1 -# 0x80#)), |
|---|
| 62 | a# `plusAddr#` 2# #) |
|---|
| 63 | |
|---|
| 64 | | ch0 >=# 0xE0# && ch0 <=# 0xEF# -> |
|---|
| 65 | let !ch1 = word2Int# (indexWord8OffAddr# a# 1#) in |
|---|
| 66 | if ch1 <# 0x80# || ch1 >=# 0xC0# then fail 1# else |
|---|
| 67 | let !ch2 = word2Int# (indexWord8OffAddr# a# 2#) in |
|---|
| 68 | if ch2 <# 0x80# || ch2 >=# 0xC0# then fail 2# else |
|---|
| 69 | (# chr# (((ch0 -# 0xE0#) `uncheckedIShiftL#` 12#) +# |
|---|
| 70 | ((ch1 -# 0x80#) `uncheckedIShiftL#` 6#) +# |
|---|
| 71 | (ch2 -# 0x80#)), |
|---|
| 72 | a# `plusAddr#` 3# #) |
|---|
| 73 | |
|---|
| 74 | | ch0 >=# 0xF0# && ch0 <=# 0xF8# -> |
|---|
| 75 | let !ch1 = word2Int# (indexWord8OffAddr# a# 1#) in |
|---|
| 76 | if ch1 <# 0x80# || ch1 >=# 0xC0# then fail 1# else |
|---|
| 77 | let !ch2 = word2Int# (indexWord8OffAddr# a# 2#) in |
|---|
| 78 | if ch2 <# 0x80# || ch2 >=# 0xC0# then fail 2# else |
|---|
| 79 | let !ch3 = word2Int# (indexWord8OffAddr# a# 3#) in |
|---|
| 80 | if ch3 <# 0x80# || ch3 >=# 0xC0# then fail 3# else |
|---|
| 81 | (# chr# (((ch0 -# 0xF0#) `uncheckedIShiftL#` 18#) +# |
|---|
| 82 | ((ch1 -# 0x80#) `uncheckedIShiftL#` 12#) +# |
|---|
| 83 | ((ch2 -# 0x80#) `uncheckedIShiftL#` 6#) +# |
|---|
| 84 | (ch3 -# 0x80#)), |
|---|
| 85 | a# `plusAddr#` 4# #) |
|---|
| 86 | |
|---|
| 87 | | otherwise -> fail 1# |
|---|
| 88 | where |
|---|
| 89 | -- all invalid sequences end up here: |
|---|
| 90 | fail n = (# '\0'#, a# `plusAddr#` n #) |
|---|
| 91 | -- '\xFFFD' would be the usual replacement character, but |
|---|
| 92 | -- that's a valid symbol in Haskell, so will result in a |
|---|
| 93 | -- confusing parse error later on. Instead we use '\0' which |
|---|
| 94 | -- will signal a lexer error immediately. |
|---|
| 95 | |
|---|
| 96 | utf8DecodeChar :: Ptr Word8 -> (Char, Ptr Word8) |
|---|
| 97 | utf8DecodeChar (Ptr a#) = |
|---|
| 98 | case utf8DecodeChar# a# of (# c#, b# #) -> ( C# c#, Ptr b# ) |
|---|
| 99 | |
|---|
| 100 | -- UTF-8 is cleverly designed so that we can always figure out where |
|---|
| 101 | -- the start of the current character is, given any position in a |
|---|
| 102 | -- stream. This function finds the start of the previous character, |
|---|
| 103 | -- assuming there *is* a previous character. |
|---|
| 104 | utf8PrevChar :: Ptr Word8 -> IO (Ptr Word8) |
|---|
| 105 | utf8PrevChar p = utf8CharStart (p `plusPtr` (-1)) |
|---|
| 106 | |
|---|
| 107 | utf8CharStart :: Ptr Word8 -> IO (Ptr Word8) |
|---|
| 108 | utf8CharStart p = go p |
|---|
| 109 | where go p = do w <- peek p |
|---|
| 110 | if w >= 0x80 && w < 0xC0 |
|---|
| 111 | then go (p `plusPtr` (-1)) |
|---|
| 112 | else return p |
|---|
| 113 | |
|---|
| 114 | utf8DecodeString :: Ptr Word8 -> Int -> IO [Char] |
|---|
| 115 | STRICT2(utf8DecodeString) |
|---|
| 116 | utf8DecodeString (Ptr a#) (I# len#) |
|---|
| 117 | = unpack a# |
|---|
| 118 | where |
|---|
| 119 | !end# = addr2Int# (a# `plusAddr#` len#) |
|---|
| 120 | |
|---|
| 121 | unpack p# |
|---|
| 122 | | addr2Int# p# >=# end# = return [] |
|---|
| 123 | | otherwise = |
|---|
| 124 | case utf8DecodeChar# p# of |
|---|
| 125 | (# c#, q# #) -> do |
|---|
| 126 | chs <- unpack q# |
|---|
| 127 | return (C# c# : chs) |
|---|
| 128 | |
|---|
| 129 | countUTF8Chars :: Ptr Word8 -> Int -> IO Int |
|---|
| 130 | countUTF8Chars ptr bytes = go ptr 0 |
|---|
| 131 | where |
|---|
| 132 | end = ptr `plusPtr` bytes |
|---|
| 133 | |
|---|
| 134 | STRICT2(go) |
|---|
| 135 | go ptr n |
|---|
| 136 | | ptr >= end = return n |
|---|
| 137 | | otherwise = do |
|---|
| 138 | case utf8DecodeChar# (unPtr ptr) of |
|---|
| 139 | (# _, a #) -> go (Ptr a) (n+1) |
|---|
| 140 | |
|---|
| 141 | unPtr :: Ptr a -> Addr# |
|---|
| 142 | unPtr (Ptr a) = a |
|---|
| 143 | |
|---|
| 144 | utf8EncodeChar :: Char -> Ptr Word8 -> IO (Ptr Word8) |
|---|
| 145 | utf8EncodeChar c ptr = |
|---|
| 146 | let x = ord c in |
|---|
| 147 | case () of |
|---|
| 148 | _ | x > 0 && x <= 0x007f -> do |
|---|
| 149 | poke ptr (fromIntegral x) |
|---|
| 150 | return (ptr `plusPtr` 1) |
|---|
| 151 | -- NB. '\0' is encoded as '\xC0\x80', not '\0'. This is so that we |
|---|
| 152 | -- can have 0-terminated UTF-8 strings (see GHC.Base.unpackCStringUtf8). |
|---|
| 153 | | x <= 0x07ff -> do |
|---|
| 154 | poke ptr (fromIntegral (0xC0 .|. ((x `shiftR` 6) .&. 0x1F))) |
|---|
| 155 | pokeElemOff ptr 1 (fromIntegral (0x80 .|. (x .&. 0x3F))) |
|---|
| 156 | return (ptr `plusPtr` 2) |
|---|
| 157 | | x <= 0xffff -> do |
|---|
| 158 | poke ptr (fromIntegral (0xE0 .|. (x `shiftR` 12) .&. 0x0F)) |
|---|
| 159 | pokeElemOff ptr 1 (fromIntegral (0x80 .|. (x `shiftR` 6) .&. 0x3F)) |
|---|
| 160 | pokeElemOff ptr 2 (fromIntegral (0x80 .|. (x .&. 0x3F))) |
|---|
| 161 | return (ptr `plusPtr` 3) |
|---|
| 162 | | otherwise -> do |
|---|
| 163 | poke ptr (fromIntegral (0xF0 .|. (x `shiftR` 18))) |
|---|
| 164 | pokeElemOff ptr 1 (fromIntegral (0x80 .|. ((x `shiftR` 12) .&. 0x3F))) |
|---|
| 165 | pokeElemOff ptr 2 (fromIntegral (0x80 .|. ((x `shiftR` 6) .&. 0x3F))) |
|---|
| 166 | pokeElemOff ptr 3 (fromIntegral (0x80 .|. (x .&. 0x3F))) |
|---|
| 167 | return (ptr `plusPtr` 4) |
|---|
| 168 | |
|---|
| 169 | utf8EncodeString :: Ptr Word8 -> String -> IO () |
|---|
| 170 | utf8EncodeString ptr str = go ptr str |
|---|
| 171 | where STRICT2(go) |
|---|
| 172 | go _ [] = return () |
|---|
| 173 | go ptr (c:cs) = do |
|---|
| 174 | ptr' <- utf8EncodeChar c ptr |
|---|
| 175 | go ptr' cs |
|---|
| 176 | |
|---|
| 177 | utf8EncodedLength :: String -> Int |
|---|
| 178 | utf8EncodedLength str = go 0 str |
|---|
| 179 | where STRICT2(go) |
|---|
| 180 | go n [] = n |
|---|
| 181 | go n (c:cs) |
|---|
| 182 | | ord c > 0 && ord c <= 0x007f = go (n+1) cs |
|---|
| 183 | | ord c <= 0x07ff = go (n+2) cs |
|---|
| 184 | | ord c <= 0xffff = go (n+3) cs |
|---|
| 185 | | otherwise = go (n+4) cs |
|---|
| 186 | |
|---|
| 187 | -- ----------------------------------------------------------------------------- |
|---|
| 188 | -- The Z-encoding |
|---|
| 189 | |
|---|
| 190 | {- |
|---|
| 191 | This is the main name-encoding and decoding function. It encodes any |
|---|
| 192 | string into a string that is acceptable as a C name. This is done |
|---|
| 193 | right before we emit a symbol name into the compiled C or asm code. |
|---|
| 194 | Z-encoding of strings is cached in the FastString interface, so we |
|---|
| 195 | never encode the same string more than once. |
|---|
| 196 | |
|---|
| 197 | The basic encoding scheme is this. |
|---|
| 198 | |
|---|
| 199 | * Tuples (,,,) are coded as Z3T |
|---|
| 200 | |
|---|
| 201 | * Alphabetic characters (upper and lower) and digits |
|---|
| 202 | all translate to themselves; |
|---|
| 203 | except 'Z', which translates to 'ZZ' |
|---|
| 204 | and 'z', which translates to 'zz' |
|---|
| 205 | We need both so that we can preserve the variable/tycon distinction |
|---|
| 206 | |
|---|
| 207 | * Most other printable characters translate to 'zx' or 'Zx' for some |
|---|
| 208 | alphabetic character x |
|---|
| 209 | |
|---|
| 210 | * The others translate as 'znnnU' where 'nnn' is the decimal number |
|---|
| 211 | of the character |
|---|
| 212 | |
|---|
| 213 | Before After |
|---|
| 214 | -------------------------- |
|---|
| 215 | Trak Trak |
|---|
| 216 | foo_wib foozuwib |
|---|
| 217 | > zg |
|---|
| 218 | >1 zg1 |
|---|
| 219 | foo# foozh |
|---|
| 220 | foo## foozhzh |
|---|
| 221 | foo##1 foozhzh1 |
|---|
| 222 | fooZ fooZZ |
|---|
| 223 | :+ ZCzp |
|---|
| 224 | () Z0T 0-tuple |
|---|
| 225 | (,,,,) Z5T 5-tuple |
|---|
| 226 | (# #) Z1H unboxed 1-tuple (note the space) |
|---|
| 227 | (#,,,,#) Z5H unboxed 5-tuple |
|---|
| 228 | (NB: There is no Z1T nor Z0H.) |
|---|
| 229 | -} |
|---|
| 230 | |
|---|
| 231 | type UserString = String -- As the user typed it |
|---|
| 232 | type EncodedString = String -- Encoded form |
|---|
| 233 | |
|---|
| 234 | |
|---|
| 235 | zEncodeString :: UserString -> EncodedString |
|---|
| 236 | zEncodeString cs = case maybe_tuple cs of |
|---|
| 237 | Just n -> n -- Tuples go to Z2T etc |
|---|
| 238 | Nothing -> go cs |
|---|
| 239 | where |
|---|
| 240 | go [] = [] |
|---|
| 241 | go (c:cs) = encode_digit_ch c ++ go' cs |
|---|
| 242 | go' [] = [] |
|---|
| 243 | go' (c:cs) = encode_ch c ++ go' cs |
|---|
| 244 | |
|---|
| 245 | unencodedChar :: Char -> Bool -- True for chars that don't need encoding |
|---|
| 246 | unencodedChar 'Z' = False |
|---|
| 247 | unencodedChar 'z' = False |
|---|
| 248 | unencodedChar c = c >= 'a' && c <= 'z' |
|---|
| 249 | || c >= 'A' && c <= 'Z' |
|---|
| 250 | || c >= '0' && c <= '9' |
|---|
| 251 | |
|---|
| 252 | -- If a digit is at the start of a symbol then we need to encode it. |
|---|
| 253 | -- Otherwise package names like 9pH-0.1 give linker errors. |
|---|
| 254 | encode_digit_ch :: Char -> EncodedString |
|---|
| 255 | encode_digit_ch c | c >= '0' && c <= '9' = encode_as_unicode_char c |
|---|
| 256 | encode_digit_ch c | otherwise = encode_ch c |
|---|
| 257 | |
|---|
| 258 | encode_ch :: Char -> EncodedString |
|---|
| 259 | encode_ch c | unencodedChar c = [c] -- Common case first |
|---|
| 260 | |
|---|
| 261 | -- Constructors |
|---|
| 262 | encode_ch '(' = "ZL" -- Needed for things like (,), and (->) |
|---|
| 263 | encode_ch ')' = "ZR" -- For symmetry with ( |
|---|
| 264 | encode_ch '[' = "ZM" |
|---|
| 265 | encode_ch ']' = "ZN" |
|---|
| 266 | encode_ch ':' = "ZC" |
|---|
| 267 | encode_ch 'Z' = "ZZ" |
|---|
| 268 | |
|---|
| 269 | -- Variables |
|---|
| 270 | encode_ch 'z' = "zz" |
|---|
| 271 | encode_ch '&' = "za" |
|---|
| 272 | encode_ch '|' = "zb" |
|---|
| 273 | encode_ch '^' = "zc" |
|---|
| 274 | encode_ch '$' = "zd" |
|---|
| 275 | encode_ch '=' = "ze" |
|---|
| 276 | encode_ch '>' = "zg" |
|---|
| 277 | encode_ch '#' = "zh" |
|---|
| 278 | encode_ch '.' = "zi" |
|---|
| 279 | encode_ch '<' = "zl" |
|---|
| 280 | encode_ch '-' = "zm" |
|---|
| 281 | encode_ch '!' = "zn" |
|---|
| 282 | encode_ch '+' = "zp" |
|---|
| 283 | encode_ch '\'' = "zq" |
|---|
| 284 | encode_ch '\\' = "zr" |
|---|
| 285 | encode_ch '/' = "zs" |
|---|
| 286 | encode_ch '*' = "zt" |
|---|
| 287 | encode_ch '_' = "zu" |
|---|
| 288 | encode_ch '%' = "zv" |
|---|
| 289 | encode_ch c = encode_as_unicode_char c |
|---|
| 290 | |
|---|
| 291 | encode_as_unicode_char :: Char -> EncodedString |
|---|
| 292 | encode_as_unicode_char c = 'z' : if isDigit (head hex_str) then hex_str |
|---|
| 293 | else '0':hex_str |
|---|
| 294 | where hex_str = showHex (ord c) "U" |
|---|
| 295 | -- ToDo: we could improve the encoding here in various ways. |
|---|
| 296 | -- eg. strings of unicode characters come out as 'z1234Uz5678U', we |
|---|
| 297 | -- could remove the 'U' in the middle (the 'z' works as a separator). |
|---|
| 298 | |
|---|
| 299 | zDecodeString :: EncodedString -> UserString |
|---|
| 300 | zDecodeString [] = [] |
|---|
| 301 | zDecodeString ('Z' : d : rest) |
|---|
| 302 | | isDigit d = decode_tuple d rest |
|---|
| 303 | | otherwise = decode_upper d : zDecodeString rest |
|---|
| 304 | zDecodeString ('z' : d : rest) |
|---|
| 305 | | isDigit d = decode_num_esc d rest |
|---|
| 306 | | otherwise = decode_lower d : zDecodeString rest |
|---|
| 307 | zDecodeString (c : rest) = c : zDecodeString rest |
|---|
| 308 | |
|---|
| 309 | decode_upper, decode_lower :: Char -> Char |
|---|
| 310 | |
|---|
| 311 | decode_upper 'L' = '(' |
|---|
| 312 | decode_upper 'R' = ')' |
|---|
| 313 | decode_upper 'M' = '[' |
|---|
| 314 | decode_upper 'N' = ']' |
|---|
| 315 | decode_upper 'C' = ':' |
|---|
| 316 | decode_upper 'Z' = 'Z' |
|---|
| 317 | decode_upper ch = {-pprTrace "decode_upper" (char ch)-} ch |
|---|
| 318 | |
|---|
| 319 | decode_lower 'z' = 'z' |
|---|
| 320 | decode_lower 'a' = '&' |
|---|
| 321 | decode_lower 'b' = '|' |
|---|
| 322 | decode_lower 'c' = '^' |
|---|
| 323 | decode_lower 'd' = '$' |
|---|
| 324 | decode_lower 'e' = '=' |
|---|
| 325 | decode_lower 'g' = '>' |
|---|
| 326 | decode_lower 'h' = '#' |
|---|
| 327 | decode_lower 'i' = '.' |
|---|
| 328 | decode_lower 'l' = '<' |
|---|
| 329 | decode_lower 'm' = '-' |
|---|
| 330 | decode_lower 'n' = '!' |
|---|
| 331 | decode_lower 'p' = '+' |
|---|
| 332 | decode_lower 'q' = '\'' |
|---|
| 333 | decode_lower 'r' = '\\' |
|---|
| 334 | decode_lower 's' = '/' |
|---|
| 335 | decode_lower 't' = '*' |
|---|
| 336 | decode_lower 'u' = '_' |
|---|
| 337 | decode_lower 'v' = '%' |
|---|
| 338 | decode_lower ch = {-pprTrace "decode_lower" (char ch)-} ch |
|---|
| 339 | |
|---|
| 340 | -- Characters not having a specific code are coded as z224U (in hex) |
|---|
| 341 | decode_num_esc :: Char -> EncodedString -> UserString |
|---|
| 342 | decode_num_esc d rest |
|---|
| 343 | = go (digitToInt d) rest |
|---|
| 344 | where |
|---|
| 345 | go n (c : rest) | isHexDigit c = go (16*n + digitToInt c) rest |
|---|
| 346 | go n ('U' : rest) = chr n : zDecodeString rest |
|---|
| 347 | go n other = error ("decode_num_esc: " ++ show n ++ ' ':other) |
|---|
| 348 | |
|---|
| 349 | decode_tuple :: Char -> EncodedString -> UserString |
|---|
| 350 | decode_tuple d rest |
|---|
| 351 | = go (digitToInt d) rest |
|---|
| 352 | where |
|---|
| 353 | -- NB. recurse back to zDecodeString after decoding the tuple, because |
|---|
| 354 | -- the tuple might be embedded in a longer name. |
|---|
| 355 | go n (c : rest) | isDigit c = go (10*n + digitToInt c) rest |
|---|
| 356 | go 0 ('T':rest) = "()" ++ zDecodeString rest |
|---|
| 357 | go n ('T':rest) = '(' : replicate (n-1) ',' ++ ")" ++ zDecodeString rest |
|---|
| 358 | go 1 ('H':rest) = "(# #)" ++ zDecodeString rest |
|---|
| 359 | go n ('H':rest) = '(' : '#' : replicate (n-1) ',' ++ "#)" ++ zDecodeString rest |
|---|
| 360 | go n other = error ("decode_tuple: " ++ show n ++ ' ':other) |
|---|
| 361 | |
|---|
| 362 | {- |
|---|
| 363 | Tuples are encoded as |
|---|
| 364 | Z3T or Z3H |
|---|
| 365 | for 3-tuples or unboxed 3-tuples respectively. No other encoding starts |
|---|
| 366 | Z<digit> |
|---|
| 367 | |
|---|
| 368 | * "(# #)" is the tycon for an unboxed 1-tuple (not 0-tuple) |
|---|
| 369 | There are no unboxed 0-tuples. |
|---|
| 370 | |
|---|
| 371 | * "()" is the tycon for a boxed 0-tuple. |
|---|
| 372 | There are no boxed 1-tuples. |
|---|
| 373 | -} |
|---|
| 374 | |
|---|
| 375 | maybe_tuple :: UserString -> Maybe EncodedString |
|---|
| 376 | |
|---|
| 377 | maybe_tuple "(# #)" = Just("Z1H") |
|---|
| 378 | maybe_tuple ('(' : '#' : cs) = case count_commas (0::Int) cs of |
|---|
| 379 | (n, '#' : ')' : _) -> Just ('Z' : shows (n+1) "H") |
|---|
| 380 | _ -> Nothing |
|---|
| 381 | maybe_tuple "()" = Just("Z0T") |
|---|
| 382 | maybe_tuple ('(' : cs) = case count_commas (0::Int) cs of |
|---|
| 383 | (n, ')' : _) -> Just ('Z' : shows (n+1) "T") |
|---|
| 384 | _ -> Nothing |
|---|
| 385 | maybe_tuple _ = Nothing |
|---|
| 386 | |
|---|
| 387 | count_commas :: Int -> String -> (Int, String) |
|---|
| 388 | count_commas n (',' : cs) = count_commas (n+1) cs |
|---|
| 389 | count_commas n cs = (n,cs) |
|---|