/* * Copyright (c) 2014 Daniel P. Wright * * Based heavily on the utf8 decoding functions found in Data.Text. See: * https://github.com/bos/text/blob/master/cbits/cbits.c * to see what I mean. */ #include #include #include #include #define UTF7_RESULT 0x000000ff #define UTF7_STATE 0x0000ff00 #define UTF7_REM 0xffff0000 #define UTF7_ACCEPT 0 #define UTF7_CONT 1 #define UTF7_REJECT 2 #define UTF7_ASCII 0 #define UTF7_B64_1 1 #define UTF7_B64_2 2 #define UTF7_B64_3 3 #define UTF7_B64_4 4 #define UTF7_B64_5 5 #define UTF7_B64_6 6 #define UTF7_B64_7 7 #define UTF7_B64_8 8 #define UTF7_B64_AMP 9 #define UTF7_B64_STATES 10 #define BASE64D_OFFSET 0x2b #define BASE64D_FINAL 0x7a #define BASE64D_INVALID 0xff #define BASE64D_SIZE (BASE64D_FINAL - BASE64D_OFFSET) + 1 #define STATE_MODE_MASK 0x0f00 #define STATE_RES_MASK 0x00c0 #define STATE_AMP_MASK 0x0020 #define STATE_SR_MASK 0x0010 #define STATE_OFFSET_MASK 0x000f #define PACK_STATE(next_mode, result_if_hyphen, amp_if_hyphen, cp_shift_right, cp_offset) \ ( (next_mode << 8) | (result_if_hyphen << 6) | (amp_if_hyphen) << 5 \ | (cp_shift_right << 4) | cp_offset) static const uint8_t base64e[64] = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+,"; static const uint8_t base64d[BASE64D_SIZE] = { 62, 63, BASE64D_INVALID, BASE64D_INVALID, BASE64D_INVALID, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, BASE64D_INVALID, BASE64D_INVALID, BASE64D_INVALID, BASE64D_INVALID, BASE64D_INVALID, BASE64D_INVALID, BASE64D_INVALID, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, BASE64D_INVALID, BASE64D_INVALID, BASE64D_INVALID, BASE64D_INVALID, BASE64D_INVALID, BASE64D_INVALID, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51 }; static const uint16_t base64states[UTF7_B64_STATES] = { 0, //invalid PACK_STATE(UTF7_B64_2, UTF7_CONT, false, false, 10), PACK_STATE(UTF7_B64_3, UTF7_REJECT, false, false, 4), PACK_STATE(UTF7_B64_4, UTF7_REJECT, false, true, 2), PACK_STATE(UTF7_B64_5, UTF7_CONT, false, false, 8), PACK_STATE(UTF7_B64_6, UTF7_REJECT, false, false, 2), PACK_STATE(UTF7_B64_7, UTF7_REJECT, false, true, 4), PACK_STATE(UTF7_B64_8, UTF7_CONT, false, false, 6), PACK_STATE(UTF7_B64_1, UTF7_REJECT, false, true, 0), PACK_STATE(UTF7_B64_2, UTF7_ACCEPT, true, false, 10) }; static inline _Bool is_printable_ascii(uint16_t c) { return (c >= 0x20 && c <= 0x7e); } static inline uint8_t lookupd(uint8_t c) { uint8_t i = c - BASE64D_OFFSET; return (i > BASE64D_SIZE) ? BASE64D_INVALID : base64d[i]; } static inline uint16_t decode(uint32_t *state, uint16_t *codep, uint8_t byte) { uint16_t rem = *state >> 16; uint8_t st = (*state & UTF7_STATE) >> 8; uint8_t r = *state & UTF7_RESULT; if(st == UTF7_ASCII) { if(byte == '&') { *codep = 0; st = UTF7_B64_AMP; r = UTF7_CONT; } else { *codep = byte; r = UTF7_ACCEPT; } } else { uint16_t packed_state = base64states[st]; uint8_t next_mode = (packed_state & STATE_MODE_MASK) >> 8; uint8_t result_if_hyphen = (packed_state & STATE_RES_MASK) >> 6; bool amp_if_hyphen = (packed_state & STATE_AMP_MASK) >> 5; bool cp_shift_right = (packed_state & STATE_SR_MASK) >> 4; uint8_t cp_offset = packed_state & STATE_OFFSET_MASK; if(byte == '-') { if(amp_if_hyphen) *codep = '&'; st = UTF7_ASCII; r = result_if_hyphen; } else { uint8_t b = lookupd(byte); if(b != BASE64D_INVALID) { if(!cp_shift_right) { *codep |= rem | (b << cp_offset); rem = 0; r = UTF7_CONT; } else { *codep |= b >> cp_offset; rem = b << 16 - cp_offset; r = UTF7_ACCEPT; } st = next_mode; } else { r = UTF7_REJECT; } } } *state = rem << 16 | st << 8 | r; return *state; } static inline uint8_t const * _hs_text_decode_utf7_int(uint16_t *const dest, size_t *destoff, const uint8_t **src, const uint8_t *srcend, uint16_t *codepoint0, uint32_t *state0) { uint16_t *d = dest + *destoff; const uint8_t *s = *src, *last = *src; uint32_t state = *state0; uint16_t codepoint = *codepoint0; while (s < srcend) { if (decode(&state, &codepoint, *s++) & UTF7_RESULT != UTF7_ACCEPT) { if (state & UTF7_RESULT != UTF7_REJECT) continue; break; } *d++ = codepoint; last = s; codepoint = 0; } *destoff = d - dest; *codepoint0 = codepoint; *state0 = state; *src = last; return s; } uint8_t const * _hs_text_decode_utf7_state(uint16_t *const dest, size_t *destoff, const uint8_t **src, const uint8_t *srcend, uint16_t *codepoint0, uint32_t *state0) { uint8_t const *ret = _hs_text_decode_utf7_int(dest, destoff, src, srcend, codepoint0, state0); if(*state0 & UTF7_RESULT == UTF7_REJECT) ret -= 1; return ret; } /* * Helper to decode buffer and discard final decoder state */ const uint8_t * _hs_text_decode_utf7(uint16_t *const dest, size_t *destoff, const uint8_t *src, const uint8_t *const srcend) { uint16_t codepoint; uint32_t state = UTF7_ACCEPT; uint8_t const *ret = _hs_text_decode_utf7_int(dest, destoff, &src, srcend, &codepoint, &state); /*Back up if we have an invalid encoding */ if (state & UTF7_RESULT == UTF7_REJECT) { ret -= 1; } return ret; } void _hs_text_encode_utf7(uint8_t **destp, const uint16_t *src, size_t srcoff, size_t srclen) { const uint16_t *srcend; uint8_t *dest = *destp; _Bool inAsciiBlock = true; int i = 0; src += srcoff; srcend = src + srclen; while (src < srcend) { uint16_t w = *src; if(inAsciiBlock) { if(is_printable_ascii(w)) { if(w == '&') { *dest++ = '&'; *dest++ = '-'; } else { *dest++ = w; } src++; } else { inAsciiBlock = false; *dest++ = '&'; } } else { if(is_printable_ascii(w)) { inAsciiBlock = true; *dest++ = '-'; } else { uint8_t indices[8]; uint8_t bytes = 3; indices[0] = (w & 0xfc00) >> 10; indices[1] = (w & 0x03f0) >> 4; indices[2] = (w & 0x000f) << 2; w = *++src; if(src < srcend) { if(!is_printable_ascii(w)) { bytes = 6; indices[2] |= (w & 0xc000) >> 14; indices[3] = (w & 0x3f00) >> 8; indices[4] = (w & 0x00fc) >> 2; indices[5] = (w & 0x0003) << 4; w = *++src; if(src < srcend) { if(!is_printable_ascii(w)) { bytes = 8; indices[5] |= (w & 0xf000) >> 12; indices[6] = (w & 0x0fc0) >> 6; indices[7] = w & 0x003f; src++; } } } } for(i = 0; i < bytes; ++i) *dest++ = base64e[indices[i]]; } } } if(!inAsciiBlock) *dest++ = '-'; *destp = dest; }