// Copyright (c) 2008-2009 Bjoern Hoehrmann // Copyright (c) 2015, Ondrej Palkovsky // Copyright (c) 2016, Winterland #include #include #include #define UTF8_ACCEPT 0 #define UTF8_REJECT 12 static const uint8_t utf8d[] = { // The first part of the table maps bytes to character classes that // to reduce the size of the transition table and create bitmasks. 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, 8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 10,3,3,3,3,3,3,3,3,3,3,3,3,4,3,3, 11,6,6,6,5,8,8,8,8,8,8,8,8,8,8,8, // The second part is a transition table that maps a combination // of a state of the automaton and a character class to a state. 0,12,24,36,60,96,84,12,12,12,48,72, 12,12,12,12,12,12,12,12,12,12,12,12, 12, 0,12,12,12,12,12, 0,12, 0,12,12, 12,24,12,12,12,12,12,24,12,24,12,12, 12,12,12,12,12,12,12,24,12,12,12,12, 12,24,12,12,12,12,12,12,12,24,12,12, 12,12,12,12,12,12,12,36,12,36,12,12, 12,36,12,12,12,12,12,36,12,36,12,12, 12,36,12,12,12,12,12,12,12,12,12,12, }; static inline uint32_t decode(uint32_t* state, uint32_t* codep, uint32_t byte) { uint32_t type = utf8d[byte]; *codep = (*state != UTF8_ACCEPT) ? (byte & 0x3fu) | (*codep << 6) : (0xff >> type) & (byte); *state = utf8d[256 + *state + type]; return *state; } static inline uint16_t decode_hex(uint32_t c) { if (c >= '0' && c <= '9') return c - '0'; else if (c >= 'a' && c <= 'f') return c - 'a' + 10; else if (c >= 'A' && c <= 'F') return c - 'A' + 10; return 0xFFFF; // Should not happen } // Decode, return non-zero value on error int _js_decode_string(uint16_t *const dest, size_t *destoff, const uint8_t *s, const uint8_t *const srcend) { uint16_t *d = dest + *destoff; uint32_t state = 0; uint32_t codepoint; uint8_t surrogate = 0; uint16_t temp_hex = 0; uint16_t unidata; // Optimized version of dispatch when just an ASCII char is expected #define DISPATCH_ASCII(label) {\ if (s >= srcend) {\ return -1;\ }\ codepoint = *s++;\ goto label;\ } standard: // Test end of stream while (s < srcend) { if (decode(&state, &codepoint, *s++) != UTF8_ACCEPT) { if (state == UTF8_REJECT) { return -1; } continue; } if (codepoint == '\\') DISPATCH_ASCII(backslash) else if (codepoint <= 0xffff) *d++ = (uint16_t) codepoint; else { *d++ = (uint16_t) (0xD7C0 + (codepoint >> 10)); *d++ = (uint16_t) (0xDC00 + (codepoint & 0x3FF)); } } *destoff = d - dest; // Exit point return (state != UTF8_ACCEPT); backslash: switch (codepoint) { case '"': case '\\': case '/': *d++ = (uint16_t) codepoint; goto standard; break; case 'b': *d++ = '\b';goto standard; case 'f': *d++ = '\f';goto standard; case 'n': *d++ = '\n';goto standard; case 'r': *d++ = '\r';goto standard; case 't': *d++ = '\t';goto standard; case 'u': DISPATCH_ASCII(unicode1);;break; default: return -1; } unicode1: temp_hex = decode_hex(codepoint); if (temp_hex == 0xFFFF) { return -1; } else unidata = temp_hex << 12; DISPATCH_ASCII(unicode2); unicode2: temp_hex = decode_hex(codepoint); if (temp_hex == 0xFFFF) { return -1; } else unidata |= temp_hex << 8; DISPATCH_ASCII(unicode3); unicode3: temp_hex = decode_hex(codepoint); if (temp_hex == 0xFFFF) { return -1; } else unidata |= temp_hex << 4; DISPATCH_ASCII(unicode4); unicode4: temp_hex = decode_hex(codepoint); if (temp_hex == 0xFFFF) { return -1; } else unidata |= temp_hex; *d++ = (uint16_t) unidata; if (surrogate) { if (unidata < 0xDC00 || unidata > 0xDFFF) // is not low surrogate return -1; surrogate = 0; } else if (unidata >= 0xD800 && unidata <= 0xDBFF ) { // is high surrogate surrogate = 1; DISPATCH_ASCII(surrogate1); } else if (unidata >= 0xDC00 && unidata <= 0xDFFF) { // is low surrogate return -1; } goto standard; surrogate1: if (codepoint != '\\') { return -1; } DISPATCH_ASCII(surrogate2) surrogate2: if (codepoint != 'u') { return -1; } DISPATCH_ASCII(unicode1) }