Branch data Line data Source code
// Copyright 2016 Wladimir J. van der Laan
// Distributed under the MIT software license, see the accompanying
// file COPYING or https://opensource.org/licenses/mit-license.php.
#ifndef UNIVALUE_UTFFILTER_H
#define UNIVALUE_UTFFILTER_H

#include <string>

/**
 * Filter that generates and validates UTF-8, as well as collates UTF-16
 * surrogate pairs as specified in RFC4627.
 *
 * Input arrives either as raw bytes (push_back) or as whole code points
 * (push_back_u); the resulting UTF-8 is appended to the string passed to the
 * constructor. Once any input is found to be invalid the filter stays invalid;
 * finalize() reports the overall verdict. Bytes may still have been appended
 * to the output string for invalid input, so callers should consult
 * finalize() before using it.
 *
 * Rejected input:
 *  - stray continuation bytes, reserved lead bytes (>= 0xF8) and truncated
 *    multi-byte sequences,
 *  - overlong encodings (a code point encoded in more bytes than necessary),
 *  - code points above U+10FFFF,
 *  - unpaired or wrongly ordered UTF-16 surrogate halves, including a pair
 *    that is interrupted by any other character.
 *
 * Note: a surrogate code point that arrives as a raw three-byte UTF-8
 * sequence is handled by the same pairing logic as one passed to
 * push_back_u().
 */
class JSONUTF8StringFilter
{
public:
    /**
     * @param s  Output string that receives the filtered UTF-8. It is held by
     *           reference, so it must outlive this filter.
     */
    explicit JSONUTF8StringFilter(std::string &s):
        str(s), is_valid(true), codepoint(0), min_codepoint(0), state(0), surpair(0)
    {
    }

    /**
     * Write single 8-bit char (may be part of UTF-8 sequence).
     *
     * @param ch  Next raw input byte.
     */
    void push_back(unsigned char ch)
    {
        if (state == 0) {
            if (ch < 0x80) { // 7-bit ASCII, fast direct pass-through
                // An open surrogate pair must be completed by its second
                // half; an ASCII character in between makes the input invalid.
                if (surpair)
                    is_valid = false;
                str.push_back(static_cast<char>(ch));
            } else if (ch < 0xc0) { // Mid-sequence character, invalid in this state
                is_valid = false;
            } else if (ch < 0xe0) { // Start of 2-byte sequence
                codepoint = (ch & 0x1f) << 6;
                min_codepoint = 0x80;
                state = 6;
            } else if (ch < 0xf0) { // Start of 3-byte sequence
                codepoint = (ch & 0x0f) << 12;
                min_codepoint = 0x800;
                state = 12;
            } else if (ch < 0xf8) { // Start of 4-byte sequence
                codepoint = (ch & 0x07) << 18;
                min_codepoint = 0x10000;
                state = 18;
            } else { // Reserved, invalid
                is_valid = false;
            }
        } else {
            if ((ch & 0xc0) != 0x80) // Not a continuation, invalid
                is_valid = false;
            state -= 6;
            codepoint |= (ch & 0x3f) << state;
            if (state == 0) {
                // Sequence complete. A value below the minimum for this
                // sequence length means it was an overlong encoding.
                if (codepoint < min_codepoint)
                    is_valid = false;
                push_back_u(codepoint);
            }
        }
    }

    /**
     * Write codepoint directly, possibly collating surrogate pairs.
     *
     * @param codepoint_  Unicode code point, or one half of a UTF-16
     *                    surrogate pair (0xD800..0xDFFF).
     */
    void push_back_u(unsigned int codepoint_)
    {
        if (state) // Only accept full codepoints in open state
            is_valid = false;
        if (codepoint_ >= 0xD800 && codepoint_ < 0xDC00) { // First half of surrogate pair
            if (surpair) // Two subsequent surrogate pair openers - fail
                is_valid = false;
            else
                surpair = codepoint_;
        } else if (codepoint_ >= 0xDC00 && codepoint_ < 0xE000) { // Second half of surrogate pair
            if (surpair) { // Open surrogate pair, expect second half
                // Compute code point from UTF-16 surrogate pair. This has to
                // be an addition: the shifted high half occupies bits 10..19
                // and therefore overlaps the 0x10000 bit, so OR-ing would
                // lose the offset for e.g. U+20000 (D840 DC00).
                append_codepoint(0x10000 + ((surpair - 0xD800) << 10) + (codepoint_ - 0xDC00));
                surpair = 0;
            } else // Second half doesn't follow a first half - fail
                is_valid = false;
        } else if (codepoint_ > 0x10FFFF) { // Beyond the Unicode range - fail, emit nothing
            is_valid = false;
        } else {
            if (surpair) // First half of surrogate pair not followed by second - fail
                is_valid = false;
            else
                append_codepoint(codepoint_);
        }
    }

    /**
     * Check that we're in a state where the string can be ended:
     * no open sequences, no open surrogate pairs, etc.
     *
     * @return true if all input seen so far was valid and complete.
     */
    bool finalize()
    {
        if (state || surpair)
            is_valid = false;
        return is_valid;
    }

private:
    std::string &str;
    bool is_valid;
    // Current UTF-8 decoding state
    unsigned int codepoint;
    unsigned int min_codepoint; // Smallest value the sequence being decoded may legally encode
    int state; // Top bit to be filled in for next UTF-8 byte, or 0

    // Keep track of the following state to handle the following section of
    // RFC4627:
    //
    //    To escape an extended character that is not in the Basic Multilingual
    //    Plane, the character is represented as a twelve-character sequence,
    //    encoding the UTF-16 surrogate pair.  So, for example, a string
    //    containing only the G clef character (U+1D11E) may be represented as
    //    "\uD834\uDD1E".
    //
    //  Two subsequent \u.... may have to be replaced with one actual codepoint.
    unsigned int surpair; // First half of open UTF-16 surrogate pair, or 0

    // Append the UTF-8 encoding of codepoint_ to the output string.
    // Callers only pass values up to 0x10FFFF; larger values emit nothing.
    void append_codepoint(unsigned int codepoint_)
    {
        if (codepoint_ <= 0x7f) {
            str.push_back(static_cast<char>(codepoint_));
        } else if (codepoint_ <= 0x7FF) {
            str.push_back(static_cast<char>(0xC0 | (codepoint_ >> 6)));
            str.push_back(static_cast<char>(0x80 | (codepoint_ & 0x3F)));
        } else if (codepoint_ <= 0xFFFF) {
            str.push_back(static_cast<char>(0xE0 | (codepoint_ >> 12)));
            str.push_back(static_cast<char>(0x80 | ((codepoint_ >> 6) & 0x3F)));
            str.push_back(static_cast<char>(0x80 | (codepoint_ & 0x3F)));
        } else if (codepoint_ <= 0x10FFFF) {
            str.push_back(static_cast<char>(0xF0 | (codepoint_ >> 18)));
            str.push_back(static_cast<char>(0x80 | ((codepoint_ >> 12) & 0x3F)));
            str.push_back(static_cast<char>(0x80 | ((codepoint_ >> 6) & 0x3F)));
            str.push_back(static_cast<char>(0x80 | (codepoint_ & 0x3F)));
        }
    }
};

#endif