From 1d055261b4144dbf86b2658437015b15d4dd9bff Mon Sep 17 00:00:00 2001 From: Richard Date: Sun, 4 Sep 2022 00:32:56 +0100 Subject: initial --- include/jsoncons/unicode_traits.hpp | 1330 +++++++++++++++++++++++++++++++++++ 1 file changed, 1330 insertions(+) create mode 100644 include/jsoncons/unicode_traits.hpp (limited to 'include/jsoncons/unicode_traits.hpp') diff --git a/include/jsoncons/unicode_traits.hpp b/include/jsoncons/unicode_traits.hpp new file mode 100644 index 0000000..f45bafe --- /dev/null +++ b/include/jsoncons/unicode_traits.hpp @@ -0,0 +1,1330 @@ +// Copyright 2016 Daniel Parker +// Distributed under the Boost license, Version 1.0. +// (See accompanying file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +// See https://github.com/danielaparker/unicode_traits for latest version + +/* + * Includes code derived from Unicode, Inc decomposition code in ConvertUTF.h and ConvertUTF.c + * http://www.unicode.org/ + * + * "Unicode, Inc. hereby grants the right to freely use the information + * supplied in this file in the creation of products supporting the + * Unicode Standard." +*/ + +#ifndef JSONCONS_UNICODE_TRAITS_HPP +#define JSONCONS_UNICODE_TRAITS_HPP + +#include +#include +#include +#include +#include +#include +#include +#include + +namespace jsoncons { namespace unicode_traits { + + enum class encoding_kind {undetected,utf8,utf16le,utf16be,utf32le,utf32be}; + + inline + std::string to_string(encoding_kind encoding) + { + switch (encoding) + { + case encoding_kind::utf8: + return "utf8"; + case encoding_kind::utf16le: + return "utf16le"; + case encoding_kind::utf16be: + return "utf16be"; + case encoding_kind::utf32le: + return "utf32le"; + case encoding_kind::utf32be: + return "utf32be"; + default: + return "undetected"; + } + } + + template + struct detect_encoding_result + { + const Byte* ptr; + encoding_kind encoding; + }; + + template + typename std::enable_if::value,detect_encoding_result>::type + detect_encoding_from_bom(const CharT* data, std::size_t length) + { + const uint8_t bom_utf8[] = {0xef,0xbb,0xbf}; + const uint8_t bom_utf16le[] = {0xff,0xfe}; + const uint8_t bom_utf16be[] = {0xfe,0xff}; + const uint8_t bom_utf32le[] = {0xff,0xfe,0x00,0x00}; + const uint8_t bom_utf32be[] = {0x00,0x00,0xfe,0xff}; + + if (length >= 4 && !memcmp(data,bom_utf32le,4)) + { + return detect_encoding_result{data+4,encoding_kind::utf32le}; + } + else if (length >= 4 && !memcmp(data,bom_utf32be,4)) + { + return detect_encoding_result{data+4,encoding_kind::utf32be}; + } + else if (length >= 2 && !memcmp(data,bom_utf16le,2)) + { + return detect_encoding_result{data+2,encoding_kind::utf16le}; + } + else if (length >= 2 && !memcmp(data,bom_utf16be,2)) + { + return detect_encoding_result{data+2,encoding_kind::utf16be}; + } + else if (length >= 3 && !memcmp(data,bom_utf8,3)) + { + return detect_encoding_result{data+3,encoding_kind::utf8}; + } + else + { + return detect_encoding_result{data,encoding_kind::undetected}; + } + } + + template + typename std::enable_if::value || type_traits::is_char32::value,detect_encoding_result>::type + detect_encoding_from_bom(const CharT* data, std::size_t) + { + return detect_encoding_result{data,encoding_kind::undetected}; + } + + template + typename std::enable_if::value,detect_encoding_result>::type + detect_json_encoding(const CharT* data, std::size_t length) + { + detect_encoding_result r = detect_encoding_from_bom(data,length); + if (r.encoding != encoding_kind::undetected) + { + return r; + } + else if (length < 4) + { + return detect_encoding_result{data,encoding_kind::utf8}; + } + else if (*data == 0 && *(data+1) == 0 && *(data+2) == 0) + { + return detect_encoding_result{data,encoding_kind::utf32be}; + } + else if (*data == 0 && *(data+2) == 0) + { + return detect_encoding_result{data,encoding_kind::utf16be}; + } + else if (*(data+1) == 0 && *(data+2) == 0 && *(data+3) == 0) + { + return detect_encoding_result{data,encoding_kind::utf32le}; + } + else if (*(data+1) == 0 && *(data+3) == 0) + { + return detect_encoding_result{data,encoding_kind::utf16le}; + } + else + { + return detect_encoding_result{data,encoding_kind::utf8}; + } + } + + template + typename std::enable_if::value || type_traits::is_char32::value,detect_encoding_result>::type + detect_json_encoding(const CharT* data, std::size_t) + { + return detect_encoding_result{data,encoding_kind::undetected}; + } + + /* + * Magic values subtracted from a buffer value during UTF8 conversion. + * This table contains as many values as there might be trailing bytes + * in a UTF-8 sequence. Source: ConvertUTF.c + */ + const uint32_t offsets_from_utf8[6] = { 0x00000000UL, 0x00003080UL, 0x000E2080UL, + 0x03C82080UL, 0xFA082080UL, 0x82082080UL }; + + /* + * Once the bits are split out into bytes of UTF-8, this is a mask OR-ed + * into the first byte, depending on how many bytes follow. There are + * as many entries in this table as there are UTF-8 sequence types. + * (I.e., one byte sequence, two byte... etc.). Remember that sequencs + * for *legal* UTF-8 will be 4 or fewer bytes total. Source: ConvertUTF.c + */ + const uint8_t first_byte_mark[7] = { 0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC }; + + /* + * Index into the table below with the first byte of a UTF-8 sequence to + * get the number of trailing bytes that are supposed to follow it. + * Note that *legal* UTF-8 values can't have 4 or 5-bytes. The table is + * left as-is for anyone who may want to do such conversion, which was + * allowed in earlier algorithms. Source: ConvertUTF.c + */ + const uint8_t trailing_bytes_for_utf8[256] = { + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, + 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5 + }; + + // Some fundamental constants. Source: ConvertUTF.h + const uint32_t replacement_char = 0x0000FFFD; + const uint32_t max_bmp = 0x0000FFFF; + const uint32_t max_utf16 = 0x0010FFFF; + const uint32_t max_utf32 = 0x7FFFFFFF; + const uint32_t max_legal_utf32 = 0x0010FFFF; + + const int half_shift = 10; // used for shifting by 10 bits + const uint32_t half_base = 0x0010000UL; + const uint32_t half_mask = 0x3FFUL; + + const uint16_t sur_high_start = 0xD800; + const uint16_t sur_high_end = 0xDBFF; + const uint16_t sur_low_start = 0xDC00; + const uint16_t sur_low_end = 0xDFFF; + + inline + static bool is_continuation_byte(unsigned char ch) + { + return (ch & 0xC0) == 0x80; + } + + inline + bool is_high_surrogate(uint32_t ch) noexcept + { + return (ch >= sur_high_start && ch <= sur_high_end); + } + + inline + bool is_low_surrogate(uint32_t ch) noexcept + { + return (ch >= sur_low_start && ch <= sur_low_end); + } + + inline + bool is_surrogate(uint32_t ch) noexcept + { + return (ch >= sur_high_start && ch <= sur_low_end); + } + + enum class conv_flags + { + strict = 0, + lenient + }; + + // conv_errc + + enum class conv_errc + { + success = 0, + over_long_utf8_sequence = 1, // over long utf8 sequence + expected_continuation_byte, // expected continuation byte + unpaired_high_surrogate, // unpaired high surrogate UTF-16 + illegal_surrogate_value, // UTF-16 surrogate values are illegal in UTF-32 + source_exhausted, // partial character in source, but hit end + source_illegal // source sequence is illegal/malformed + }; + + class Unicode_traits_error_category_impl_ + : public std::error_category + { + public: + virtual const char* name() const noexcept + { + return "unicode_traits conversion error"; + } + virtual std::string message(int ev) const + { + switch (static_cast(ev)) + { + case conv_errc::over_long_utf8_sequence: + return "Over long utf8 sequence"; + case conv_errc::expected_continuation_byte: + return "Expected continuation byte"; + case conv_errc::unpaired_high_surrogate: + return "Unpaired high surrogate UTF-16"; + case conv_errc::illegal_surrogate_value: + return "UTF-16 surrogate values are illegal in UTF-32"; + case conv_errc::source_exhausted: + return "Partial character in source, but hit end"; + case conv_errc::source_illegal: + return "Source sequence is illegal/malformed"; + default: + return ""; + break; + } + } + }; + + inline + const std::error_category& unicode_traits_error_category() + { + static Unicode_traits_error_category_impl_ instance; + return instance; + } + + inline + std::error_code make_error_code(conv_errc result) + { + return std::error_code(static_cast(result),unicode_traits_error_category()); + } + +} // unicode_traits +} // jsoncons + +namespace std { + template<> + struct is_error_code_enum : public true_type + { + }; +} + +namespace jsoncons { namespace unicode_traits { + + // utf8 + + template + typename std::enable_if::value, conv_errc>::type + is_legal_utf8(const CharT* first, std::size_t length) + { + uint8_t a; + const CharT* srcptr = first+length; + switch (length) { + default: + return conv_errc::over_long_utf8_sequence; + case 4: + if (((a = (*--srcptr))& 0xC0) != 0x80) + return conv_errc::expected_continuation_byte; + JSONCONS_FALLTHROUGH; + case 3: + if (((a = (*--srcptr))& 0xC0) != 0x80) + return conv_errc::expected_continuation_byte; + JSONCONS_FALLTHROUGH; + case 2: + if (((a = (*--srcptr))& 0xC0) != 0x80) + return conv_errc::expected_continuation_byte; + + switch (static_cast(*first)) + { + // no fall-through in this inner switch + case 0xE0: if (a < 0xA0) return conv_errc::source_illegal; break; + case 0xED: if (a > 0x9F) return conv_errc::source_illegal; break; + case 0xF0: if (a < 0x90) return conv_errc::source_illegal; break; + case 0xF4: if (a > 0x8F) return conv_errc::source_illegal; break; + default: if (a < 0x80) return conv_errc::source_illegal; + } + + JSONCONS_FALLTHROUGH; + case 1: + if (static_cast(*first) >= 0x80 && static_cast(*first) < 0xC2) + return conv_errc::source_illegal; + break; + } + if (static_cast(*first) > 0xF4) + return conv_errc::source_illegal; + + return conv_errc(); + } + + template using void_t = void; + + template + struct is_output_iterator : std::false_type {}; + + template + struct is_output_iterator::iterator_category, + decltype(*std::declval() = std::declval())>> : std::true_type {}; + + // is_same_size fixes issue with vs2013 + + // primary template + template + struct is_same_size : std::false_type + { + }; + + // specialization for non void types + template + struct is_same_size::value && !std::is_void::value>::type> + { + static constexpr bool value = (sizeof(T1) == sizeof(T2)); + }; + + // convert + + template + struct convert_result + { + const CharT* ptr; + conv_errc ec; + }; + + // to_codepoint + + template + typename std::enable_if::value && type_traits::is_char32::value, + convert_result>::type + to_codepoint(const CharT* first, const CharT* last, + CodepointT& ch, + conv_flags flags = conv_flags::strict) noexcept + { + ch = 0; + if (first >= last) + { + return convert_result{first, conv_errc::source_exhausted}; + } + conv_errc result = conv_errc(); + + unsigned short extra_bytes_to_read = trailing_bytes_for_utf8[static_cast(*first)]; + if (extra_bytes_to_read >= last - first) + { + result = conv_errc::source_exhausted; + return convert_result{first, result}; + } + // Do this check whether lenient or strict + if ((result=is_legal_utf8(first, extra_bytes_to_read+1)) != conv_errc()) + { + return convert_result{first, result}; + } + // The cases all fall through. See "Note A" below. + switch (extra_bytes_to_read) + { + case 5: + ch += static_cast(*first++); + ch <<= 6; + JSONCONS_FALLTHROUGH; + case 4: + ch += static_cast(*first++); + ch <<= 6; + JSONCONS_FALLTHROUGH; + case 3: + ch += static_cast(*first++); + ch <<= 6; + JSONCONS_FALLTHROUGH; + case 2: + ch += static_cast(*first++); + ch <<= 6; + JSONCONS_FALLTHROUGH; + case 1: + ch += static_cast(*first++); + ch <<= 6; + JSONCONS_FALLTHROUGH; + case 0: + ch += static_cast(*first++); + break; + } + ch -= offsets_from_utf8[extra_bytes_to_read]; + + if (ch <= max_legal_utf32) { + /* + * UTF-16 surrogate values are illegal in UTF-32, and anything + * over Plane 17 (> 0x10FFFF) is illegal. + */ + if (is_surrogate(ch) ) + { + if (flags == conv_flags::strict) + { + first -= (extra_bytes_to_read+1); // return to the illegal value itself + result = conv_errc::source_illegal; + return convert_result{first, result}; + } + else + { + ch = replacement_char; + } + } + } + else // i.e., ch > max_legal_utf32 + { + result = conv_errc::source_illegal; + ch = replacement_char; + } + + return convert_result{first,result} ; + } + + template + typename std::enable_if::value && type_traits::is_char32::value, + convert_result>::type + to_codepoint(const CharT* first, const CharT* last, + CodepointT& ch, + conv_flags flags = conv_flags::strict) noexcept + { + ch = 0; + if (first >= last) + { + return convert_result{first, conv_errc::source_exhausted}; + } + conv_errc result = conv_errc(); + + ch = *first++; + // If we have a surrogate pair, convert to UTF32 first. + if (is_high_surrogate(ch)) + { + // If the 16 bits following the high surrogate are in the first buffer... + if (first < last) + { + uint32_t ch2 = *first; + // If ptr's a low surrogate, convert to UTF32. + if (ch2 >= sur_low_start && ch2 <= sur_low_end ) + { + ch = ((ch - sur_high_start) << half_shift) + + (ch2 - sur_low_start) + half_base; + ++first; + } + else if (flags == conv_flags::strict) // ptr's an unpaired high surrogate + { + --first; /* return to the illegal value itself */ + result = conv_errc::source_illegal; + return convert_result{first, result}; + } + } + else + { /* We don't have the 16 bits following the high surrogate. */ + --first; /* return to the high surrogate */ + result = conv_errc::source_exhausted; + return convert_result{first, result}; + } + } else if (flags == conv_flags::strict) { + /* UTF-16 surrogate values are illegal in UTF-32 */ + if (is_low_surrogate(ch) ) + { + --first; /* return to the illegal value itself */ + result = conv_errc::source_illegal; + return convert_result{first, result}; + } + } + + return convert_result{first,result} ; + } + + template + typename std::enable_if::value && type_traits::is_char32::value, + convert_result>::type + to_codepoint(const CharT* first, const CharT* last, + CodepointT& ch, + conv_flags flags = conv_flags::strict) noexcept + { + ch = 0; + if (first >= last) + { + return convert_result{first, conv_errc::source_exhausted}; + } + conv_errc result = conv_errc(); + + ch = *first++; + if (flags == conv_flags::strict ) + { + /* UTF-16 surrogate values are illegal in UTF-32 */ + if (is_surrogate(ch)) + { + --first; /* return to the illegal value itself */ + result = conv_errc::illegal_surrogate_value; + return convert_result{first,result} ; + } + } + if (!(ch <= max_legal_utf32)) + { + ch = replacement_char; + result = conv_errc::source_illegal; + } + + return convert_result{first,result} ; + } + + // convert + + template + typename std::enable_if::value + && type_traits::is_back_insertable::value + && type_traits::is_char8::value, + convert_result>::type + convert(const CharT* data, std::size_t length, Container& target, conv_flags flags=conv_flags::strict) + { + (void)flags; + + conv_errc result = conv_errc(); + const CharT* last = data + length; + while (data != last) + { + std::size_t len = trailing_bytes_for_utf8[static_cast(*data)] + 1; + if (len > (std::size_t)(last - data)) + { + return convert_result{data, conv_errc::source_exhausted}; + } + if ((result=is_legal_utf8(data, len)) != conv_errc()) + { + return convert_result{data,result}; + } + + switch (len) { + case 4: target.push_back(static_cast(*data++)); + JSONCONS_FALLTHROUGH; + case 3: target.push_back(static_cast(*data++)); + JSONCONS_FALLTHROUGH; + case 2: target.push_back(static_cast(*data++)); + JSONCONS_FALLTHROUGH; + case 1: target.push_back(static_cast(*data++)); + } + } + return convert_result{data,result} ; + } + + template + typename std::enable_if::value + && type_traits::is_back_insertable::value + && type_traits::is_char16::value, + convert_result>::type + convert(const CharT* data, std::size_t length, + Container& target, + conv_flags flags = conv_flags::strict) + { + conv_errc result = conv_errc(); + + const CharT* last = data + length; + while (data != last) + { + unsigned short extra_bytes_to_read = trailing_bytes_for_utf8[static_cast(*data)]; + if (extra_bytes_to_read >= last - data) + { + result = conv_errc::source_exhausted; + break; + } + /* Do this check whether lenient or strict */ + if ((result=is_legal_utf8(data, extra_bytes_to_read+1)) != conv_errc()) + { + break; + } + /* + * The cases all fall through. See "Note A" below. + */ + uint32_t ch = 0; + switch (extra_bytes_to_read) { + case 5: ch += static_cast(*data++); ch <<= 6; /* remember, illegal UTF-8 */ + JSONCONS_FALLTHROUGH; + case 4: ch += static_cast(*data++); ch <<= 6; /* remember, illegal UTF-8 */ + JSONCONS_FALLTHROUGH; + case 3: ch += static_cast(*data++); ch <<= 6; + JSONCONS_FALLTHROUGH; + case 2: ch += static_cast(*data++); ch <<= 6; + JSONCONS_FALLTHROUGH; + case 1: ch += static_cast(*data++); ch <<= 6; + JSONCONS_FALLTHROUGH; + case 0: ch += static_cast(*data++); + break; + } + ch -= offsets_from_utf8[extra_bytes_to_read]; + + if (ch <= max_bmp) { /* Target is a character <= 0xFFFF */ + /* UTF-16 surrogate values are illegal in UTF-32 */ + if (is_surrogate(ch) ) + { + if (flags == conv_flags::strict) { + data -= (extra_bytes_to_read+1); /* return to the illegal value itself */ + result = conv_errc::source_illegal; + break; + } else { + target.push_back(replacement_char); + } + } else { + target.push_back((uint16_t)ch); /* normal case */ + } + } else if (ch > max_utf16) { + if (flags == conv_flags::strict) { + result = conv_errc::source_illegal; + data -= (extra_bytes_to_read+1); /* return to the start */ + break; /* Bail out; shouldn't continue */ + } else { + target.push_back(replacement_char); + } + } else { + /* target is a character in range 0xFFFF - 0x10FFFF. */ + ch -= half_base; + target.push_back((uint16_t)((ch >> half_shift) + sur_high_start)); + target.push_back((uint16_t)((ch & half_mask) + sur_low_start)); + } + } + return convert_result{data,result} ; + } + + template + typename std::enable_if::value + && type_traits::is_back_insertable::value + && type_traits::is_char32::value, + convert_result>::type + convert(const CharT* data, std::size_t length, + Container& target, + conv_flags flags = conv_flags::strict) + { + conv_errc result = conv_errc(); + + const CharT* last = data + length; + while (data < last) + { + uint32_t ch = 0; + unsigned short extra_bytes_to_read = trailing_bytes_for_utf8[static_cast(*data)]; + if (extra_bytes_to_read >= last - data) + { + result = conv_errc::source_exhausted; + break; + } + /* Do this check whether lenient or strict */ + if ((result=is_legal_utf8(data, extra_bytes_to_read+1)) != conv_errc()) + { + break; + } + /* + * The cases all fall through. See "Note A" below. + */ + switch (extra_bytes_to_read) + { + case 5: + ch += static_cast(*data++); + ch <<= 6; + JSONCONS_FALLTHROUGH; + case 4: + ch += static_cast(*data++); + ch <<= 6; + JSONCONS_FALLTHROUGH; + case 3: + ch += static_cast(*data++); + ch <<= 6; + JSONCONS_FALLTHROUGH; + case 2: + ch += static_cast(*data++); + ch <<= 6; + JSONCONS_FALLTHROUGH; + case 1: + ch += static_cast(*data++); + ch <<= 6; + JSONCONS_FALLTHROUGH; + case 0: + ch += static_cast(*data++); + break; + } + ch -= offsets_from_utf8[extra_bytes_to_read]; + + if (ch <= max_legal_utf32) { + /* + * UTF-16 surrogate values are illegal in UTF-32, and anything + * over Plane 17 (> 0x10FFFF) is illegal. + */ + if (is_surrogate(ch) ) + { + if (flags == conv_flags::strict) { + data -= (extra_bytes_to_read+1); /* return to the illegal value itself */ + result = conv_errc::source_illegal; + break; + } else { + target.push_back(replacement_char); + } + } else { + target.push_back(ch); + } + } else { /* i.e., ch > max_legal_utf32 */ + result = conv_errc::source_illegal; + target.push_back(replacement_char); + } + } + return convert_result{data,result} ; + } + + // utf16 + + template + typename std::enable_if::value + && type_traits::is_back_insertable::value + && type_traits::is_char8::value, + convert_result>::type + convert(const CharT* data, std::size_t length, + Container& target, + conv_flags flags = conv_flags::strict) { + conv_errc result = conv_errc(); + + const CharT* last = data + length; + while (data < last) { + unsigned short bytes_to_write = 0; + const uint32_t byteMask = 0xBF; + const uint32_t byteMark = 0x80; + uint32_t ch = *data++; + /* If we have a surrogate pair, convert to uint32_t data. */ + if (is_high_surrogate(ch)) + { + /* If the 16 bits following the high surrogate are in the data buffer... */ + if (data < last) { + uint32_t ch2 = *data; + /* If ptr's a low surrogate, convert to uint32_t. */ + if (ch2 >= sur_low_start && ch2 <= sur_low_end) { + ch = ((ch - sur_high_start) << half_shift) + + (ch2 - sur_low_start) + half_base; + ++data; + } else if (flags == conv_flags::strict) { /* ptr's an unpaired high surrogate */ + --data; /* return to the illegal value itself */ + result = conv_errc::unpaired_high_surrogate; + break; + } + } else { /* We don't have the 16 bits following the high surrogate. */ + --data; /* return to the high surrogate */ + result = conv_errc::source_exhausted; + break; + } + } else if (flags == conv_flags::strict) { + /* UTF-16 surrogate values are illegal in UTF-32 */ + if (is_low_surrogate(ch)) + { + --data; /* return to the illegal value itself */ + result = conv_errc::source_illegal; + break; + } + } + /* Figure out how many bytes the result will require */ + if (ch < (uint32_t)0x80) { + bytes_to_write = 1; + } else if (ch < (uint32_t)0x800) { + bytes_to_write = 2; + } else if (ch < (uint32_t)0x10000) { + bytes_to_write = 3; + } else if (ch < (uint32_t)0x110000) { + bytes_to_write = 4; + } else { + bytes_to_write = 3; + ch = replacement_char; + } + + uint8_t byte1 = 0; + uint8_t byte2 = 0; + uint8_t byte3 = 0; + uint8_t byte4 = 0; + + switch (bytes_to_write) { // note: everything falls through + case 4: byte4 = (uint8_t)((ch | byteMark) & byteMask); ch >>= 6; + JSONCONS_FALLTHROUGH; + case 3: byte3 = (uint8_t)((ch | byteMark) & byteMask); ch >>= 6; + JSONCONS_FALLTHROUGH; + case 2: byte2 = (uint8_t)((ch | byteMark) & byteMask); ch >>= 6; + JSONCONS_FALLTHROUGH; + case 1: byte1 = (uint8_t)(ch | first_byte_mark[bytes_to_write]); + break; + } + switch (bytes_to_write) + { + case 4: + target.push_back(byte1); + target.push_back(byte2); + target.push_back(byte3); + target.push_back(byte4); + break; + case 3: + target.push_back(byte1); + target.push_back(byte2); + target.push_back(byte3); + break; + case 2: + target.push_back(byte1); + target.push_back(byte2); + break; + case 1: + target.push_back(byte1); + break; + } + } + return convert_result{data,result} ; + } + + template + typename std::enable_if::value + && type_traits::is_back_insertable::value + && type_traits::is_char16::value, + convert_result>::type + convert(const CharT* data, std::size_t length, + Container& target, + conv_flags flags = conv_flags::strict) + { + conv_errc result = conv_errc(); + + const CharT* last = data + length; + while (data != last) + { + uint32_t ch = *data++; + /* If we have a surrogate pair, convert to uint32_t data. */ + if (is_high_surrogate(ch)) + { + /* If the 16 bits following the high surrogate are in the data buffer... */ + if (data < last) { + uint32_t ch2 = *data; + /* If ptr's a low surrogate, */ + if (ch2 >= sur_low_start && ch2 <= sur_low_end) { + target.push_back((uint16_t)ch); + target.push_back((uint16_t)ch2); + ++data; + } else if (flags == conv_flags::strict) { /* ptr's an unpaired high surrogate */ + --data; /* return to the illegal value itself */ + result = conv_errc::unpaired_high_surrogate; + break; + } + } else { /* We don't have the 16 bits following the high surrogate. */ + --data; /* return to the high surrogate */ + result = conv_errc::source_exhausted; + break; + } + } else if (is_low_surrogate(ch)) + { + // illegal leading low surrogate + if (flags == conv_flags::strict) { + --data; /* return to the illegal value itself */ + result = conv_errc::source_illegal; + break; + } + else + { + target.push_back((uint16_t)ch); + } + } + else + { + target.push_back((uint16_t)ch); + } + } + return convert_result{data,result} ; + } + + template + typename std::enable_if::value + && type_traits::is_back_insertable::value + && type_traits::is_char32::value, + convert_result>::type + convert(const CharT* data, std::size_t length, + Container& target, + conv_flags flags = conv_flags::strict) + { + conv_errc result = conv_errc(); + + const CharT* last = data + length; + while (data != last) + { + uint32_t ch = *data++; + /* If we have a surrogate pair, convert to UTF32 data. */ + if (is_high_surrogate(ch)) + { + /* If the 16 bits following the high surrogate are in the data buffer... */ + if (data < last) { + uint32_t ch2 = *data; + /* If ptr's a low surrogate, convert to UTF32. */ + if (ch2 >= sur_low_start && ch2 <= sur_low_end ) + { + ch = ((ch - sur_high_start) << half_shift) + + (ch2 - sur_low_start) + half_base; + ++data; + } else if (flags == conv_flags::strict) { /* ptr's an unpaired high surrogate */ + --data; /* return to the illegal value itself */ + result = conv_errc::source_illegal; + break; + } + } else { /* We don't have the 16 bits following the high surrogate. */ + --data; /* return to the high surrogate */ + result = conv_errc::source_exhausted; + break; + } + } else if (flags == conv_flags::strict) { + /* UTF-16 surrogate values are illegal in UTF-32 */ + if (is_low_surrogate(ch) ) + { + --data; /* return to the illegal value itself */ + result = conv_errc::source_illegal; + break; + } + } + target.push_back(ch); + } + return convert_result{data,result} ; + } + + // utf32 + + template + typename std::enable_if::value + && type_traits::is_back_insertable::value + && type_traits::is_char8::value, + convert_result>::type + convert(const CharT* data, std::size_t length, + Container& target, + conv_flags flags = conv_flags::strict) + { + conv_errc result = conv_errc(); + const CharT* last = data + length; + while (data < last) + { + unsigned short bytes_to_write = 0; + const uint32_t byteMask = 0xBF; + const uint32_t byteMark = 0x80; + uint32_t ch = *data++; + if (flags == conv_flags::strict ) + { + /* UTF-16 surrogate values are illegal in UTF-32 */ + if (is_surrogate(ch)) + { + --data; /* return to the illegal value itself */ + result = conv_errc::illegal_surrogate_value; + break; + } + } + /* + * Figure out how many bytes the result will require. Turn any + * illegally large UTF32 things (> Plane 17) into replacement chars. + */ + if (ch < (uint32_t)0x80) { bytes_to_write = 1; + } else if (ch < (uint32_t)0x800) { bytes_to_write = 2; + } else if (ch < (uint32_t)0x10000) { bytes_to_write = 3; + } else if (ch <= max_legal_utf32) { bytes_to_write = 4; + } else { + bytes_to_write = 3; + ch = replacement_char; + result = conv_errc::source_illegal; + } + + uint8_t byte1 = 0; + uint8_t byte2 = 0; + uint8_t byte3 = 0; + uint8_t byte4 = 0; + + switch (bytes_to_write) { + case 4: + byte4 = (uint8_t)((ch | byteMark) & byteMask); ch >>= 6; + JSONCONS_FALLTHROUGH; + case 3: + byte3 = (uint8_t)((ch | byteMark) & byteMask); ch >>= 6; + JSONCONS_FALLTHROUGH; + case 2: + byte2 = (uint8_t)((ch | byteMark) & byteMask); ch >>= 6; + JSONCONS_FALLTHROUGH; + case 1: + byte1 = (uint8_t) (ch | first_byte_mark[bytes_to_write]); + break; + } + + switch (bytes_to_write) + { + case 4: + target.push_back(byte1); + target.push_back(byte2); + target.push_back(byte3); + target.push_back(byte4); + break; + case 3: + target.push_back(byte1); + target.push_back(byte2); + target.push_back(byte3); + break; + case 2: + target.push_back(byte1); + target.push_back(byte2); + break; + case 1: + target.push_back(byte1); + break; + } + } + return convert_result{data,result} ; + } + + template + typename std::enable_if::value + && type_traits::is_back_insertable::value + && type_traits::is_char16::value, + convert_result>::type + convert(const CharT* data, std::size_t length, + Container& target, + conv_flags flags = conv_flags::strict) + { + conv_errc result = conv_errc(); + + const CharT* last = data + length; + while (data != last) + { + uint32_t ch = *data++; + if (ch <= max_bmp) { /* Target is a character <= 0xFFFF */ + /* UTF-16 surrogate values are illegal in UTF-32; 0xffff or 0xfffe are both reserved values */ + if (is_surrogate(ch) ) + { + if (flags == conv_flags::strict) { + --data; /* return to the illegal value itself */ + result = conv_errc::source_illegal; + break; + } else { + target.push_back(replacement_char); + } + } else { + target.push_back((uint16_t)ch); /* normal case */ + } + } else if (ch > max_legal_utf32) { + if (flags == conv_flags::strict) { + result = conv_errc::source_illegal; + } else { + target.push_back(replacement_char); + } + } else { + /* target is a character in range 0xFFFF - 0x10FFFF. */ + ch -= half_base; + target.push_back((uint16_t)((ch >> half_shift) + sur_high_start)); + target.push_back((uint16_t)((ch & half_mask) + sur_low_start)); + } + } + return convert_result{data,result} ; + } + + template + typename std::enable_if::value + && type_traits::is_back_insertable::value + && type_traits::is_char32::value, + convert_result>::type + convert(const CharT* data, std::size_t length, + Container& target, + conv_flags flags = conv_flags::strict) + { + conv_errc result = conv_errc(); + + const CharT* last = data + length; + while (data != last) + { + uint32_t ch = *data++; + if (flags == conv_flags::strict ) + { + /* UTF-16 surrogate values are illegal in UTF-32 */ + if (is_surrogate(ch)) + { + --data; /* return to the illegal value itself */ + result = conv_errc::illegal_surrogate_value; + break; + } + } + if (ch <= max_legal_utf32) + { + target.push_back(ch); + } + else + { + target.push_back(replacement_char); + result = conv_errc::source_illegal; + } + } + return convert_result{data,result} ; + } + + // validate + + template + typename std::enable_if::value, + convert_result>::type + validate(const CharT* data, std::size_t length) noexcept + { + conv_errc result = conv_errc(); + const CharT* last = data + length; + while (data != last) + { + std::size_t len = static_cast(trailing_bytes_for_utf8[static_cast(*data)]) + 1; + if (len > (std::size_t)(last - data)) + { + return convert_result{data, conv_errc::source_exhausted}; + } + if ((result=is_legal_utf8(data, len)) != conv_errc()) + { + return convert_result{data,result} ; + } + data += len; + } + return convert_result{data,result} ; + } + + // utf16 + + template + typename std::enable_if::value, + convert_result>::type + validate(const CharT* data, std::size_t length) noexcept + { + conv_errc result = conv_errc(); + + const CharT* last = data + length; + while (data != last) + { + uint32_t ch = *data++; + /* If we have a surrogate pair, validate to uint32_t data. */ + if (is_high_surrogate(ch)) + { + /* If the 16 bits following the high surrogate are in the data buffer... */ + if (data < last) { + uint32_t ch2 = *data; + /* If ptr's a low surrogate, */ + if (ch2 >= sur_low_start && ch2 <= sur_low_end) { + ++data; + } else { + --data; /* return to the illegal value itself */ + result = conv_errc::unpaired_high_surrogate; + break; + } + } + else // We don't have the 16 bits following the high surrogate. + { + --data; /* return to the high surrogate */ + result = conv_errc::source_exhausted; + break; + } + } + else if (is_low_surrogate(ch)) + { + /* UTF-16 surrogate values are illegal in UTF-32 */ + --data; /* return to the illegal value itself */ + result = conv_errc::source_illegal; + break; + } + } + return convert_result{data,result} ; + } + + // utf32 + + template + typename std::enable_if::value, + convert_result>::type + validate(const CharT* data, std::size_t length) noexcept + { + conv_errc result = conv_errc(); + + const CharT* last = data + length; + while (data != last) + { + uint32_t ch = *data++; + /* UTF-16 surrogate values are illegal in UTF-32 */ + if (is_surrogate(ch)) + { + --data; /* return to the illegal value itself */ + result = conv_errc::illegal_surrogate_value; + break; + } + if (!(ch <= max_legal_utf32)) + { + result = conv_errc::source_illegal; + } + } + return convert_result{data, result} ; + } + + enum class encoding {u8,u16le,u16be,u32le,u32be,undetected}; + + template + struct determine_encoding_result + { + Iterator it; + encoding ec; + }; + + template + typename std::enable_if::value_type>::value && sizeof(typename std::iterator_traits::value_type) == sizeof(uint8_t), + determine_encoding_result>::type + detect_encoding(Iterator first, Iterator last) noexcept + { + Iterator it1 = first; + if (std::distance(first,last) < 4) + { + if (std::distance(first,last) == 3) + { + Iterator it2 = ++first; + Iterator it3 = ++first; + if (static_cast(*it1) == 0xEF && static_cast(*it2) == 0xBB && static_cast(*it3) == 0xBF) + { + return determine_encoding_result{last,encoding::u8}; + } + } + return determine_encoding_result{it1,encoding::undetected}; + } + else + { + Iterator it2 = ++first; + Iterator it3 = ++first; + Iterator it4 = ++first; + + uint32_t bom = static_cast(*it1) | (static_cast(*it2) << 8) | (static_cast(*it3) << 16) | (static_cast(*it4) << 24); + if (bom == 0xFFFE0000) + { + return determine_encoding_result{it4++,encoding::u32be}; + } + else if (bom == 0x0000FEFF) + { + return determine_encoding_result{first,encoding::u32le}; + } + else if ((bom & 0xFFFF) == 0xFFFE) + { + return determine_encoding_result{it3,encoding::u16be}; + } + else if ((bom & 0xFFFF) == 0xFEFF) + { + return determine_encoding_result{it3,encoding::u16le}; + } + else if ((bom & 0xFFFFFF) == 0xBFBBEF) + { + return determine_encoding_result{it4,encoding::u8}; + } + else + { + uint32_t pattern = (static_cast(*it1) ? 1 : 0) | (static_cast(*it2) ? 2 : 0) | (static_cast(*it3) ? 4 : 0) | (static_cast(*it4) ? 8 : 0); + switch (pattern) { + case 0x08: + return determine_encoding_result{it1,encoding::u32be}; + case 0x0A: + return determine_encoding_result{it1,encoding::u16be}; + case 0x01: + return determine_encoding_result{it1,encoding::u32le}; + case 0x05: + return determine_encoding_result{it1,encoding::u16le}; + case 0x0F: + return determine_encoding_result{it1,encoding::u8}; + default: + return determine_encoding_result{it1,encoding::undetected}; + } + } + } + } + + // count_codepoints + + template + typename std::enable_if::value || type_traits::is_char16::value || type_traits::is_char32::value, std::size_t>::type + count_codepoints(const CharT* data, std::size_t length, + conv_flags flags = conv_flags::strict) noexcept + { + conv_errc ec = conv_errc(); + + std::size_t count = 0; + const CharT* ptr = data; + const CharT* last = data + length; + + for (; ptr < last; ++count) + { + uint32_t cp = 0; + auto r = to_codepoint(ptr, last, cp, flags); + if (r.ec != conv_errc()) + { + ec = r.ec; + break; + } + ptr = r.ptr; + } + return ec == conv_errc() && ptr == last ? count : 0; + } + +} // unicode_traits +} // jsoncons + +#endif + -- cgit v1.2.3