aboutsummaryrefslogtreecommitdiff
path: root/include/jsoncons/unicode_traits.hpp
diff options
context:
space:
mode:
Diffstat (limited to 'include/jsoncons/unicode_traits.hpp')
-rw-r--r--include/jsoncons/unicode_traits.hpp1330
1 files changed, 1330 insertions, 0 deletions
diff --git a/include/jsoncons/unicode_traits.hpp b/include/jsoncons/unicode_traits.hpp
new file mode 100644
index 0000000..f45bafe
--- /dev/null
+++ b/include/jsoncons/unicode_traits.hpp
@@ -0,0 +1,1330 @@
+// Copyright 2016 Daniel Parker
+// Distributed under the Boost license, Version 1.0.
+// (See accompanying file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+// See https://github.com/danielaparker/unicode_traits for latest version
+
+/*
+ * Includes code derived from Unicode, Inc decomposition code in ConvertUTF.h and ConvertUTF.c
+ * http://www.unicode.org/
+ *
+ * "Unicode, Inc. hereby grants the right to freely use the information
+ * supplied in this file in the creation of products supporting the
+ * Unicode Standard."
+*/
+
+#ifndef JSONCONS_UNICODE_TRAITS_HPP
+#define JSONCONS_UNICODE_TRAITS_HPP
+
+#include <cstring>
+#include <string>
+#include <iterator>
+#include <type_traits>
+#include <system_error>
+#include <limits>
+#include <jsoncons/config/compiler_support.hpp>
+#include <jsoncons/more_type_traits.hpp>
+
+namespace jsoncons { namespace unicode_traits {
+
+    // Encodings that the detection helpers in this header can report.
+    // `undetected` is the value used when no encoding could be determined.
+    enum class encoding_kind
+    {
+        undetected,
+        utf8,
+        utf16le,
+        utf16be,
+        utf32le,
+        utf32be
+    };
+
+    // Returns the textual name of an encoding_kind value.
+    // Any value other than the five named encodings (that is,
+    // encoding_kind::undetected or an out-of-range value) yields "undetected".
+    inline
+    std::string to_string(encoding_kind encoding)
+    {
+        if (encoding == encoding_kind::utf8)
+        {
+            return "utf8";
+        }
+        if (encoding == encoding_kind::utf16le)
+        {
+            return "utf16le";
+        }
+        if (encoding == encoding_kind::utf16be)
+        {
+            return "utf16be";
+        }
+        if (encoding == encoding_kind::utf32le)
+        {
+            return "utf32le";
+        }
+        if (encoding == encoding_kind::utf32be)
+        {
+            return "utf32be";
+        }
+        return "undetected";
+    }
+
+ // Result of an encoding-detection call.
+ // ptr      - position in the input returned by the detector (just past a
+ //            recognised byte order mark in detect_encoding_from_bom,
+ //            otherwise the unchanged input pointer)
+ // encoding - the detected encoding, or encoding_kind::undetected
+ template <class Byte>
+ struct detect_encoding_result
+ {
+ const Byte* ptr;
+ encoding_kind encoding;
+ };
+
+    // Looks for a Unicode byte order mark (BOM) at the start of an 8-bit buffer.
+    //
+    // data   - pointer to the first byte of the input
+    // length - number of bytes available at `data`
+    //
+    // Returns the encoding implied by the BOM together with a pointer just
+    // past it. When no BOM is recognised the result is
+    // {data, encoding_kind::undetected}.
+    template <class CharT>
+    typename std::enable_if<type_traits::is_char8<CharT>::value,detect_encoding_result<CharT>>::type
+    detect_encoding_from_bom(const CharT* data, std::size_t length)
+    {
+        const uint8_t bom_utf8[] = {0xef,0xbb,0xbf};
+        const uint8_t bom_utf16le[] = {0xff,0xfe};
+        const uint8_t bom_utf16be[] = {0xfe,0xff};
+        const uint8_t bom_utf32le[] = {0xff,0xfe,0x00,0x00};
+        const uint8_t bom_utf32be[] = {0x00,0x00,0xfe,0xff};
+
+        // The UTF-32 marks are tested first: the UTF-16LE mark (FF FE) is a
+        // prefix of the UTF-32LE mark (FF FE 00 00).
+        // <cstring> is only guaranteed to declare std::memcmp, so the call is
+        // qualified rather than relying on a global-namespace memcmp.
+        if (length >= 4 && std::memcmp(data,bom_utf32le,4) == 0)
+        {
+            return detect_encoding_result<CharT>{data+4,encoding_kind::utf32le};
+        }
+        if (length >= 4 && std::memcmp(data,bom_utf32be,4) == 0)
+        {
+            return detect_encoding_result<CharT>{data+4,encoding_kind::utf32be};
+        }
+        if (length >= 2 && std::memcmp(data,bom_utf16le,2) == 0)
+        {
+            return detect_encoding_result<CharT>{data+2,encoding_kind::utf16le};
+        }
+        if (length >= 2 && std::memcmp(data,bom_utf16be,2) == 0)
+        {
+            return detect_encoding_result<CharT>{data+2,encoding_kind::utf16be};
+        }
+        if (length >= 3 && std::memcmp(data,bom_utf8,3) == 0)
+        {
+            return detect_encoding_result<CharT>{data+3,encoding_kind::utf8};
+        }
+        return detect_encoding_result<CharT>{data,encoding_kind::undetected};
+    }
+
+    // Overload for 16- and 32-bit code units. No BOM inspection is performed:
+    // the input pointer is handed back unchanged with
+    // encoding_kind::undetected. The length argument is ignored.
+    template <class CharT>
+    typename std::enable_if<type_traits::is_char16<CharT>::value || type_traits::is_char32<CharT>::value,detect_encoding_result<CharT>>::type
+    detect_encoding_from_bom(const CharT* data, std::size_t)
+    {
+        detect_encoding_result<CharT> outcome = {data,encoding_kind::undetected};
+        return outcome;
+    }
+
+    // Guesses the encoding of JSON text held in an 8-bit buffer.
+    //
+    // A byte order mark, when present, decides the answer, and the returned
+    // pointer is positioned after it. Otherwise the pattern of zero bytes in
+    // the first four bytes is inspected. Inputs shorter than four bytes, and
+    // inputs matching none of the patterns, are reported as UTF-8. Without a
+    // BOM the returned pointer is always `data`.
+    template <class CharT>
+    typename std::enable_if<type_traits::is_char8<CharT>::value,detect_encoding_result<CharT>>::type
+    detect_json_encoding(const CharT* data, std::size_t length)
+    {
+        const detect_encoding_result<CharT> from_bom = detect_encoding_from_bom(data,length);
+        if (from_bom.encoding != encoding_kind::undetected)
+        {
+            return from_bom;
+        }
+
+        encoding_kind guess = encoding_kind::utf8;
+        if (length >= 4)
+        {
+            const bool zero0 = (data[0] == 0);
+            const bool zero1 = (data[1] == 0);
+            const bool zero2 = (data[2] == 0);
+            const bool zero3 = (data[3] == 0);
+
+            if (zero0 && zero1 && zero2)
+            {
+                guess = encoding_kind::utf32be;     // 00 00 00 xx
+            }
+            else if (zero0 && zero2)
+            {
+                guess = encoding_kind::utf16be;     // 00 xx 00 xx
+            }
+            else if (zero1 && zero2 && zero3)
+            {
+                guess = encoding_kind::utf32le;     // xx 00 00 00
+            }
+            else if (zero1 && zero3)
+            {
+                guess = encoding_kind::utf16le;     // xx 00 xx 00
+            }
+        }
+        return detect_encoding_result<CharT>{data,guess};
+    }
+
+    // Overload for 16- and 32-bit code units. No detection is attempted:
+    // the input pointer is handed back unchanged with
+    // encoding_kind::undetected. The length argument is ignored.
+    template <class CharT>
+    typename std::enable_if<type_traits::is_char16<CharT>::value || type_traits::is_char32<CharT>::value,detect_encoding_result<CharT>>::type
+    detect_json_encoding(const CharT* data, std::size_t)
+    {
+        detect_encoding_result<CharT> outcome = {data,encoding_kind::undetected};
+        return outcome;
+    }
+
+ /*
+ * Magic values subtracted from a buffer value during UTF8 conversion.
+ * This table contains as many values as there might be trailing bytes
+ * in a UTF-8 sequence. Source: ConvertUTF.c
+ *
+ * The table is indexed by the number of trailing bytes. Each entry is the
+ * sum of the lead-byte marker and the 0x80 continuation markers as they end
+ * up after the decoders' shift-by-6 accumulation, for example
+ * 0x00003080 == (0xC0 << 6) + 0x80 for a two-byte sequence.
+ */
+ const uint32_t offsets_from_utf8[6] = { 0x00000000UL, 0x00003080UL, 0x000E2080UL,
+ 0x03C82080UL, 0xFA082080UL, 0x82082080UL };
+
+ /*
+ * Once the bits are split out into bytes of UTF-8, this is a mask OR-ed
+ * into the first byte, depending on how many bytes follow. There are
+ * as many entries in this table as there are UTF-8 sequence types.
+ * (I.e., one byte sequence, two byte... etc.). Remember that sequences
+ * for *legal* UTF-8 will be 4 or fewer bytes total. Source: ConvertUTF.c
+ *
+ * The encoders in this file index the table with the total number of
+ * bytes being written (1 to 4); entry 0 is unused by them.
+ */
+ const uint8_t first_byte_mark[7] = { 0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC };
+
+ /*
+ * Index into the table below with the first byte of a UTF-8 sequence to
+ * get the number of trailing bytes that are supposed to follow it.
+ * Note that *legal* UTF-8 sequences can't have 4 or 5 trailing bytes. The
+ * table is left as-is for anyone who may want to do such conversion, which
+ * was allowed in earlier algorithms. Source: ConvertUTF.c
+ *
+ * Layout (32 entries per row): 0x00-0xBF -> 0, 0xC0-0xDF -> 1,
+ * 0xE0-0xEF -> 2, 0xF0-0xF7 -> 3, 0xF8-0xFB -> 4, 0xFC-0xFF -> 5.
+ */
+ const uint8_t trailing_bytes_for_utf8[256] = {
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+ 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5
+ };
+
+ // Some fundamental constants. Source: ConvertUTF.h
+ // replacement_char is what the lenient conversions emit in place of an
+ // illegal value; the max_* constants are the upper bounds the converters
+ // compare decoded values against.
+ const uint32_t replacement_char = 0x0000FFFD;
+ const uint32_t max_bmp = 0x0000FFFF;
+ const uint32_t max_utf16 = 0x0010FFFF;
+ const uint32_t max_utf32 = 0x7FFFFFFF;
+ const uint32_t max_legal_utf32 = 0x0010FFFF;
+
+ // Constants used to combine a surrogate pair into one value and to split a
+ // value above max_bmp back into a pair: ten payload bits per surrogate,
+ // offset by half_base.
+ const int half_shift = 10; // used for shifting by 10 bits
+ const uint32_t half_base = 0x0010000UL;
+ const uint32_t half_mask = 0x3FFUL;
+
+ // Inclusive bounds of the high (leading) and low (trailing) surrogate ranges.
+ const uint16_t sur_high_start = 0xD800;
+ const uint16_t sur_high_end = 0xDBFF;
+ const uint16_t sur_low_start = 0xDC00;
+ const uint16_t sur_low_end = 0xDFFF;
+
+    // Returns true when `ch` has the bit pattern 10xxxxxx, i.e. it is a UTF-8
+    // continuation (trailing) byte.
+    //
+    // Declared plain `inline` (not `static`) and `noexcept`, matching the
+    // surrogate predicates that follow it in this header.
+    inline
+    bool is_continuation_byte(unsigned char ch) noexcept
+    {
+        return (ch & 0xC0) == 0x80;
+    }
+
+    // True when `ch` lies in the high (leading) surrogate range
+    // [sur_high_start, sur_high_end].
+    inline
+    bool is_high_surrogate(uint32_t ch) noexcept
+    {
+        if (ch < sur_high_start)
+        {
+            return false;
+        }
+        return ch <= sur_high_end;
+    }
+
+    // True when `ch` lies in the low (trailing) surrogate range
+    // [sur_low_start, sur_low_end].
+    inline
+    bool is_low_surrogate(uint32_t ch) noexcept
+    {
+        if (ch < sur_low_start)
+        {
+            return false;
+        }
+        return ch <= sur_low_end;
+    }
+
+    // True when `ch` is any surrogate value, high or low, i.e. it lies in
+    // [sur_high_start, sur_low_end].
+    inline
+    bool is_surrogate(uint32_t ch) noexcept
+    {
+        if (ch < sur_high_start)
+        {
+            return false;
+        }
+        return ch <= sur_low_end;
+    }
+
+    // Conversion mode. In `strict` mode the converters stop and report an
+    // error on illegal surrogate values; `lenient` lets them continue.
+    enum class conv_flags
+    {
+        strict = 0,
+        lenient = 1
+    };
+
+ // conv_errc
+
+    // Error codes reported by the validation and conversion functions.
+    // A value-initialised conv_errc() equals conv_errc::success.
+    enum class conv_errc
+    {
+        success = 0,
+        over_long_utf8_sequence = 1,    // over long utf8 sequence
+        expected_continuation_byte = 2, // expected continuation byte
+        unpaired_high_surrogate = 3,    // unpaired high surrogate UTF-16
+        illegal_surrogate_value = 4,    // UTF-16 surrogate values are illegal in UTF-32
+        source_exhausted = 5,           // partial character in source, but hit end
+        source_illegal = 6              // source sequence is illegal/malformed
+    };
+
+    // std::error_category implementation that supplies the category name and
+    // the human-readable messages for conv_errc values.
+    class Unicode_traits_error_category_impl_
+       : public std::error_category
+    {
+    public:
+        // Name identifying this category.
+        // `override` makes the compiler verify the signature against
+        // std::error_category::name.
+        const char* name() const noexcept override
+        {
+            return "unicode_traits conversion error";
+        }
+
+        // Message for the conv_errc value encoded in `ev`. Values without a
+        // dedicated message, including conv_errc::success, yield an empty
+        // string.
+        std::string message(int ev) const override
+        {
+            switch (static_cast<conv_errc>(ev))
+            {
+                case conv_errc::over_long_utf8_sequence:
+                    return "Over long utf8 sequence";
+                case conv_errc::expected_continuation_byte:
+                    return "Expected continuation byte";
+                case conv_errc::unpaired_high_surrogate:
+                    return "Unpaired high surrogate UTF-16";
+                case conv_errc::illegal_surrogate_value:
+                    return "UTF-16 surrogate values are illegal in UTF-32";
+                case conv_errc::source_exhausted:
+                    return "Partial character in source, but hit end";
+                case conv_errc::source_illegal:
+                    return "Source sequence is illegal/malformed";
+                default:
+                    return "";
+            }
+        }
+    };
+
+    // Returns the single error_category object used for conv_errc codes.
+    // The object is a function-local static, so every call yields a
+    // reference to the same instance.
+    inline
+    const std::error_category& unicode_traits_error_category()
+    {
+        static Unicode_traits_error_category_impl_ the_category;
+        return the_category;
+    }
+
+    // Wraps a conv_errc value in a std::error_code bound to
+    // unicode_traits_error_category().
+    inline
+    std::error_code make_error_code(conv_errc result)
+    {
+        const int value = static_cast<int>(result);
+        return std::error_code(value,unicode_traits_error_category());
+    }
+
+} // unicode_traits
+} // jsoncons
+
+namespace std {
+    // Registers conv_errc as an error code enumeration, which enables its
+    // implicit conversion to std::error_code.
+    template <>
+    struct is_error_code_enum<jsoncons::unicode_traits::conv_errc> : true_type {};
+}
+
+namespace jsoncons { namespace unicode_traits {
+
+ // utf8
+
+    // Checks that the `length` bytes starting at `first` form one legal UTF-8
+    // sequence.
+    //
+    // first  - pointer to the lead byte of the sequence
+    // length - total number of bytes in the sequence (lead byte included)
+    //
+    // Returns conv_errc() (success) for a legal sequence, otherwise:
+    //   over_long_utf8_sequence    - length is 0 or greater than 4
+    //   expected_continuation_byte - a trailing byte is not of the form 10xxxxxx
+    //   source_illegal             - the second byte is outside the range
+    //                                allowed for the lead byte, or the lead
+    //                                byte is 0x80..0xC1 or above 0xF4
+    //
+    // The function only reads bytes and compares them, so it is declared
+    // noexcept; it is called from noexcept functions in this header.
+    template <class CharT>
+    typename std::enable_if<type_traits::is_char8<CharT>::value, conv_errc>::type
+    is_legal_utf8(const CharT* first, std::size_t length) noexcept
+    {
+        uint8_t a = 0;
+        const CharT* srcptr = first+length;
+        // The bytes are examined from the last one backwards; `a` ends up
+        // holding the second byte of the sequence when length >= 2.
+        switch (length) {
+        default:
+            return conv_errc::over_long_utf8_sequence;
+        case 4:
+            if (((a = (*--srcptr))& 0xC0) != 0x80)
+                return conv_errc::expected_continuation_byte;
+            JSONCONS_FALLTHROUGH;
+        case 3:
+            if (((a = (*--srcptr))& 0xC0) != 0x80)
+                return conv_errc::expected_continuation_byte;
+            JSONCONS_FALLTHROUGH;
+        case 2:
+            if (((a = (*--srcptr))& 0xC0) != 0x80)
+                return conv_errc::expected_continuation_byte;
+
+            // Restrict the second byte for the lead bytes whose full range
+            // would admit overlong forms, surrogates or values > 0x10FFFF.
+            switch (static_cast<uint8_t>(*first))
+            {
+                // no fall-through in this inner switch
+                case 0xE0: if (a < 0xA0) return conv_errc::source_illegal; break;
+                case 0xED: if (a > 0x9F) return conv_errc::source_illegal; break;
+                case 0xF0: if (a < 0x90) return conv_errc::source_illegal; break;
+                case 0xF4: if (a > 0x8F) return conv_errc::source_illegal; break;
+                default: if (a < 0x80) return conv_errc::source_illegal;
+            }
+
+            JSONCONS_FALLTHROUGH;
+        case 1:
+            // 0x80..0xBF are continuation bytes; 0xC0 and 0xC1 are rejected too.
+            if (static_cast<uint8_t>(*first) >= 0x80 && static_cast<uint8_t>(*first) < 0xC2)
+                return conv_errc::source_illegal;
+            break;
+        }
+        if (static_cast<uint8_t>(*first) > 0xF4)
+            return conv_errc::source_illegal;
+
+        return conv_errc();
+    }
+
+    // Helper for void_t. Routing the parameter pack through a class template
+    // guarantees that every argument is instantiated, so an ill-formed
+    // argument removes a partial specialization (SFINAE). The direct form
+    // `template <class...> using void_t = void;` is not guaranteed to do so
+    // on compilers that predate the resolution of CWG issue 1558.
+    template <class... Ts>
+    struct make_void
+    {
+        typedef void type;
+    };
+
+    // Maps any list of well-formed types to void.
+    template <class... Ts> using void_t = typename make_void<Ts...>::type;
+
+    // is_output_iterator<I, E>::value is true when
+    // std::iterator_traits<I>::iterator_category exists and the expression
+    // `*it = e` is well-formed for an `it` of type I and an `e` of type E;
+    // otherwise it is false.
+    template <class, class, class = void>
+    struct is_output_iterator : std::false_type {};
+
+    template <class I, class E>
+    struct is_output_iterator<I, E, void_t<
+        typename std::iterator_traits<I>::iterator_category,
+        decltype(*std::declval<I>() = std::declval<E>())>> : std::true_type {};
+
+    // is_same_size fixes issue with vs2013
+    //
+    // is_same_size<T1, T2>::value is true when both types are non-void and
+    // sizeof(T1) == sizeof(T2). If either type is void the primary template
+    // is selected and the result is false, so sizeof is never applied to void.
+
+    // primary template
+    template<class T1, class T2, class Enable = void>
+    struct is_same_size : std::false_type
+    {
+    };
+
+    // specialization for non void types
+    // Derives from std::integral_constant, like the primary template does via
+    // std::false_type, so both forms expose the same members (value, type,
+    // value_type, conversion to bool).
+    template<class T1, class T2>
+    struct is_same_size<T1, T2, typename std::enable_if<!std::is_void<T1>::value && !std::is_void<T2>::value>::type>
+        : std::integral_constant<bool, (sizeof(T1) == sizeof(T2))>
+    {
+    };
+
+ // convert
+
+ // Result of a to_codepoint / convert / validate call.
+ // ptr - position in the source where processing stopped: the end of the
+ //       consumed input on success, or the position the function reports
+ //       for the error
+ // ec  - conv_errc() (success) or the error that was detected
+ template <class CharT>
+ struct convert_result
+ {
+ const CharT* ptr;
+ conv_errc ec;
+ };
+
+ // to_codepoint
+
+    // Decodes one code point from the UTF-8 range [first, last).
+    //
+    // first, last - source range of 8-bit code units
+    // ch          - receives the decoded value; it is reset to 0 on entry
+    // flags       - strict: a decoded surrogate value is an error;
+    //               lenient: it is replaced by replacement_char
+    //
+    // Returns the position after the decoded sequence together with the
+    // status. When the range is empty or ends inside a sequence the status is
+    // source_exhausted; when is_legal_utf8 rejects the sequence its error is
+    // returned. In both cases the returned pointer is `first`.
+    template <class CharT,class CodepointT>
+    typename std::enable_if<type_traits::is_char8<CharT>::value && type_traits::is_char32<CodepointT>::value,
+                            convert_result<CharT>>::type
+    to_codepoint(const CharT* first, const CharT* last,
+                 CodepointT& ch,
+                 conv_flags flags = conv_flags::strict) noexcept
+    {
+        ch = 0;
+        if (first >= last)
+        {
+            return convert_result<CharT>{first, conv_errc::source_exhausted};
+        }
+
+        const unsigned short trailing = trailing_bytes_for_utf8[static_cast<uint8_t>(*first)];
+        if (trailing >= last - first)
+        {
+            return convert_result<CharT>{first, conv_errc::source_exhausted};
+        }
+
+        // The legality check is performed in both strict and lenient mode.
+        const conv_errc legality = is_legal_utf8(first, trailing+1);
+        if (legality != conv_errc())
+        {
+            return convert_result<CharT>{first, legality};
+        }
+
+        // Accumulate the raw bytes six bits at a time; the marker bits are
+        // removed afterwards in one subtraction.
+        for (unsigned short pending = trailing; pending > 0; --pending)
+        {
+            ch += static_cast<uint8_t>(*first++);
+            ch <<= 6;
+        }
+        ch += static_cast<uint8_t>(*first++);
+        ch -= offsets_from_utf8[trailing];
+
+        if (!(ch <= max_legal_utf32))
+        {
+            // Anything over Plane 17 (> 0x10FFFF) is illegal.
+            ch = replacement_char;
+            return convert_result<CharT>{first, conv_errc::source_illegal};
+        }
+
+        // UTF-16 surrogate values are illegal in UTF-32.
+        if (is_surrogate(ch))
+        {
+            if (flags == conv_flags::strict)
+            {
+                first -= (trailing+1); // return to the illegal value itself
+                return convert_result<CharT>{first, conv_errc::source_illegal};
+            }
+            ch = replacement_char;
+        }
+
+        return convert_result<CharT>{first, conv_errc()};
+    }
+
+    // Decodes one code point from the UTF-16 range [first, last).
+    //
+    // first, last - source range of 16-bit code units
+    // ch          - receives the decoded value; it is reset to 0 on entry
+    // flags       - strict: unpaired surrogates are reported as
+    //               source_illegal; lenient: they are returned as-is in `ch`
+    //
+    // Returns the position after the consumed unit(s) and the status. On an
+    // error the returned pointer is the unit that caused it. An empty range,
+    // or a high surrogate that is the last unit of the range, yields
+    // source_exhausted.
+    template <class CharT,class CodepointT>
+    typename std::enable_if<type_traits::is_char16<CharT>::value && type_traits::is_char32<CodepointT>::value,
+                            convert_result<CharT>>::type
+    to_codepoint(const CharT* first, const CharT* last,
+                 CodepointT& ch,
+                 conv_flags flags = conv_flags::strict) noexcept
+    {
+        ch = 0;
+        if (first >= last)
+        {
+            return convert_result<CharT>{first, conv_errc::source_exhausted};
+        }
+
+        const CharT* const unit_start = first;
+        ch = *first++;
+
+        if (!is_high_surrogate(ch))
+        {
+            // A lone low surrogate is only rejected in strict mode.
+            if (flags == conv_flags::strict && is_low_surrogate(ch))
+            {
+                return convert_result<CharT>{unit_start, conv_errc::source_illegal};
+            }
+            return convert_result<CharT>{first, conv_errc()};
+        }
+
+        // A high surrogate needs the unit that follows it.
+        if (!(first < last))
+        {
+            return convert_result<CharT>{unit_start, conv_errc::source_exhausted};
+        }
+
+        const uint32_t ch2 = *first;
+        if (ch2 >= sur_low_start && ch2 <= sur_low_end)
+        {
+            // Valid pair: combine both halves into one value.
+            ch = ((ch - sur_high_start) << half_shift)
+                 + (ch2 - sur_low_start) + half_base;
+            ++first;
+        }
+        else if (flags == conv_flags::strict)
+        {
+            // Unpaired high surrogate.
+            return convert_result<CharT>{unit_start, conv_errc::source_illegal};
+        }
+
+        return convert_result<CharT>{first, conv_errc()};
+    }
+
+    // Reads one code point from the UTF-32 range [first, last).
+    //
+    // first, last - source range of 32-bit code units
+    // ch          - receives the value read; it is reset to 0 on entry
+    // flags       - strict: a surrogate value is reported as
+    //               illegal_surrogate_value and the returned pointer stays on it
+    //
+    // A value above max_legal_utf32 is replaced by replacement_char and
+    // reported as source_illegal, with the returned pointer past the unit.
+    // An empty range yields source_exhausted.
+    template <class CharT,class CodepointT>
+    typename std::enable_if<type_traits::is_char32<CharT>::value && type_traits::is_char32<CodepointT>::value,
+                            convert_result<CharT>>::type
+    to_codepoint(const CharT* first, const CharT* last,
+                 CodepointT& ch,
+                 conv_flags flags = conv_flags::strict) noexcept
+    {
+        ch = 0;
+        if (first >= last)
+        {
+            return convert_result<CharT>{first, conv_errc::source_exhausted};
+        }
+
+        const CharT* const unit_start = first;
+        ch = *first++;
+
+        // UTF-16 surrogate values are illegal in UTF-32.
+        if (flags == conv_flags::strict && is_surrogate(ch))
+        {
+            return convert_result<CharT>{unit_start, conv_errc::illegal_surrogate_value};
+        }
+
+        if (ch <= max_legal_utf32)
+        {
+            return convert_result<CharT>{first, conv_errc()};
+        }
+
+        ch = replacement_char;
+        return convert_result<CharT>{first, conv_errc::source_illegal};
+    }
+
+ // convert
+
+    // UTF-8 to UTF-8: validates the source and appends it to `target`
+    // sequence by sequence.
+    //
+    // data, length - source buffer of 8-bit code units
+    // target       - back-insertable container of 8-bit code units
+    // flags        - accepted for symmetry with the other overloads; unused
+    //
+    // Returns {end of source, success} when everything was copied. On the
+    // first truncated or illegal sequence it returns the start of that
+    // sequence together with the error; sequences before it have already
+    // been appended.
+    template <class CharT,class Container>
+    typename std::enable_if<type_traits::is_char8<CharT>::value
+                            && type_traits::is_back_insertable<Container>::value
+                            && type_traits::is_char8<typename Container::value_type>::value,
+                            convert_result<CharT>>::type
+    convert(const CharT* data, std::size_t length, Container& target, conv_flags flags=conv_flags::strict)
+    {
+        (void)flags;
+
+        conv_errc status = conv_errc();
+        const CharT* const last = data + length;
+        while (data != last)
+        {
+            const std::size_t seq_len = trailing_bytes_for_utf8[static_cast<uint8_t>(*data)] + 1;
+            if (seq_len > (std::size_t)(last - data))
+            {
+                return convert_result<CharT>{data, conv_errc::source_exhausted};
+            }
+            status = is_legal_utf8(data, seq_len);
+            if (status != conv_errc())
+            {
+                return convert_result<CharT>{data,status};
+            }
+
+            // is_legal_utf8 only accepts lengths 1..4, so this copies the
+            // whole sequence.
+            for (std::size_t i = 0; i < seq_len; ++i)
+            {
+                target.push_back(static_cast<uint8_t>(*data++));
+            }
+        }
+        return convert_result<CharT>{data,status} ;
+    }
+
+    // UTF-8 to UTF-16: decodes each source sequence and appends one or two
+    // 16-bit units to `target`.
+    //
+    // data, length - source buffer of 8-bit code units
+    // target       - back-insertable container of 16-bit code units
+    // flags        - strict: a decoded surrogate or a value above max_utf16
+    //                stops the conversion with source_illegal;
+    //                lenient: replacement_char is appended instead
+    //
+    // Returns the position where conversion stopped (the start of the
+    // offending sequence on error, otherwise the end of the source) and the
+    // status.
+    template <class CharT,class Container>
+    typename std::enable_if<type_traits::is_char8<CharT>::value
+                            && type_traits::is_back_insertable<Container>::value
+                            && type_traits::is_char16<typename Container::value_type>::value,
+                            convert_result<CharT>>::type
+    convert(const CharT* data, std::size_t length,
+            Container& target,
+            conv_flags flags = conv_flags::strict)
+    {
+        conv_errc status = conv_errc();
+
+        const CharT* const last = data + length;
+        while (data != last)
+        {
+            const unsigned short trailing = trailing_bytes_for_utf8[static_cast<uint8_t>(*data)];
+            if (trailing >= last - data)
+            {
+                status = conv_errc::source_exhausted;
+                break;
+            }
+            /* Do this check whether lenient or strict */
+            status = is_legal_utf8(data, trailing+1);
+            if (status != conv_errc())
+            {
+                break;
+            }
+
+            // Accumulate the raw bytes, then strip the marker bits.
+            uint32_t ch = 0;
+            for (unsigned short pending = trailing; pending > 0; --pending)
+            {
+                ch += static_cast<uint8_t>(*data++);
+                ch <<= 6;
+            }
+            ch += static_cast<uint8_t>(*data++);
+            ch -= offsets_from_utf8[trailing];
+
+            if (ch > max_utf16)
+            {
+                if (flags == conv_flags::strict)
+                {
+                    status = conv_errc::source_illegal;
+                    data -= (trailing+1); /* return to the start */
+                    break; /* Bail out; shouldn't continue */
+                }
+                target.push_back(replacement_char);
+            }
+            else if (ch > max_bmp)
+            {
+                /* target is a character in range 0xFFFF - 0x10FFFF. */
+                ch -= half_base;
+                target.push_back(static_cast<uint16_t>((ch >> half_shift) + sur_high_start));
+                target.push_back(static_cast<uint16_t>((ch & half_mask) + sur_low_start));
+            }
+            else if (is_surrogate(ch))
+            {
+                /* UTF-16 surrogate values are illegal in UTF-32 */
+                if (flags == conv_flags::strict)
+                {
+                    data -= (trailing+1); /* return to the illegal value itself */
+                    status = conv_errc::source_illegal;
+                    break;
+                }
+                target.push_back(replacement_char);
+            }
+            else
+            {
+                target.push_back(static_cast<uint16_t>(ch)); /* normal case */
+            }
+        }
+        return convert_result<CharT>{data,status} ;
+    }
+
+    // UTF-8 to UTF-32: decodes each source sequence and appends one 32-bit
+    // unit to `target`.
+    //
+    // data, length - source buffer of 8-bit code units
+    // target       - back-insertable container of 32-bit code units
+    // flags        - strict: a decoded surrogate stops the conversion with
+    //                source_illegal; lenient: replacement_char is appended
+    //
+    // A decoded value above max_legal_utf32 appends replacement_char, sets
+    // the status to source_illegal and continues with the next sequence.
+    //
+    // Returns the position where conversion stopped and the status.
+    template <class CharT,class Container>
+    typename std::enable_if<type_traits::is_char8<CharT>::value
+                            && type_traits::is_back_insertable<Container>::value
+                            && type_traits::is_char32<typename Container::value_type>::value,
+                            convert_result<CharT>>::type
+    convert(const CharT* data, std::size_t length,
+            Container& target,
+            conv_flags flags = conv_flags::strict)
+    {
+        conv_errc status = conv_errc();
+
+        const CharT* const last = data + length;
+        while (data < last)
+        {
+            const unsigned short trailing = trailing_bytes_for_utf8[static_cast<uint8_t>(*data)];
+            if (trailing >= last - data)
+            {
+                status = conv_errc::source_exhausted;
+                break;
+            }
+            /* Do this check whether lenient or strict */
+            status = is_legal_utf8(data, trailing+1);
+            if (status != conv_errc())
+            {
+                break;
+            }
+
+            // Accumulate the raw bytes, then strip the marker bits.
+            uint32_t ch = 0;
+            for (unsigned short pending = trailing; pending > 0; --pending)
+            {
+                ch += static_cast<uint8_t>(*data++);
+                ch <<= 6;
+            }
+            ch += static_cast<uint8_t>(*data++);
+            ch -= offsets_from_utf8[trailing];
+
+            if (!(ch <= max_legal_utf32))
+            {
+                /* anything over Plane 17 (> 0x10FFFF) is illegal */
+                status = conv_errc::source_illegal;
+                target.push_back(replacement_char);
+                continue;
+            }
+            if (!is_surrogate(ch))
+            {
+                target.push_back(ch);
+                continue;
+            }
+
+            /* UTF-16 surrogate values are illegal in UTF-32 */
+            if (flags == conv_flags::strict)
+            {
+                data -= (trailing+1); /* return to the illegal value itself */
+                status = conv_errc::source_illegal;
+                break;
+            }
+            target.push_back(replacement_char);
+        }
+        return convert_result<CharT>{data,status} ;
+    }
+
+ // utf16
+
+    // UTF-16 to UTF-8: combines surrogate pairs and appends the 1 to 4 byte
+    // UTF-8 form of every value to `target`.
+    //
+    // data, length - source buffer of 16-bit code units
+    // target       - back-insertable container of 8-bit code units
+    // flags        - strict: an unpaired high surrogate stops the conversion
+    //                with unpaired_high_surrogate and a lone low surrogate
+    //                with source_illegal; lenient: such units are encoded
+    //                as they are
+    //
+    // A high surrogate that is the last unit of the source yields
+    // source_exhausted in either mode.
+    //
+    // Returns the position where conversion stopped (the offending unit on
+    // error, otherwise the end of the source) and the status.
+    template <class CharT,class Container>
+    typename std::enable_if<type_traits::is_char16<CharT>::value
+                            && type_traits::is_back_insertable<Container>::value
+                            && type_traits::is_char8<typename Container::value_type>::value,
+                            convert_result<CharT>>::type
+    convert(const CharT* data, std::size_t length,
+            Container& target,
+            conv_flags flags = conv_flags::strict)
+    {
+        const uint32_t continuation_mask = 0xBF;
+        const uint32_t continuation_mark = 0x80;
+
+        conv_errc status = conv_errc();
+
+        const CharT* const last = data + length;
+        while (data < last)
+        {
+            const CharT* const unit_start = data;
+            uint32_t ch = *data++;
+
+            if (is_high_surrogate(ch))
+            {
+                if (!(data < last))
+                {
+                    /* We don't have the 16 bits following the high surrogate. */
+                    data = unit_start;
+                    status = conv_errc::source_exhausted;
+                    break;
+                }
+                const uint32_t ch2 = *data;
+                if (ch2 >= sur_low_start && ch2 <= sur_low_end)
+                {
+                    /* Valid pair: convert to a single value. */
+                    ch = ((ch - sur_high_start) << half_shift)
+                         + (ch2 - sur_low_start) + half_base;
+                    ++data;
+                }
+                else if (flags == conv_flags::strict)
+                {
+                    /* unpaired high surrogate */
+                    data = unit_start;
+                    status = conv_errc::unpaired_high_surrogate;
+                    break;
+                }
+            }
+            else if (flags == conv_flags::strict && is_low_surrogate(ch))
+            {
+                /* UTF-16 surrogate values are illegal in UTF-32 */
+                data = unit_start;
+                status = conv_errc::source_illegal;
+                break;
+            }
+
+            /* Figure out how many bytes the result will require */
+            unsigned short bytes_to_write = 0;
+            if (ch < static_cast<uint32_t>(0x80))
+            {
+                bytes_to_write = 1;
+            }
+            else if (ch < static_cast<uint32_t>(0x800))
+            {
+                bytes_to_write = 2;
+            }
+            else if (ch < static_cast<uint32_t>(0x10000))
+            {
+                bytes_to_write = 3;
+            }
+            else if (ch < static_cast<uint32_t>(0x110000))
+            {
+                bytes_to_write = 4;
+            }
+            else
+            {
+                bytes_to_write = 3;
+                ch = replacement_char;
+            }
+
+            // Fill the trailing bytes from the last one backwards, six bits
+            // each, then the lead byte with its length marker.
+            uint8_t encoded[4] = {0, 0, 0, 0};
+            for (unsigned short pos = bytes_to_write; pos > 1; --pos)
+            {
+                encoded[pos-1] = static_cast<uint8_t>((ch | continuation_mark) & continuation_mask);
+                ch >>= 6;
+            }
+            encoded[0] = static_cast<uint8_t>(ch | first_byte_mark[bytes_to_write]);
+
+            for (unsigned short pos = 0; pos < bytes_to_write; ++pos)
+            {
+                target.push_back(encoded[pos]);
+            }
+        }
+        return convert_result<CharT>{data,status} ;
+    }
+
+    // UTF-16 to UTF-16: validates surrogate pairing while copying the source
+    // to `target`.
+    //
+    // data, length - source buffer of 16-bit code units
+    // target       - back-insertable container of 16-bit code units
+    // flags        - strict: an unpaired high surrogate stops the conversion
+    //                with unpaired_high_surrogate and a lone low surrogate
+    //                with source_illegal; lenient: unpaired surrogates are
+    //                copied through unchanged
+    //
+    // A high surrogate that is the last unit of the source yields
+    // source_exhausted in either mode.
+    //
+    // Returns the position where conversion stopped (the offending unit on
+    // error, otherwise the end of the source) and the status.
+    template <class CharT,class Container>
+    typename std::enable_if<type_traits::is_char16<CharT>::value
+                            && type_traits::is_back_insertable<Container>::value
+                            && type_traits::is_char16<typename Container::value_type>::value,
+                            convert_result<CharT>>::type
+    convert(const CharT* data, std::size_t length,
+            Container& target,
+            conv_flags flags = conv_flags::strict)
+    {
+        conv_errc result = conv_errc();
+
+        const CharT* last = data + length;
+        while (data != last)
+        {
+            uint32_t ch = *data++;
+            if (is_high_surrogate(ch))
+            {
+                /* If the 16 bits following the high surrogate are in the data buffer... */
+                if (data < last) {
+                    uint32_t ch2 = *data;
+                    /* If it's a low surrogate, copy the pair */
+                    if (ch2 >= sur_low_start && ch2 <= sur_low_end) {
+                        target.push_back((uint16_t)ch);
+                        target.push_back((uint16_t)ch2);
+                        ++data;
+                    } else if (flags == conv_flags::strict) { /* it's an unpaired high surrogate */
+                        --data; /* return to the illegal value itself */
+                        result = conv_errc::unpaired_high_surrogate;
+                        break;
+                    } else {
+                        // Lenient mode: keep the unpaired high surrogate, the
+                        // same way a lone low surrogate is kept below.
+                        // Previously the unit was silently dropped from the
+                        // output. The following unit is handled by the next
+                        // iteration.
+                        target.push_back((uint16_t)ch);
+                    }
+                } else { /* We don't have the 16 bits following the high surrogate. */
+                    --data; /* return to the high surrogate */
+                    result = conv_errc::source_exhausted;
+                    break;
+                }
+            } else if (is_low_surrogate(ch))
+            {
+                // illegal leading low surrogate
+                if (flags == conv_flags::strict) {
+                    --data; /* return to the illegal value itself */
+                    result = conv_errc::source_illegal;
+                    break;
+                }
+                else
+                {
+                    target.push_back((uint16_t)ch);
+                }
+            }
+            else
+            {
+                target.push_back((uint16_t)ch);
+            }
+        }
+        return convert_result<CharT>{data,result} ;
+    }
+
+    // UTF-16 to UTF-32: combines surrogate pairs and appends one 32-bit unit
+    // per value to `target`.
+    //
+    // data, length - source buffer of 16-bit code units
+    // target       - back-insertable container of 32-bit code units
+    // flags        - strict: an unpaired high surrogate or a lone low
+    //                surrogate stops the conversion with source_illegal;
+    //                lenient: such units are appended as they are
+    //
+    // A high surrogate that is the last unit of the source yields
+    // source_exhausted in either mode.
+    //
+    // Returns the position where conversion stopped (the offending unit on
+    // error, otherwise the end of the source) and the status.
+    template <class CharT,class Container>
+    typename std::enable_if<type_traits::is_char16<CharT>::value
+                            && type_traits::is_back_insertable<Container>::value
+                            && type_traits::is_char32<typename Container::value_type>::value,
+                            convert_result<CharT>>::type
+    convert(const CharT* data, std::size_t length,
+            Container& target,
+            conv_flags flags = conv_flags::strict)
+    {
+        conv_errc status = conv_errc();
+
+        const CharT* const last = data + length;
+        while (data != last)
+        {
+            const CharT* const unit_start = data;
+            uint32_t ch = *data++;
+
+            if (is_high_surrogate(ch))
+            {
+                if (!(data < last))
+                {
+                    /* We don't have the 16 bits following the high surrogate. */
+                    data = unit_start;
+                    status = conv_errc::source_exhausted;
+                    break;
+                }
+                const uint32_t ch2 = *data;
+                if (ch2 >= sur_low_start && ch2 <= sur_low_end)
+                {
+                    /* Valid pair: convert to UTF32. */
+                    ch = ((ch - sur_high_start) << half_shift)
+                         + (ch2 - sur_low_start) + half_base;
+                    ++data;
+                }
+                else if (flags == conv_flags::strict)
+                {
+                    /* unpaired high surrogate */
+                    data = unit_start;
+                    status = conv_errc::source_illegal;
+                    break;
+                }
+            }
+            else if (flags == conv_flags::strict && is_low_surrogate(ch))
+            {
+                /* UTF-16 surrogate values are illegal in UTF-32 */
+                data = unit_start;
+                status = conv_errc::source_illegal;
+                break;
+            }
+
+            target.push_back(ch);
+        }
+        return convert_result<CharT>{data,status} ;
+    }
+
+ // utf32
+
+    // UTF-32 to UTF-8: appends the 1 to 4 byte UTF-8 form of every source
+    // value to `target`.
+    //
+    // data, length - source buffer of 32-bit code units
+    // target       - back-insertable container of 8-bit code units
+    // flags        - strict: a surrogate value stops the conversion with
+    //                illegal_surrogate_value, the returned pointer staying
+    //                on it; lenient: it is encoded as it is
+    //
+    // A value above max_legal_utf32 is encoded as replacement_char and sets
+    // the status to source_illegal; conversion then continues.
+    //
+    // Returns the position where conversion stopped and the status.
+    template <class CharT,class Container>
+    typename std::enable_if<type_traits::is_char32<CharT>::value
+                            && type_traits::is_back_insertable<Container>::value
+                            && type_traits::is_char8<typename Container::value_type>::value,
+                            convert_result<CharT>>::type
+    convert(const CharT* data, std::size_t length,
+            Container& target,
+            conv_flags flags = conv_flags::strict)
+    {
+        const uint32_t continuation_mask = 0xBF;
+        const uint32_t continuation_mark = 0x80;
+
+        conv_errc status = conv_errc();
+        const CharT* const last = data + length;
+        while (data < last)
+        {
+            uint32_t ch = *data++;
+
+            /* UTF-16 surrogate values are illegal in UTF-32 */
+            if (flags == conv_flags::strict && is_surrogate(ch))
+            {
+                --data; /* return to the illegal value itself */
+                status = conv_errc::illegal_surrogate_value;
+                break;
+            }
+
+            /*
+             * Figure out how many bytes the result will require. Turn any
+             * illegally large UTF32 things (> Plane 17) into replacement chars.
+             */
+            unsigned short bytes_to_write = 0;
+            if (ch < static_cast<uint32_t>(0x80))
+            {
+                bytes_to_write = 1;
+            }
+            else if (ch < static_cast<uint32_t>(0x800))
+            {
+                bytes_to_write = 2;
+            }
+            else if (ch < static_cast<uint32_t>(0x10000))
+            {
+                bytes_to_write = 3;
+            }
+            else if (ch <= max_legal_utf32)
+            {
+                bytes_to_write = 4;
+            }
+            else
+            {
+                bytes_to_write = 3;
+                ch = replacement_char;
+                status = conv_errc::source_illegal;
+            }
+
+            // Fill the trailing bytes from the last one backwards, six bits
+            // each, then the lead byte with its length marker.
+            uint8_t encoded[4] = {0, 0, 0, 0};
+            for (unsigned short pos = bytes_to_write; pos > 1; --pos)
+            {
+                encoded[pos-1] = static_cast<uint8_t>((ch | continuation_mark) & continuation_mask);
+                ch >>= 6;
+            }
+            encoded[0] = static_cast<uint8_t>(ch | first_byte_mark[bytes_to_write]);
+
+            for (unsigned short pos = 0; pos < bytes_to_write; ++pos)
+            {
+                target.push_back(encoded[pos]);
+            }
+        }
+        return convert_result<CharT>{data,status} ;
+    }
+
+    // UTF-32 to UTF-16: appends one 16-bit unit, or a surrogate pair, per
+    // source value to `target`.
+    //
+    // data, length - source buffer of 32-bit code units
+    // target       - back-insertable container of 16-bit code units
+    // flags        - strict: a surrogate value or a value above
+    //                max_legal_utf32 stops the conversion with
+    //                source_illegal, the returned pointer staying on the
+    //                offending unit; lenient: replacement_char is appended
+    //                instead and conversion continues
+    //
+    // Returns the position where conversion stopped (the offending unit on
+    // error, otherwise the end of the source) and the status.
+    template <class CharT,class Container>
+    typename std::enable_if<type_traits::is_char32<CharT>::value
+                            && type_traits::is_back_insertable<Container>::value
+                            && type_traits::is_char16<typename Container::value_type>::value,
+                            convert_result<CharT>>::type
+    convert(const CharT* data, std::size_t length,
+            Container& target,
+            conv_flags flags = conv_flags::strict)
+    {
+        conv_errc result = conv_errc();
+
+        const CharT* last = data + length;
+        while (data != last)
+        {
+            uint32_t ch = *data++;
+            if (ch <= max_bmp) { /* Target is a character <= 0xFFFF */
+                /* UTF-16 surrogate values are illegal in UTF-32 */
+                if (is_surrogate(ch) )
+                {
+                    if (flags == conv_flags::strict) {
+                        --data; /* return to the illegal value itself */
+                        result = conv_errc::source_illegal;
+                        break;
+                    } else {
+                        target.push_back(replacement_char);
+                    }
+                } else {
+                    target.push_back((uint16_t)ch); /* normal case */
+                }
+            } else if (ch > max_legal_utf32) {
+                if (flags == conv_flags::strict) {
+                    // Stop at the offending unit, like the strict surrogate
+                    // branch above. Previously the value was dropped from
+                    // the output and conversion carried on, so the returned
+                    // pointer did not identify the error position.
+                    --data; /* return to the illegal value itself */
+                    result = conv_errc::source_illegal;
+                    break;
+                } else {
+                    target.push_back(replacement_char);
+                }
+            } else {
+                /* target is a character in range 0xFFFF - 0x10FFFF. */
+                ch -= half_base;
+                target.push_back((uint16_t)((ch >> half_shift) + sur_high_start));
+                target.push_back((uint16_t)((ch & half_mask) + sur_low_start));
+            }
+        }
+        return convert_result<CharT>{data,result} ;
+    }
+
+    // UTF-32 to UTF-32: validates the source while copying it to `target`.
+    //
+    // data, length - source buffer of 32-bit code units
+    // target       - back-insertable container of 32-bit code units
+    // flags        - strict: a surrogate value stops the conversion with
+    //                illegal_surrogate_value, the returned pointer staying
+    //                on it; lenient: it is copied through
+    //
+    // A value above max_legal_utf32 appends replacement_char and sets the
+    // status to source_illegal; conversion then continues.
+    //
+    // Returns the position where conversion stopped and the status.
+    template <class CharT,class Container>
+    typename std::enable_if<type_traits::is_char32<CharT>::value
+                            && type_traits::is_back_insertable<Container>::value
+                            && type_traits::is_char32<typename Container::value_type>::value,
+                            convert_result<CharT>>::type
+    convert(const CharT* data, std::size_t length,
+            Container& target,
+            conv_flags flags = conv_flags::strict)
+    {
+        conv_errc status = conv_errc();
+
+        const CharT* const last = data + length;
+        while (data != last)
+        {
+            const uint32_t ch = *data++;
+
+            /* UTF-16 surrogate values are illegal in UTF-32 */
+            if (flags == conv_flags::strict && is_surrogate(ch))
+            {
+                --data; /* return to the illegal value itself */
+                status = conv_errc::illegal_surrogate_value;
+                break;
+            }
+
+            if (!(ch <= max_legal_utf32))
+            {
+                target.push_back(replacement_char);
+                status = conv_errc::source_illegal;
+                continue;
+            }
+            target.push_back(ch);
+        }
+        return convert_result<CharT>{data,status} ;
+    }
+
+ // validate
+
+ /*
+  * Checks that the `length` bytes starting at `data` form well-formed UTF-8.
+  *
+  * The length of each sequence is taken from the trailing_bytes_for_utf8
+  * table entry for its first byte, plus one. Validation stops at the first
+  * sequence that is truncated by the end of input
+  * (conv_errc::source_exhausted) or rejected by is_legal_utf8 (whatever
+  * code it reports). In both cases the returned pointer refers to the
+  * first byte of that sequence. On success the pointer is data + length
+  * and the code is conv_errc().
+  */
+ template <class CharT>
+ typename std::enable_if<type_traits::is_char8<CharT>::value,
+ convert_result<CharT>>::type
+ validate(const CharT* data, std::size_t length) noexcept
+ {
+     const CharT* const last = data + length;
+     const CharT* cursor = data;
+
+     while (cursor != last)
+     {
+         const uint8_t lead = static_cast<uint8_t>(*cursor);
+         const std::size_t seq_len = static_cast<std::size_t>(trailing_bytes_for_utf8[lead]) + 1;
+         const std::size_t remaining = (std::size_t)(last - cursor);
+
+         /* Not enough input left to hold the whole sequence. */
+         if (seq_len > remaining)
+         {
+             return convert_result<CharT>{cursor, conv_errc::source_exhausted};
+         }
+
+         const conv_errc status = is_legal_utf8(cursor, seq_len);
+         if (status != conv_errc())
+         {
+             return convert_result<CharT>{cursor, status};
+         }
+
+         cursor += seq_len;
+     }
+     return convert_result<CharT>{cursor, conv_errc()};
+ }
+
+ // utf16
+
+ /*
+  * Checks that the `length` UTF-16 code units starting at `data` contain
+  * only correctly paired surrogates.
+  *
+  * Validation stops at the first problem and returns a pointer to the
+  * offending code unit:
+  *   - high surrogate that is the last unit of the input:
+  *       conv_errc::source_exhausted
+  *   - high surrogate followed by a unit outside
+  *     [sur_low_start, sur_low_end]:
+  *       conv_errc::unpaired_high_surrogate
+  *   - low surrogate with no preceding high surrogate:
+  *       conv_errc::source_illegal
+  * On success the pointer is data + length and the code is conv_errc().
+  */
+ template <class CharT>
+ typename std::enable_if<type_traits::is_char16<CharT>::value,
+ convert_result<CharT>>::type
+ validate(const CharT* data, std::size_t length) noexcept
+ {
+     const CharT* const last = data + length;
+     const CharT* cursor = data;
+
+     while (cursor != last)
+     {
+         const uint32_t unit = *cursor;
+
+         if (is_high_surrogate(unit))
+         {
+             const CharT* const next = cursor + 1;
+             if (!(next < last))
+             {
+                 /* The unit that should follow the high surrogate is missing. */
+                 return convert_result<CharT>{cursor, conv_errc::source_exhausted};
+             }
+             const uint32_t trail = *next;
+             const bool paired = trail >= sur_low_start && trail <= sur_low_end;
+             if (!paired)
+             {
+                 return convert_result<CharT>{cursor, conv_errc::unpaired_high_surrogate};
+             }
+             cursor = next + 1; /* skip both halves of the pair */
+         }
+         else if (is_low_surrogate(unit))
+         {
+             /* A low surrogate may only appear as the second half of a pair. */
+             return convert_result<CharT>{cursor, conv_errc::source_illegal};
+         }
+         else
+         {
+             ++cursor;
+         }
+     }
+     return convert_result<CharT>{cursor, conv_errc()};
+ }
+
+ // utf32
+
+ /*
+  * Checks that the `length` UTF-32 code units starting at `data` are all
+  * legal values.
+  *
+  * Validation stops at the first surrogate value
+  * (conv_errc::illegal_surrogate_value) or value greater than
+  * max_legal_utf32 (conv_errc::source_illegal); the returned pointer
+  * refers to that value. On success the pointer is data + length and the
+  * code is conv_errc().
+  */
+ template <class CharT>
+ typename std::enable_if<type_traits::is_char32<CharT>::value,
+ convert_result<CharT>>::type
+ validate(const CharT* data, std::size_t length) noexcept
+ {
+     conv_errc result = conv_errc();
+
+     const CharT* last = data + length;
+     while (data != last)
+     {
+         uint32_t ch = *data++;
+         /* UTF-16 surrogate values are illegal in UTF-32 */
+         if (is_surrogate(ch))
+         {
+             --data; /* return to the illegal value itself */
+             result = conv_errc::illegal_surrogate_value;
+             break;
+         }
+         if (ch > max_legal_utf32)
+         {
+             /* Stop here so the error code and position describe the
+                first illegal value and cannot be overwritten later. */
+             --data; /* return to the illegal value itself */
+             result = conv_errc::source_illegal;
+             break;
+         }
+     }
+     return convert_result<CharT>{data, result} ;
+ }
+
+ // Encoding reported by the iterator-based detect_encoding overload:
+ // UTF-8, UTF-16 little/big endian, UTF-32 little/big endian, or
+ // `undetected` when neither a byte-order mark nor a recognised byte
+ // pattern was found. Separate from encoding_kind, declared earlier in
+ // this header.
+ enum class encoding {u8,u16le,u16be,u32le,u32be,undetected};
+
+ // Result of the iterator-based detect_encoding overload.
+ template <class Iterator>
+ struct determine_encoding_result
+ {
+ Iterator it;  // position in the inspected range reported by detect_encoding
+ encoding ec;  // detected encoding, or encoding::undetected
+ };
+
+ /*
+  * Guesses the Unicode encoding of the byte range [first, last).
+  *
+  * With exactly three bytes, only the UTF-8 byte-order mark (EF BB BF) is
+  * recognised; with fewer than three the result is encoding::undetected.
+  *
+  * With four or more bytes, the first four are examined:
+  *   1. If they start with a byte-order mark (UTF-32 BE/LE, UTF-16 BE/LE
+  *      or UTF-8), that encoding is reported and the returned iterator
+  *      points just past the mark.
+  *   2. Otherwise the pattern of zero / non-zero bytes is used to guess
+  *      the encoding, and the returned iterator is `first`.
+  *   3. If neither matches, encoding::undetected is reported with `first`.
+  *
+  * Iterator must be multi-pass: earlier positions are dereferenced after
+  * later ones have been produced.
+  */
+ template <class Iterator>
+ typename std::enable_if<std::is_integral<typename std::iterator_traits<Iterator>::value_type>::value && sizeof(typename std::iterator_traits<Iterator>::value_type) == sizeof(uint8_t),
+ determine_encoding_result<Iterator>>::type
+ detect_encoding(Iterator first, Iterator last) noexcept
+ {
+     const auto available = std::distance(first,last);
+     Iterator it1 = first;
+     if (available < 4)
+     {
+         if (available == 3)
+         {
+             Iterator it2 = ++first;
+             Iterator it3 = ++first;
+             if (static_cast<uint8_t>(*it1) == 0xEF && static_cast<uint8_t>(*it2) == 0xBB && static_cast<uint8_t>(*it3) == 0xBF)
+             {
+                 return determine_encoding_result<Iterator>{last,encoding::u8};
+             }
+         }
+         return determine_encoding_result<Iterator>{it1,encoding::undetected};
+     }
+     else
+     {
+         Iterator it2 = ++first;
+         Iterator it3 = ++first;
+         Iterator it4 = ++first;
+         /* One past the fourth byte: where the data starts after a 4-byte mark. */
+         Iterator it5 = it4;
+         ++it5;
+
+         /* Widen before shifting so that `<< 24` is done on an unsigned
+            32-bit value instead of a promoted (signed) int. */
+         const uint32_t b1 = static_cast<uint8_t>(*it1);
+         const uint32_t b2 = static_cast<uint8_t>(*it2);
+         const uint32_t b3 = static_cast<uint8_t>(*it3);
+         const uint32_t b4 = static_cast<uint8_t>(*it4);
+
+         /* First byte ends up in the least significant position. */
+         const uint32_t bom = b1 | (b2 << 8) | (b3 << 16) | (b4 << 24);
+         if (bom == 0xFFFE0000)
+         {
+             /* 00 00 FE FF */
+             return determine_encoding_result<Iterator>{it5,encoding::u32be};
+         }
+         else if (bom == 0x0000FEFF)
+         {
+             /* FF FE 00 00 */
+             return determine_encoding_result<Iterator>{it5,encoding::u32le};
+         }
+         else if ((bom & 0xFFFF) == 0xFFFE)
+         {
+             /* FE FF */
+             return determine_encoding_result<Iterator>{it3,encoding::u16be};
+         }
+         else if ((bom & 0xFFFF) == 0xFEFF)
+         {
+             /* FF FE */
+             return determine_encoding_result<Iterator>{it3,encoding::u16le};
+         }
+         else if ((bom & 0xFFFFFF) == 0xBFBBEF)
+         {
+             /* EF BB BF */
+             return determine_encoding_result<Iterator>{it4,encoding::u8};
+         }
+         else
+         {
+             /* No mark: bit i of `pattern` is set when byte i+1 is non-zero. */
+             const uint32_t pattern = (b1 ? 1u : 0u) | (b2 ? 2u : 0u) | (b3 ? 4u : 0u) | (b4 ? 8u : 0u);
+             switch (pattern) {
+                 case 0x08: /* 00 00 00 xx */
+                     return determine_encoding_result<Iterator>{it1,encoding::u32be};
+                 case 0x0A: /* 00 xx 00 xx */
+                     return determine_encoding_result<Iterator>{it1,encoding::u16be};
+                 case 0x01: /* xx 00 00 00 */
+                     return determine_encoding_result<Iterator>{it1,encoding::u32le};
+                 case 0x05: /* xx 00 xx 00 */
+                     return determine_encoding_result<Iterator>{it1,encoding::u16le};
+                 case 0x0F: /* xx xx xx xx */
+                     return determine_encoding_result<Iterator>{it1,encoding::u8};
+                 default:
+                     return determine_encoding_result<Iterator>{it1,encoding::undetected};
+             }
+         }
+     }
+ }
+
+ // count_codepoints
+
+ /*
+  * Counts the code points encoded in the `length` code units starting at
+  * `data`, decoding them one at a time with to_codepoint (to which `flags`
+  * is passed unchanged).
+  *
+  * Returns the number of code points decoded, or 0 if to_codepoint
+  * reports an error or decoding does not finish exactly at data + length.
+  * A return value of 0 is therefore also what an empty input yields.
+  */
+ template <class CharT>
+ typename std::enable_if<type_traits::is_char8<CharT>::value || type_traits::is_char16<CharT>::value || type_traits::is_char32<CharT>::value, std::size_t>::type
+ count_codepoints(const CharT* data, std::size_t length,
+ conv_flags flags = conv_flags::strict) noexcept
+ {
+     const CharT* const last = data + length;
+     const CharT* cursor = data;
+     std::size_t total = 0;
+
+     while (cursor < last)
+     {
+         uint32_t decoded = 0;
+         auto step = to_codepoint(cursor, last, decoded, flags);
+         if (step.ec != conv_errc())
+         {
+             return 0; /* decoding failed */
+         }
+         cursor = step.ptr;
+         ++total;
+     }
+
+     /* Only report a count when decoding ended exactly at the end of input. */
+     if (cursor != last)
+     {
+         return 0;
+     }
+     return total;
+ }
+
+} // unicode_traits
+} // jsoncons
+
+#endif
+