198 lines
4.7 KiB
C++
198 lines
4.7 KiB
C++
//
|
|
// Copyright (c) 2020 Krystian Stasiowski (sdkrystian@gmail.com)
|
|
//
|
|
// Distributed under the Boost Software License, Version 1.0. (See accompanying
|
|
// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
|
|
//
|
|
// Official repository: https://github.com/boostorg/json
|
|
//
|
|
|
|
#ifndef BOOST_JSON_DETAIL_UTF8_HPP
|
|
#define BOOST_JSON_DETAIL_UTF8_HPP
|
|
|
|
#include <boost/json/detail/config.hpp>
|
|
|
|
#include <cstddef>
|
|
#include <cstring>
|
|
#include <cstdint>
|
|
|
|
BOOST_JSON_NS_BEGIN
|
|
namespace detail {
|
|
|
|
template<int N>
|
|
std::uint32_t
|
|
load_little_endian(void const* p)
|
|
{
|
|
std::uint32_t v = 0;
|
|
std::memcpy(&v, p, N);
|
|
#ifdef BOOST_JSON_BIG_ENDIAN
|
|
v = ((v & 0xFF000000) >> 24) |
|
|
((v & 0x00FF0000) >> 8) |
|
|
((v & 0x0000FF00) << 8) |
|
|
((v & 0x000000FF) << 24);
|
|
#endif
|
|
return v;
|
|
}
|
|
|
|
inline
|
|
uint16_t
|
|
classify_utf8(char c)
|
|
{
|
|
// 0x000 = invalid
|
|
// 0x102 = 2 bytes, second byte [80, BF]
|
|
// 0x203 = 3 bytes, second byte [A0, BF]
|
|
// 0x303 = 3 bytes, second byte [80, BF]
|
|
// 0x403 = 3 bytes, second byte [80, 9F]
|
|
// 0x504 = 4 bytes, second byte [90, BF]
|
|
// 0x604 = 4 bytes, second byte [80, BF]
|
|
// 0x704 = 4 bytes, second byte [80, 8F]
|
|
static constexpr uint16_t first[128]
|
|
{
|
|
0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000,
|
|
0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000,
|
|
0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000,
|
|
0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000,
|
|
0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000,
|
|
0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000,
|
|
0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000,
|
|
0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000,
|
|
|
|
0x000, 0x000, 0x102, 0x102, 0x102, 0x102, 0x102, 0x102,
|
|
0x102, 0x102, 0x102, 0x102, 0x102, 0x102, 0x102, 0x102,
|
|
0x102, 0x102, 0x102, 0x102, 0x102, 0x102, 0x102, 0x102,
|
|
0x102, 0x102, 0x102, 0x102, 0x102, 0x102, 0x102, 0x102,
|
|
0x203, 0x303, 0x303, 0x303, 0x303, 0x303, 0x303, 0x303,
|
|
0x303, 0x303, 0x303, 0x303, 0x303, 0x403, 0x303, 0x303,
|
|
0x504, 0x604, 0x604, 0x604, 0x704, 0x000, 0x000, 0x000,
|
|
0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000,
|
|
};
|
|
return first[static_cast<unsigned char>(c & 0x7F)];
|
|
}
|
|
|
|
inline
|
|
bool
|
|
is_valid_utf8(const char* p, uint16_t first)
|
|
{
|
|
uint32_t v;
|
|
switch(first >> 8)
|
|
{
|
|
default:
|
|
return false;
|
|
|
|
// 2 bytes, second byte [80, BF]
|
|
case 1:
|
|
v = load_little_endian<2>(p);
|
|
return (v & 0xC000) == 0x8000;
|
|
|
|
// 3 bytes, second byte [A0, BF]
|
|
case 2:
|
|
v = load_little_endian<3>(p);
|
|
return (v & 0xC0E000) == 0x80A000;
|
|
|
|
// 3 bytes, second byte [80, BF]
|
|
case 3:
|
|
v = load_little_endian<3>(p);
|
|
return (v & 0xC0C000) == 0x808000;
|
|
|
|
// 3 bytes, second byte [80, 9F]
|
|
case 4:
|
|
v = load_little_endian<3>(p);
|
|
return (v & 0xC0E000) == 0x808000;
|
|
|
|
// 4 bytes, second byte [90, BF]
|
|
case 5:
|
|
v = load_little_endian<4>(p);
|
|
return (v & 0xC0C0FF00) + 0x7F7F7000 <= 0x2F00;
|
|
|
|
// 4 bytes, second byte [80, BF]
|
|
case 6:
|
|
v = load_little_endian<4>(p);
|
|
return (v & 0xC0C0C000) == 0x80808000;
|
|
|
|
// 4 bytes, second byte [80, 8F]
|
|
case 7:
|
|
v = load_little_endian<4>(p);
|
|
return (v & 0xC0C0F000) == 0x80808000;
|
|
}
|
|
}
|
|
|
|
class utf8_sequence
|
|
{
|
|
char seq_[4];
|
|
uint16_t first_;
|
|
uint8_t size_;
|
|
|
|
public:
|
|
void
|
|
save(
|
|
const char* p,
|
|
std::size_t remain) noexcept
|
|
{
|
|
first_ = classify_utf8(*p );
|
|
if(remain >= length())
|
|
size_ = length();
|
|
else
|
|
size_ = static_cast<uint8_t>(remain);
|
|
std::memcpy(seq_, p, size_);
|
|
}
|
|
|
|
uint8_t
|
|
length() const noexcept
|
|
{
|
|
return first_ & 0xFF;
|
|
}
|
|
|
|
bool
|
|
complete() const noexcept
|
|
{
|
|
return size_ >= length();
|
|
}
|
|
|
|
// returns true if complete
|
|
bool
|
|
append(
|
|
const char* p,
|
|
std::size_t remain) noexcept
|
|
{
|
|
if(BOOST_JSON_UNLIKELY(needed() == 0))
|
|
return true;
|
|
if(BOOST_JSON_LIKELY(remain >= needed()))
|
|
{
|
|
std::memcpy(
|
|
seq_ + size_, p, needed());
|
|
size_ = length();
|
|
return true;
|
|
}
|
|
if(BOOST_JSON_LIKELY(remain > 0))
|
|
{
|
|
std::memcpy(seq_ + size_, p, remain);
|
|
size_ += static_cast<uint8_t>(remain);
|
|
}
|
|
return false;
|
|
}
|
|
|
|
const char*
|
|
data() const noexcept
|
|
{
|
|
return seq_;
|
|
}
|
|
|
|
uint8_t
|
|
needed() const noexcept
|
|
{
|
|
return length() - size_;
|
|
}
|
|
|
|
bool
|
|
valid() const noexcept
|
|
{
|
|
BOOST_ASSERT(size_ >= length());
|
|
return is_valid_utf8(seq_, first_);
|
|
}
|
|
};
|
|
|
|
} // detail
|
|
BOOST_JSON_NS_END
|
|
|
|
#endif
|