detail/utf8.hpp

100.0% Lines (62/62) 100.0% Functions (13/13)
Line TLA Hits Source Code
1 //
2 // Copyright (c) 2020 Krystian Stasiowski (sdkrystian@gmail.com)
3 //
4 // Distributed under the Boost Software License, Version 1.0. (See accompanying
5 // file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
6 //
7 // Official repository: https://github.com/boostorg/json
8 //
9
10 #ifndef BOOST_JSON_DETAIL_UTF8_HPP
11 #define BOOST_JSON_DETAIL_UTF8_HPP
12
13 #include <boost/endian/conversion.hpp>
14 #include <boost/json/detail/config.hpp>
15
16 #include <cstddef>
17 #include <cstring>
18 #include <cstdint>
19
20 namespace boost {
21 namespace json {
22 namespace detail {
23
24 template<int N>
25 std::uint32_t
26 21733x load_little_endian(void const* p)
27 {
28 21733x std::uint32_t v = 0;
29 21733x std::memcpy(&v, p, N);
30 21733x endian::little_to_native_inplace(v);
31 21733x return v;
32 }
33
inline
uint16_t
classify_utf8(char c)
{
    // Table indexed by the low seven bits of the byte, so slot i
    // describes the byte 0x80 + i.
    //
    // Each entry packs two fields:
    //   low byte  - total number of bytes in the sequence
    //   high byte - which range the second byte must lie in
    //
    //   0x000  invalid
    //   0x102  2 bytes, second byte [80, BF]
    //   0x203  3 bytes, second byte [A0, BF]
    //   0x303  3 bytes, second byte [80, BF]
    //   0x403  3 bytes, second byte [80, 9F]
    //   0x504  4 bytes, second byte [90, BF]
    //   0x604  4 bytes, second byte [80, BF]
    //   0x704  4 bytes, second byte [80, 8F]
    //
    // NOTE(review): a byte below 0x80 shares its slot with the same
    // byte with bit 7 set, so the result is only meaningful for bytes
    // >= 0x80 - presumably callers only pass such bytes; confirm.
    static constexpr uint16_t lead_table[128]
    {
        // 0x80 - 0xBF
        0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000,
        0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000,
        0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000,
        0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000,
        0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000,
        0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000,
        0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000,
        0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000,

        // 0xC0 - 0xDF
        0x000, 0x000, 0x102, 0x102, 0x102, 0x102, 0x102, 0x102,
        0x102, 0x102, 0x102, 0x102, 0x102, 0x102, 0x102, 0x102,
        0x102, 0x102, 0x102, 0x102, 0x102, 0x102, 0x102, 0x102,
        0x102, 0x102, 0x102, 0x102, 0x102, 0x102, 0x102, 0x102,
        // 0xE0 - 0xEF
        0x203, 0x303, 0x303, 0x303, 0x303, 0x303, 0x303, 0x303,
        0x303, 0x303, 0x303, 0x303, 0x303, 0x403, 0x303, 0x303,
        // 0xF0 - 0xFF
        0x504, 0x604, 0x604, 0x604, 0x704, 0x000, 0x000, 0x000,
        0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000,
    };

    // Masking with 0x7F keeps the index inside the 128-entry table
    // regardless of whether plain char is signed.
    auto const slot = static_cast<unsigned char>(c & 0x7F);
    return lead_table[slot];
}
68
69 inline
70 bool
71 13177x is_valid_utf8(const char* p, uint16_t first)
72 {
73 uint32_t v;
74 13177x switch(first >> 8)
75 {
76 362x default:
77 362x return false;
78
79 // 2 bytes, second byte [80, BF]
80 2348x case 1:
81 2348x v = load_little_endian<2>(p);
82 2348x return (v & 0xC000) == 0x8000;
83
84 // 3 bytes, second byte [A0, BF]
85 665x case 2:
86 665x v = load_little_endian<3>(p);
87 665x return (v & 0xC0E000) == 0x80A000;
88
89 // 3 bytes, second byte [80, BF]
90 3882x case 3:
91 3882x v = load_little_endian<3>(p);
92 3882x return (v & 0xC0C000) == 0x808000;
93
94 // 3 bytes, second byte [80, 9F]
95 725x case 4:
96 725x v = load_little_endian<3>(p);
97 725x return (v & 0xC0E000) == 0x808000;
98
99 // 4 bytes, second byte [90, BF]
100 1310x case 5:
101 1310x v = load_little_endian<4>(p);
102 1310x return (v & 0xC0C0FF00) + 0x7F7F7000 <= 0x2F00;
103
104 // 4 bytes, second byte [80, BF]
105 2346x case 6:
106 2346x v = load_little_endian<4>(p);
107 2346x return (v & 0xC0C0C000) == 0x80808000;
108
109 // 4 bytes, second byte [80, 8F]
110 1539x case 7:
111 1539x v = load_little_endian<4>(p);
112 1539x return (v & 0xC0C0F000) == 0x80808000;
113 }
114 }
115
116 class utf8_sequence
117 {
118 char seq_[4];
119 uint16_t first_;
120 uint8_t size_;
121
122 public:
123 void
124 3466x save(
125 const char* p,
126 std::size_t remain) noexcept
127 {
128 3466x first_ = classify_utf8(*p );
129 3466x if(remain >= length())
130 1560x size_ = length();
131 else
132 1906x size_ = static_cast<uint8_t>(remain);
133 3466x std::memcpy(seq_, p, size_);
134 3466x }
135
136 uint8_t
137 21338x length() const noexcept
138 {
139 21338x return first_ & 0xFF;
140 }
141
142 bool
143 3469x complete() const noexcept
144 {
145 3469x return size_ >= length();
146 }
147
148 // returns true if complete
149 bool
150 1864x append(
151 const char* p,
152 std::size_t remain) noexcept
153 {
154 1864x if(BOOST_JSON_UNLIKELY(needed() == 0))
155 1x return true;
156 1863x if(BOOST_JSON_LIKELY(remain >= needed()))
157 {
158 1862x std::memcpy(
159 1862x seq_ + size_, p, needed());
160 1862x size_ = length();
161 1862x return true;
162 }
163 1x if(BOOST_JSON_LIKELY(remain > 0))
164 {
165 1x std::memcpy(seq_ + size_, p, remain);
166 1x size_ += static_cast<uint8_t>(remain);
167 }
168 1x return false;
169 }
170
171 const char*
172 1658x data() const noexcept
173 {
174 1658x return seq_;
175 }
176
177 uint8_t
178 7457x needed() const noexcept
179 {
180 7457x return length() - size_;
181 }
182
183 bool
184 1866x valid() const noexcept
185 {
186 1866x BOOST_ASSERT(size_ >= length());
187 1866x return is_valid_utf8(seq_, first_);
188 }
189 };
190
191 } // detail
192 } // namespace json
193 } // namespace boost
194
195 #endif
196