FIFE  2008.0
 All Classes Namespaces Functions Variables Enumerations Enumerator Pages
checked.h
1 // Copyright 2006 Nemanja Trifunovic
2 
3 /*
4 Permission is hereby granted, free of charge, to any person or organization
5 obtaining a copy of the software and accompanying documentation covered by
6 this license (the "Software") to use, reproduce, display, distribute,
7 execute, and transmit the Software, and to prepare derivative works of the
8 Software, and to permit third-parties to whom the Software is furnished to
9 do so, all subject to the following:
10 
11 The copyright notices in the Software and this entire statement, including
12 the above license grant, this restriction and the following disclaimer,
13 must be included in all copies of the Software, in whole or in part, and
14 all derivative works of the Software, unless such copies or derivative
15 works are solely in the form of machine-executable object code generated by
16 a source language processor.
17 
18 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
19 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
20 FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT
21 SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE
22 FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE,
23 ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
24 DEALINGS IN THE SOFTWARE.
25 */
26 
27 
28 #ifndef UTF8_FOR_CPP_CHECKED_H_2675DCD0_9480_4c0c_B92A_CC14C027B731
29 #define UTF8_FOR_CPP_CHECKED_H_2675DCD0_9480_4c0c_B92A_CC14C027B731
30 
31 #include "core.h"
32 #include <stdexcept>
33 
34 namespace utf8
35 {
36  // Exceptions that may be thrown from the library functions.
/// Exception that carries a 32-bit code point value.
/// Thrown by the checked functions in this header (e.g. append() and next())
/// when a code point is rejected.
class invalid_code_point : public std::exception {
public:
    invalid_code_point(uint32_t code_point_value) : value_(code_point_value) {}

    /// Fixed, human-readable description of the error.
    virtual const char* what() const throw()
    {
        return "Invalid code point";
    }

    /// Returns the code point value supplied at construction.
    uint32_t code_point() const
    {
        return value_;
    }

private:
    uint32_t value_;
};
44 
/// Exception that carries a single octet of input.
/// Thrown by the checked functions in this header (e.g. next() and prior())
/// when an octet sequence is rejected as UTF-8.
class invalid_utf8 : public std::exception {
public:
    invalid_utf8 (uint8_t octet) : octet_(octet) {}

    /// Fixed, human-readable description of the error.
    virtual const char* what() const throw()
    {
        return "Invalid UTF-8";
    }

    /// Returns the octet supplied at construction.
    uint8_t utf8_octet() const
    {
        return octet_;
    }

private:
    uint8_t octet_;
};
52 
/// Exception that carries a single 16-bit unit of input.
/// Thrown by utf16to8() when a surrogate is not part of a valid pair.
class invalid_utf16 : public std::exception {
public:
    invalid_utf16 (uint16_t word) : word_(word) {}

    /// Fixed, human-readable description of the error.
    virtual const char* what() const throw()
    {
        return "Invalid UTF-16";
    }

    /// Returns the 16-bit unit supplied at construction.
    uint16_t utf16_word() const
    {
        return word_;
    }

private:
    uint16_t word_;
};
60 
/// Exception thrown by the checked functions in this header when
/// internal::validate_next reports NOT_ENOUGH_ROOM. Carries no payload.
class not_enough_room : public std::exception {
public:
    /// Fixed, human-readable description of the error.
    virtual const char* what() const throw()
    {
        return "Not enough space";
    }
};
65 
67 
68  template <typename octet_iterator, typename output_iterator>
69  output_iterator replace_invalid(octet_iterator start, octet_iterator end, output_iterator out, uint32_t replacement)
70  {
71  while (start != end) {
72  octet_iterator sequence_start = start;
73  internal::utf_error err_code = internal::validate_next(start, end);
74  switch (err_code) {
75  case internal::OK :
76  for (octet_iterator it = sequence_start; it != start; ++it)
77  *out++ = *it;
78  break;
79  case internal::NOT_ENOUGH_ROOM:
80  throw not_enough_room();
81  case internal::INVALID_LEAD:
82  append (replacement, out);
83  ++start;
84  break;
85  case internal::INCOMPLETE_SEQUENCE:
86  case internal::OVERLONG_SEQUENCE:
87  case internal::INVALID_CODE_POINT:
88  append (replacement, out);
89  ++start;
90  // just one replacement mark for the sequence
91  while (internal::is_trail(*start) && start != end)
92  ++start;
93  break;
94  }
95  }
96  return out;
97  }
98 
99  template <typename octet_iterator, typename output_iterator>
100  inline output_iterator replace_invalid(octet_iterator start, octet_iterator end, output_iterator out)
101  {
102  static const uint32_t replacement_marker = internal::mask16(0xfffd);
103  return replace_invalid(start, end, out, replacement_marker);
104  }
105 
106  template <typename octet_iterator>
107  octet_iterator append(uint32_t cp, octet_iterator result)
108  {
109  if (!internal::is_code_point_valid(cp))
110  throw invalid_code_point(cp);
111 
112  if (cp < 0x80) // one octet
113  *(result++) = static_cast<uint8_t>(cp);
114  else if (cp < 0x800) { // two octets
115  *(result++) = static_cast<uint8_t>((cp >> 6) | 0xc0);
116  *(result++) = static_cast<uint8_t>((cp & 0x3f) | 0x80);
117  }
118  else if (cp < 0x10000) { // three octets
119  *(result++) = static_cast<uint8_t>((cp >> 12) | 0xe0);
120  *(result++) = static_cast<uint8_t>((cp >> 6) & 0x3f | 0x80);
121  *(result++) = static_cast<uint8_t>((cp & 0x3f) | 0x80);
122  }
123  else if (cp <= internal::CODE_POINT_MAX) { // four octets
124  *(result++) = static_cast<uint8_t>((cp >> 18) | 0xf0);
125  *(result++) = static_cast<uint8_t>((cp >> 12)& 0x3f | 0x80);
126  *(result++) = static_cast<uint8_t>((cp >> 6) & 0x3f | 0x80);
127  *(result++) = static_cast<uint8_t>((cp & 0x3f) | 0x80);
128  }
129  else
130  throw invalid_code_point(cp);
131 
132  return result;
133  }
134 
135  template <typename octet_iterator>
136  uint32_t next(octet_iterator& it, octet_iterator end)
137  {
138  uint32_t cp = 0;
139  internal::utf_error err_code = internal::validate_next(it, end, &cp);
140  switch (err_code) {
141  case internal::OK :
142  break;
143  case internal::NOT_ENOUGH_ROOM :
144  throw not_enough_room();
145  case internal::INVALID_LEAD :
146  case internal::INCOMPLETE_SEQUENCE :
147  case internal::OVERLONG_SEQUENCE :
148  throw invalid_utf8(*it);
149  case internal::INVALID_CODE_POINT :
150  throw invalid_code_point(cp);
151  }
152  return cp;
153  }
154 
155  template <typename octet_iterator>
156  uint32_t prior(octet_iterator& it, octet_iterator start)
157  {
158  octet_iterator end = it;
159  while (internal::is_trail(*(--it)))
160  if (it < start)
161  throw invalid_utf8(*it); // error - no lead byte in the sequence
162  octet_iterator temp = it;
163  return next(temp, end);
164  }
165 
167  template <typename octet_iterator>
168  uint32_t previous(octet_iterator& it, octet_iterator pass_start)
169  {
170  octet_iterator end = it;
171  while (internal::is_trail(*(--it)))
172  if (it == pass_start)
173  throw invalid_utf8(*it); // error - no lead byte in the sequence
174  octet_iterator temp = it;
175  return next(temp, end);
176  }
177 
/// Advances `it` by calling next() on it `n` times.
///
/// @param it   iterator to move forward
/// @param n    number of next() calls to make
/// @param end  end of the octet range, forwarded to next()
/// @throws (anything next() throws)
template <typename octet_iterator, typename distance_type>
void advance (octet_iterator& it, distance_type n, octet_iterator end)
{
    distance_type steps_taken = 0;
    while (steps_taken < n) {
        next(it, end);
        ++steps_taken;
    }
}
184 
/// Counts how many times next() must be called to move `first` up to
/// `last`.
///
/// @param first  beginning of the octet range
/// @param last   end of the octet range (compared with operator<)
/// @return the number of next() calls performed
/// @throws (anything next() throws)
template <typename octet_iterator>
typename std::iterator_traits<octet_iterator>::difference_type
distance (octet_iterator first, octet_iterator last)
{
    typedef typename std::iterator_traits<octet_iterator>::difference_type count_type;

    count_type decoded_count = 0;
    while (first < last) {
        next(first, last);
        ++decoded_count;
    }
    return decoded_count;
}
194 
195  template <typename u16bit_iterator, typename octet_iterator>
196  octet_iterator utf16to8 (u16bit_iterator start, u16bit_iterator end, octet_iterator result)
197  {
198  while (start != end) {
199  uint32_t cp = internal::mask16(*start++);
200  // Take care of surrogate pairs first
201  if (internal::is_surrogate(cp)) {
202  if (start != end) {
203  uint32_t trail_surrogate = internal::mask16(*start++);
204  if (trail_surrogate >= internal::TRAIL_SURROGATE_MIN && trail_surrogate <= internal::TRAIL_SURROGATE_MAX)
205  cp = (cp << 10) + trail_surrogate + internal::SURROGATE_OFFSET;
206  else
207  throw invalid_utf16(static_cast<uint16_t>(trail_surrogate));
208  }
209  else
210  throw invalid_utf16(static_cast<uint16_t>(*start));
211 
212  }
213  result = append(cp, result);
214  }
215  return result;
216  }
217 
218  template <typename u16bit_iterator, typename octet_iterator>
219  u16bit_iterator utf8to16 (octet_iterator start, octet_iterator end, u16bit_iterator result)
220  {
221  while (start != end) {
222  uint32_t cp = next(start, end);
223  if (cp > 0xffff) { //make a surrogate pair
224  *result++ = static_cast<uint16_t>((cp >> 10) + internal::LEAD_OFFSET);
225  *result++ = static_cast<uint16_t>((cp & 0x3ff) + internal::TRAIL_SURROGATE_MIN);
226  }
227  else
228  *result++ = static_cast<uint16_t>(cp);
229  }
230  return result;
231  }
232 
/// Converts the range [start, end) of 32-bit code points to UTF-8 by
/// calling append() for each element.
///
/// @param start   beginning of the 32-bit input range
/// @param end     end of the 32-bit input range
/// @param result  output iterator receiving the UTF-8 octets
/// @return the output iterator positioned after the last octet written
/// @throws (anything append() throws)
template <typename octet_iterator, typename u32bit_iterator>
octet_iterator utf32to8 (u32bit_iterator start, u32bit_iterator end, octet_iterator result)
{
    for (; start != end; ++start)
        result = append(*start, result);

    return result;
}
241 
/// Converts the UTF-8 range [start, end) to 32-bit code points by calling
/// next() repeatedly and writing each result through `result`.
///
/// @param start   beginning of the input octet range
/// @param end     end of the input octet range (compared with operator<)
/// @param result  output iterator receiving the code points
/// @return the output iterator positioned after the last code point written
/// @throws (anything next() throws)
template <typename octet_iterator, typename u32bit_iterator>
u32bit_iterator utf8to32 (octet_iterator start, octet_iterator end, u32bit_iterator result)
{
    while (start < end) {
        const uint32_t decoded = next(start, end);
        *result = decoded;
        ++result;
    }

    return result;
}
250 
251  // The iterator class
/// Bidirectional iterator adaptor over a range of UTF-8 octets.
/// Dereferencing decodes a code point with next(); increment and decrement
/// move by one encoded sequence using next() and prior(). The adaptor
/// remembers the bounds of its range and passes them to those functions.
template <typename octet_iterator>
class iterator : public std::iterator <std::bidirectional_iterator_tag, uint32_t> {
    octet_iterator it;          // current position
    octet_iterator range_start; // lower bound handed to prior()
    octet_iterator range_end;   // upper bound handed to next()
public:
    /// Default constructor; value-initializes all three underlying
    /// iterators so the members are never left indeterminate.
    iterator () : it(), range_start(), range_end() {}

    /// @param octet_it     initial position
    /// @param range_start  beginning of the octet range
    /// @param range_end    end of the octet range
    /// @throws std::out_of_range if octet_it lies outside
    ///         [range_start, range_end]
    explicit iterator (const octet_iterator& octet_it,
                       const octet_iterator& range_start,
                       const octet_iterator& range_end) :
        it(octet_it), range_start(range_start), range_end(range_end)
    {
        if (it < range_start || it > range_end)
            throw std::out_of_range("Invalid utf-8 iterator position");
    }

    // the default "big three" are OK

    /// Returns the underlying octet iterator at the current position.
    octet_iterator base () const { return it; }

    /// Decodes the code point at the current position without moving.
    /// @throws (anything next() throws)
    uint32_t operator * () const
    {
        octet_iterator temp = it;
        return next(temp, range_end);
    }

    /// Compares positions of two iterators over the same range.
    /// @throws std::logic_error if EITHER bound differs; iterators are only
    ///         comparable when both range_start and range_end match.
    bool operator == (const iterator& rhs) const
    {
        if (range_start != rhs.range_start || range_end != rhs.range_end)
            throw std::logic_error("Comparing utf-8 iterators defined with different ranges");
        return (it == rhs.it);
    }

    /// Negation of operator==; throws under the same conditions.
    bool operator != (const iterator& rhs) const
    {
        return !(operator == (rhs));
    }

    /// Pre-increment: moves forward by one sequence via next().
    iterator& operator ++ ()
    {
        next(it, range_end);
        return *this;
    }

    /// Post-increment: moves forward, returns the previous state.
    iterator operator ++ (int)
    {
        iterator temp = *this;
        next(it, range_end);
        return temp;
    }

    /// Pre-decrement: moves backward by one sequence via prior().
    iterator& operator -- ()
    {
        prior(it, range_start);
        return *this;
    }

    /// Post-decrement: moves backward, returns the previous state.
    iterator operator -- (int)
    {
        iterator temp = *this;
        prior(it, range_start);
        return temp;
    }
}; // class iterator
307 
308 } // namespace utf8
309 
310 #endif //header guard
311 
312