unchecked.h

00001 // Copyright 2006 Nemanja Trifunovic
00002 
00003 /*
00004 Permission is hereby granted, free of charge, to any person or organization
00005 obtaining a copy of the software and accompanying documentation covered by
00006 this license (the "Software") to use, reproduce, display, distribute,
00007 execute, and transmit the Software, and to prepare derivative works of the
00008 Software, and to permit third-parties to whom the Software is furnished to
00009 do so, all subject to the following:
00010 
00011 The copyright notices in the Software and this entire statement, including
00012 the above license grant, this restriction and the following disclaimer,
00013 must be included in all copies of the Software, in whole or in part, and
00014 all derivative works of the Software, unless such copies or derivative
00015 works are solely in the form of machine-executable object code generated by
00016 a source language processor.
00017 
00018 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
00019 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
00020 FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT
00021 SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE
00022 FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE,
00023 ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
00024 DEALINGS IN THE SOFTWARE.
00025 */
00026 
00027 
00028 #ifndef UTF8_FOR_CPP_UNCHECKED_H_2675DCD0_9480_4c0c_B92A_CC14C027B731
00029 #define UTF8_FOR_CPP_UNCHECKED_H_2675DCD0_9480_4c0c_B92A_CC14C027B731
00030 
00031 #include "core.h"
00032 
00033 namespace utf8
00034 {
00035     namespace unchecked 
00036     {
// Writes the UTF-8 encoding of the code point `cp` through `result` and
// returns the iterator positioned just past the last octet written.
// `cp` is not validated: the octet count is chosen purely from its magnitude.
template <typename octet_iterator>
octet_iterator append(uint32_t cp, octet_iterator result)
{
    // One octet: the value is stored as is.
    if (cp < 0x80) {
        *(result++) = static_cast<uint8_t>(cp);
        return result;
    }

    // Two octets: 0xc0 lead marker, then one 0x80-marked six-bit group.
    if (cp < 0x800) {
        *(result++) = static_cast<uint8_t>((cp >> 6) | 0xc0);
        *(result++) = static_cast<uint8_t>((cp & 0x3f) | 0x80);
        return result;
    }

    // Three octets: 0xe0 lead marker, then two 0x80-marked six-bit groups.
    if (cp < 0x10000) {
        *(result++) = static_cast<uint8_t>((cp >> 12) | 0xe0);
        *(result++) = static_cast<uint8_t>(((cp >> 6) & 0x3f) | 0x80);
        *(result++) = static_cast<uint8_t>((cp & 0x3f) | 0x80);
        return result;
    }

    // Four octets: 0xf0 lead marker, then three 0x80-marked six-bit groups.
    *(result++) = static_cast<uint8_t>((cp >> 18) | 0xf0);
    *(result++) = static_cast<uint8_t>(((cp >> 12) & 0x3f) | 0x80);
    *(result++) = static_cast<uint8_t>(((cp >> 6) & 0x3f) | 0x80);
    *(result++) = static_cast<uint8_t>((cp & 0x3f) | 0x80);
    return result;
}
00059         template <typename octet_iterator>
00060         uint32_t next(octet_iterator& it)
00061         {
00062             uint32_t cp = internal::mask8(*it);
00063             typename std::iterator_traits<octet_iterator>::difference_type length = utf8::internal::sequence_length(it);
00064             switch (length) {
00065                 case 1:
00066                     break;
00067                 case 2:
00068                     it++;
00069                     cp = ((cp << 6) & 0x7ff) + ((*it) & 0x3f);
00070                     break;
00071                 case 3:
00072                     ++it; 
00073                     cp = ((cp << 12) & 0xffff) + ((internal::mask8(*it) << 6) & 0xfff);
00074                     ++it;
00075                     cp += (*it) & 0x3f;
00076                     break;
00077                 case 4:
00078                     ++it;
00079                     cp = ((cp << 18) & 0x1fffff) + ((internal::mask8(*it) << 12) & 0x3ffff);                
00080                     ++it;
00081                     cp += (internal::mask8(*it) << 6) & 0xfff;
00082                     ++it;
00083                     cp += (*it) & 0x3f; 
00084                     break;
00085             }
00086             ++it;
00087             return cp;        
00088         }
00089 
00090         template <typename octet_iterator>
00091         uint32_t prior(octet_iterator& it)
00092         {
00093             while (internal::is_trail(*(--it))) ;
00094             octet_iterator temp = it;
00095             return next(temp);
00096         }
00097 
// Deprecated in versions that include prior, but only for the sake of
// consistency (see utf8::previous). Forwards to prior(it) and returns
// its result unchanged.
template <typename octet_iterator>
inline uint32_t previous(octet_iterator& it)
{
    const uint32_t code_point = prior(it);
    return code_point;
}
00104 
00105         template <typename octet_iterator, typename distance_type>
00106         void advance (octet_iterator& it, distance_type n)
00107         {
00108             for (distance_type i = 0; i < n; ++i)
00109                 next(it);
00110         }
00111 
00112         template <typename octet_iterator>
00113         typename std::iterator_traits<octet_iterator>::difference_type
00114         distance (octet_iterator first, octet_iterator last)
00115         {
00116             typename std::iterator_traits<octet_iterator>::difference_type dist;
00117             for (dist = 0; first < last; ++dist) 
00118                 next(first);
00119             return dist;
00120         }
00121 
00122         template <typename u16bit_iterator, typename octet_iterator>
00123         octet_iterator utf16to8 (u16bit_iterator start, u16bit_iterator end, octet_iterator result)
00124         {       
00125             while (start != end) {
00126                 uint32_t cp = internal::mask16(*start++);
00127             // Take care of surrogate pairs first
00128                 if (internal::is_surrogate(cp)) {
00129                     uint32_t trail_surrogate = internal::mask16(*start++);
00130                     cp = (cp << 10) + trail_surrogate + internal::SURROGATE_OFFSET;
00131                 }
00132                 result = append(cp, result);
00133             }
00134             return result;         
00135         }
00136 
00137         template <typename u16bit_iterator, typename octet_iterator>
00138         u16bit_iterator utf8to16 (octet_iterator start, octet_iterator end, u16bit_iterator result)
00139         {
00140             while (start != end) {
00141                 uint32_t cp = next(start);
00142                 if (cp > 0xffff) { //make a surrogate pair
00143                     *result++ = static_cast<uint16_t>((cp >> 10)   + internal::LEAD_OFFSET);
00144                     *result++ = static_cast<uint16_t>((cp & 0x3ff) + internal::TRAIL_SURROGATE_MIN);
00145                 }
00146                 else
00147                     *result++ = static_cast<uint16_t>(cp);
00148             }
00149             return result;
00150         }
00151 
// Passes every element of [start, end) to append(), writing through
// `result`, and returns the output iterator past the last octet written.
template <typename octet_iterator, typename u32bit_iterator>
octet_iterator utf32to8 (u32bit_iterator start, u32bit_iterator end, octet_iterator result)
{
    for (; start != end; ++start) {
        result = append(*start, result);
    }
    return result;
}
00160 
00161         template <typename octet_iterator, typename u32bit_iterator>
00162         u32bit_iterator utf8to32 (octet_iterator start, octet_iterator end, u32bit_iterator result)
00163         {
00164             while (start < end)
00165                 (*result++) = next(start);
00166 
00167             return result;
00168         }
00169 
00170         // The iterator class
00171         template <typename octet_iterator>
00172           class iterator : public std::iterator <std::bidirectional_iterator_tag, uint32_t> { 
00173             octet_iterator it;
00174             public:
00175             iterator () {};
00176             explicit iterator (const octet_iterator& octet_it): it(octet_it) {}
00177             // the default "big three" are OK
00178             octet_iterator base () const { return it; }
00179             uint32_t operator * () const
00180             {
00181                 octet_iterator temp = it;
00182                 return next(temp);
00183             }
00184             bool operator == (const iterator& rhs) const 
00185             { 
00186                 return (it == rhs.it);
00187             }
00188             bool operator != (const iterator& rhs) const
00189             {
00190                 return !(operator == (rhs));
00191             }
00192             iterator& operator ++ () 
00193             {
00194                 std::advance(it, internal::sequence_length(it));
00195                 return *this;
00196             }
00197             iterator operator ++ (int)
00198             {
00199                 iterator temp = *this;
00200                 std::advance(it, internal::sequence_length(it));
00201                 return temp;
00202             }  
00203             iterator& operator -- ()
00204             {
00205                 prior(it);
00206                 return *this;
00207             }
00208             iterator operator -- (int)
00209             {
00210                 iterator temp = *this;
00211                 prior(it);
00212                 return temp;
00213             }
00214           }; // class iterator
00215 
00216     } // namespace utf8::unchecked
00217 } // namespace utf8 
00218 
00219 
00220 #endif // header guard
00221