cprover
unicode.cpp
Go to the documentation of this file.
1 /*******************************************************************\
2 
3 Module:
4 
5 Author: Daniel Kroening, kroening@kroening.com
6 
7 \*******************************************************************/
8 
9 #include "unicode.h"
10 
11 #include <cstring>
12 #include <locale>
13 #include <iomanip>
14 #include <sstream>
15 #include <cstdint>
16 
17 #ifdef _WIN32
18 #include <windows.h>
19 #endif
20 
/// Convert a null-terminated wide string to a narrow string.
/// On Windows the result is UTF-8, produced by WideCharToMultiByte.
/// On other platforms this is a placeholder conversion that casts every
/// wide character to `char`, so only characters representable in a single
/// `char` survive unchanged.
/// \param s: null-terminated wide string; must not be null
/// \return the narrowed string
std::string narrow(const wchar_t *s)
{
#ifdef _WIN32
  // First call with a null output buffer to learn the required UTF-8 length,
  // then convert into a buffer of exactly that size.
  const int wide_length = static_cast<int>(wcslen(s));
  const int utf8_length = WideCharToMultiByte(
    CP_UTF8, 0, s, wide_length, nullptr, 0, nullptr, nullptr);
  std::string utf8(utf8_length, '\0');
  WideCharToMultiByte(
    CP_UTF8, 0, s, wide_length, &utf8[0], utf8_length, nullptr, nullptr);
  return utf8;
#else
  // dummy conversion: one char per wide character, no real transcoding
  std::string narrowed;
  narrowed.reserve(wcslen(s));

  for(const wchar_t *cursor = s; *cursor != 0; ++cursor)
    narrowed.push_back(static_cast<char>(*cursor));

  return narrowed;
#endif
}
45 
/// Convert a null-terminated narrow string to a wide string.
/// On Windows the input is interpreted as UTF-8 and converted by
/// MultiByteToWideChar. On other platforms this is a placeholder conversion
/// that converts each `char` to `wchar_t` individually.
/// \param s: null-terminated narrow string; must not be null
/// \return the widened string
std::wstring widen(const char *s)
{
#ifdef _WIN32
  // First call with a null output buffer to learn the required length,
  // then convert into a buffer of exactly that size.
  const int narrow_length = static_cast<int>(strlen(s));
  const int wide_length =
    MultiByteToWideChar(CP_UTF8, 0, s, narrow_length, nullptr, 0);
  std::wstring wide(wide_length, L'\0');
  MultiByteToWideChar(CP_UTF8, 0, s, narrow_length, &wide[0], wide_length);
  return wide;
#else
  // dummy conversion: one wide character per char, no real transcoding
  std::wstring widened;
  widened.reserve(strlen(s));

  for(const char *cursor = s; *cursor != 0; ++cursor)
    widened.push_back(wchar_t(*cursor));

  return widened;
#endif
}
70 
/// Convert a wide string to a narrow string.
/// On Windows the result is UTF-8, produced by WideCharToMultiByte.
/// On other platforms this is a placeholder conversion that converts each
/// wide character to `char` individually.
/// \param s: wide string to convert
/// \return the narrowed string
std::string narrow(const std::wstring &s)
{
#ifdef _WIN32
  // First call with a null output buffer to learn the required UTF-8 length,
  // then convert into a buffer of exactly that size.
  const int wide_length = static_cast<int>(s.size());
  const int utf8_length = WideCharToMultiByte(
    CP_UTF8, 0, &s[0], wide_length, nullptr, 0, nullptr, nullptr);
  std::string utf8(utf8_length, '\0');
  WideCharToMultiByte(
    CP_UTF8, 0, &s[0], wide_length, &utf8[0], utf8_length, nullptr, nullptr);
  return utf8;
#else
  // dummy conversion: one char per wide character, no real transcoding
  std::string narrowed;
  narrowed.reserve(s.size());

  for(const wchar_t wide_char : s)
    narrowed.push_back(static_cast<char>(wide_char));

  return narrowed;
#endif
}
87 
/// Convert a narrow string to a wide string.
/// On Windows the input is interpreted as UTF-8 and converted by
/// MultiByteToWideChar. On other platforms this is a placeholder conversion
/// that converts each `char` to `wchar_t` individually.
/// \param s: narrow string to convert
/// \return the widened string
std::wstring widen(const std::string &s)
{
#ifdef _WIN32
  // First call with a null output buffer to learn the required length,
  // then convert into a buffer of exactly that size.
  const int narrow_length = static_cast<int>(s.size());
  const int wide_length =
    MultiByteToWideChar(CP_UTF8, 0, &s[0], narrow_length, nullptr, 0);
  std::wstring wide(wide_length, L'\0');
  MultiByteToWideChar(CP_UTF8, 0, &s[0], narrow_length, &wide[0], wide_length);
  return wide;
#else
  // dummy conversion: one wide character per char, no real transcoding
  std::wstring widened;
  widened.reserve(s.size());

  for(const char narrow_char : s)
    widened.push_back(static_cast<wchar_t>(narrow_char));

  return widened;
#endif
}
104 
/// Appends a unicode character to a utf8-encoded string.
/// The code point is written as one to four bytes depending on its magnitude.
/// \param c: code point to encode
/// \param result: string the encoded bytes are appended to
static void utf8_append_code(unsigned int c, std::string &result)
{
  // Builds a continuation byte (bit pattern 10xxxxxx) carrying the six bits
  // of `c` that start at bit position `shift`.
  const auto continuation_byte = [c](unsigned int shift) {
    return static_cast<char>(((c >> shift) & 0x3f) | 0x80);
  };

  if(c <= 0x7f)
  {
    // single byte: the code point itself
    result.push_back(static_cast<char>(c));
    return;
  }

  if(c <= 0x7ff)
  {
    // two bytes: 110xxxxx 10xxxxxx
    result.push_back(static_cast<char>((c >> 6) | 0xc0));
    result.push_back(continuation_byte(0));
    return;
  }

  if(c <= 0xffff)
  {
    // three bytes: 1110xxxx 10xxxxxx 10xxxxxx
    result.push_back(static_cast<char>((c >> 12) | 0xe0));
    result.push_back(continuation_byte(6));
    result.push_back(continuation_byte(0));
    return;
  }

  // four bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
  result.push_back(static_cast<char>((c >> 18) | 0xf0));
  result.push_back(continuation_byte(12));
  result.push_back(continuation_byte(6));
  result.push_back(continuation_byte(0));
}
130 
133 std::string
134 utf32_native_endian_to_utf8(const std::basic_string<unsigned int> &s)
135 {
136  std::string result;
137 
138  result.reserve(s.size()); // at least that long
139 
140  for(const auto c : s)
141  utf8_append_code(c, result);
142 
143  return result;
144 }
145 
146 std::vector<std::string> narrow_argv(int argc, const wchar_t **argv_wide)
147 {
148  if(argv_wide==nullptr)
149  return std::vector<std::string>();
150 
151  std::vector<std::string> argv_narrow;
152  argv_narrow.reserve(argc);
153 
154  for(int i=0; i!=argc; ++i)
155  argv_narrow.push_back(narrow(argv_wide[i]));
156 
157  return argv_narrow;
158 }
159 
/// Appends a unicode code point to a UTF-16 encoded wide string.
/// Code points up to and including 0xFFFF are appended as a single UTF-16
/// unit; larger code points are appended as a surrogate pair.
/// \param code: code point to encode
/// \param result: wide string the UTF-16 unit(s) are appended to
static void utf16_append_code(unsigned int code, std::wstring &result)
{
  // we do not treat 0xD800 to 0xDFFF, although
  // they are not valid unicode symbols

  // The whole range up to and including 0xFFFF fits into one UTF-16 unit.
  // The comparison must be inclusive: sending 0xFFFF into the surrogate
  // branch would make `code-0x10000` wrap around and emit a bogus pair.
  if(code <= 0xFFFF)
  {
    // code is encoded as one UTF16 character
    result += static_cast<wchar_t>(code);
  }
  else // code is encoded as two UTF16 characters
  {
    // if this is valid unicode, we have
    // code<=0x10FFFF
    // but let's not check it programmatically

    // encode the code in UTF16: subtract 0x10000, then put the upper ten
    // bits into the high surrogate and the lower ten into the low surrogate
    code = code - 0x10000;
    const uint16_t i1 = static_cast<uint16_t>(((code >> 10) & 0x3ff) | 0xD800);
    result += static_cast<wchar_t>(i1);
    const uint16_t i2 = static_cast<uint16_t>((code & 0x3ff) | 0xDC00);
    result += static_cast<wchar_t>(i2);
  }
}
184 
185 
190 std::wstring utf8_to_utf16_native_endian(const std::string &in)
191 {
192  std::wstring result;
193  result.reserve(in.size());
195  while(i<in.size())
196  {
197  unsigned char c=in[i++];
198  unsigned int code=0;
199  // the ifs that follow find out how many UTF8 characters (1-4) store the
200  // next unicode character. This is determined by the few most
201  // significant bits.
202  if(c<=0x7F)
203  {
204  // if it's one character, then code is exactly the value
205  code=c;
206  }
207  else if(c<=0xDF && i<in.size())
208  { // in other cases, we need to read the right number of chars and decode
209  // note: if we wanted to make sure that we capture incorrect strings,
210  // we should check that whatever follows first character starts with
211  // bits 10.
212  code = (c & 0x1Fu) << 6;
213  c=in[i++];
214  code += c & 0x3Fu;
215  }
216  else if(c<=0xEF && i+1<in.size())
217  {
218  code = (c & 0xFu) << 12;
219  c=in[i++];
220  code += (c & 0x3Fu) << 6;
221  c=in[i++];
222  code += c & 0x3Fu;
223  }
224  else if(c<=0xF7 && i+2<in.size())
225  {
226  code = (c & 0x7u) << 18;
227  c=in[i++];
228  code += (c & 0x3Fu) << 12;
229  c=in[i++];
230  code += (c & 0x3Fu) << 6;
231  c=in[i++];
232  code += c & 0x3Fu;
233  }
234  else
235  {
236  // The string is not a valid UTF8 string! Either it has some characters
237  // missing from a multi-character unicode symbol, or it has a char with
238  // too high value.
239  // For now, let's replace the character with a space
240  code=32;
241  }
242 
243  utf16_append_code(code, result);
244  }
245 
246  return result;
247 }
248 
/// Write a single UTF-16 character to a stream as it would appear inside a
/// Java character or string literal.
/// Newline, carriage return, form feed, backspace and tab are written as
/// their backslash escapes; printable characters up to 255 are written
/// directly (with `"`, `\` and `'` backslash-escaped); everything else is
/// written as a `\u` escape with at least four hexadecimal digits.
/// \param ch: character to format
/// \param result: stream the formatted text is appended to
/// \param loc: locale used to decide whether `ch` is printable
static void utf16_native_endian_to_java(
  const wchar_t ch,
  std::ostringstream &result,
  const std::locale &loc)
{
  // \u unicode characters are translated very early by the Java compiler and so
  // \u000a or \u000d would become a newline character in a char constant, which
  // is illegal. Instead use \n or \r.
  if(ch == '\n')
    result << "\\n";
  else if(ch == '\r')
    result << "\\r";
  // \f, \b and \t do not need to be escaped, but this will improve readability
  // of generated tests.
  else if(ch == '\f')
    result << "\\f";
  else if(ch == '\b')
    result << "\\b";
  else if(ch == '\t')
    result << "\\t";
  else if(ch <= 255 && isprint(ch, loc))
  {
    const auto uch = static_cast<unsigned char>(ch);
    // ", \ and ' need to be escaped.
    if(uch == '"' || uch == '\\' || uch == '\'')
      result << '\\';
    result << uch;
  }
  else
  {
    // Format ch as a hexadecimal unicode character padded to four digits with
    // zeros.
    result << "\\u" << std::hex << std::setw(4) << std::setfill('0')
           << static_cast<unsigned int>(ch);
  }
}
289 
292 std::string utf16_native_endian_to_java(const wchar_t ch)
293 {
294  std::ostringstream result;
295  const std::locale loc;
296  utf16_native_endian_to_java(ch, result, loc);
297  return result.str();
298 }
299 
302 std::string utf16_native_endian_to_java(const std::wstring &in)
303 {
304  std::ostringstream result;
305  const std::locale loc;
306  for(const auto ch : in)
307  utf16_native_endian_to_java(ch, result, loc);
308  return result.str();
309 }
#define loc()
std::string narrow(const wchar_t *s)
Definition: unicode.cpp:21
static int8_t r
Definition: irep_hash.h:59
std::wstring widen(const char *s)
Definition: unicode.cpp:46
std::wstring utf8_to_utf16_native_endian(const std::string &in)
Convert UTF8-encoded string to UTF-16 with architecture-native endianness.
Definition: unicode.cpp:190
unsignedbv_typet size_type()
Definition: c_types.cpp:58
std::string utf32_native_endian_to_utf8(const std::basic_string< unsigned int > &s)
Definition: unicode.cpp:134
std::vector< std::string > narrow_argv(int argc, const wchar_t **argv_wide)
Definition: unicode.cpp:146
static void utf16_native_endian_to_java(const wchar_t ch, std::ostringstream &result, const std::locale &loc)
Definition: unicode.cpp:253
static void utf16_append_code(unsigned int code, std::wstring &result)
Definition: unicode.cpp:160
static void utf8_append_code(unsigned int c, std::string &result)
Appends a unicode character to a utf8-encoded string.
Definition: unicode.cpp:107