liblcf
reader_util.cpp
Go to the documentation of this file.
1 /*
2  * This file is part of liblcf. Copyright (c) 2020 liblcf authors.
3  * https://github.com/EasyRPG/liblcf - https://easyrpg.org
4  *
5  * liblcf is Free/Libre Open Source Software, released under the MIT License.
6  * For the full copyright and license information, please view the COPYING
7  * file that was distributed with this source code.
8  */
9 
10 #include "lcf_options.h"
11 #include "scope_guard.h"
12 
13 #ifdef LCF_SUPPORT_ICU
14 # include <unicode/ucsdet.h>
15 # include <unicode/ucnv.h>
16 # include <unicode/normalizer2.h>
17 # include <unicode/unistr.h>
18 #else
19 # ifdef _MSC_VER
20 # error MSVC builds require ICU
21 # endif
22 #endif
23 
24 #ifdef _WIN32
25 # include <windows.h>
26 #else
27 # ifndef LCF_SUPPORT_ICU
28 # include <iconv.h>
29 # endif
30 # include <locale>
31 #endif
32 
33 #if defined(__MORPHOS__) || defined(__amigaos4__)
34 #define ICONV_CONST const
35 #endif
36 #include <algorithm>
37 #include <cstdio>
38 #include <cstdlib>
39 #include <sstream>
40 #include <vector>
41 
42 #include "data.h"
43 #include "inireader.h"
44 #include "ldb_reader.h"
45 #include "reader_util.h"
46 
// Empty namespace block: it declares nothing itself. The ReaderUtil functions
// that follow are defined with qualified names (ReaderUtil::...).
47 namespace ReaderUtil {
48 }
49 
50 std::string ReaderUtil::CodepageToEncoding(int codepage) {
51  if (codepage == 0)
52  return std::string();
53 
54  if (codepage == 932) {
55 #ifdef LCF_SUPPORT_ICU
56  return "ibm-943_P15A-2003";
57 #else
58  return "SHIFT_JIS";
59 #endif
60  }
61  if (codepage == 949) {
62 #ifdef LCF_SUPPORT_ICU
63  return "windows-949-2000";
64 #else
65  return "cp949";
66 #endif
67  }
68  std::ostringstream out;
69 #ifdef LCF_SUPPORT_ICU
70  out << "windows-" << codepage;
71 #else
72  out << "CP" << codepage;
73 #endif
74 
75  // Looks like a valid codepage
76  std::string outs = out.str();
77  return outs;
78 }
79 
80 std::string ReaderUtil::DetectEncoding(std::istream& filestream) {
81  std::vector<std::string> encodings = DetectEncodings(filestream);
82 
83  if (encodings.empty()) {
84  return "";
85  }
86 
87  return encodings.front();
88 }
89 
90 std::string ReaderUtil::DetectEncoding(std::string const & data) {
91  std::vector<std::string> encodings = DetectEncodings(data);
92 
93  if (encodings.empty()) {
94  return "";
95  }
96 
97  return encodings.front();
98 }
99 
// Detects candidate encodings for a database read from a stream.
//
// With ICU support the database is loaded through LDB_Reader::Load() using an
// empty encoding string, selected Data::terms strings are concatenated into one
// text buffer, and that buffer is passed to the std::string overload of
// DetectEncodings(). Without ICU support an empty list is returned.
//
// NOTE(review): several lines of the streaming expression are missing from this
// listing (it ends in a dangling "<<"), so the exact set of concatenated fields
// cannot be read from here -- confirm against the real source.
100 std::vector<std::string> ReaderUtil::DetectEncodings(std::istream& filestream) {
101 #ifdef LCF_SUPPORT_ICU
102  std::ostringstream text;
103 
104  // Populates Data::terms and Data::system. The result of Load() is deliberately ignored: if loading fails the fields keep their (empty) defaults.
105  LDB_Reader::Load(filestream, "");
106 
107  text <<
114  Data::terms.row <<
115  Data::terms.order <<
118  Data::terms.level <<
133  Data::terms.armor <<
138  Data::terms.file <<
140  Data::terms.yes <<
141  Data::terms.no <<
151 
152  return ReaderUtil::DetectEncodings(text.str());
153 #else
154  return std::vector<std::string>();
155 #endif
156 }
157 
158 std::vector<std::string> ReaderUtil::DetectEncodings(std::string const & data) {
159 std::vector<std::string> encodings;
160 #ifdef LCF_SUPPORT_ICU
161  if (!data.empty()) {
162  UErrorCode status = U_ZERO_ERROR;
163  UCharsetDetector* detector = ucsdet_open(&status);
164 
165  std::string s = data;
166  ucsdet_setText(detector, s.c_str(), s.length(), &status);
167 
168  int32_t matches_count;
169  const UCharsetMatch** matches = ucsdet_detectAll(detector, &matches_count, &status);
170 
171  if (matches != NULL) {
172  // Collect all candidates, most confident comes first
173  for (int i = 0; i < matches_count; ++i) {
174  std::string encoding = ucsdet_getName(matches[i], &status);
175 
176  // Fixes to ensure proper Windows encodings
177  if (encoding == "Shift_JIS") {
178  encodings.push_back("ibm-943_P15A-2003"); // Japanese with \ as backslash
179  } else if (encoding == "EUC-KR") {
180  encodings.push_back("windows-949-2000"); // Korean with \ as backlash
181  } else if (encoding == "GB18030") {
182  encodings.push_back("windows-936-2000"); // Simplified Chinese
183  } else if (encoding == "ISO-8859-1" || encoding == "windows-1252") {
184  encodings.push_back("ibm-5348_P100-1997"); // Occidental with Euro
185  } else if (encoding == "ISO-8859-2" || encoding == "windows-1250") {
186  encodings.push_back("ibm-5346_P100-1998"); // Central Europe with Euro
187  } else if (encoding == "ISO-8859-5" || encoding == "windows-1251") {
188  encodings.push_back("ibm-5347_P100-1998"); // Cyrillic with Euro
189  } else if (encoding == "ISO-8859-6" || encoding == "windows-1256") {
190  encodings.push_back("ibm-9448_X100-2005"); // Arabic with Euro + 8 chars
191  } else if (encoding == "ISO-8859-7" || encoding == "windows-1253") {
192  encodings.push_back("ibm-5349_P100-1998"); // Greek with Euro
193  } else if (encoding == "ISO-8859-8" || encoding == "windows-1255") {
194  encodings.push_back("ibm-9447_P100-2002"); // Hebrew with Euro
195  } else {
196  encodings.push_back(encoding);
197  }
198  }
199  }
200  ucsdet_close(detector);
201  }
202 #endif
203 
204  return encodings;
205 }
206 
207 std::string ReaderUtil::GetEncoding(const std::string& ini_file) {
208  INIReader ini(ini_file);
209  if (ini.ParseError() != -1) {
210  std::string encoding = ini.Get("EasyRPG", "Encoding", std::string());
211  if (!encoding.empty()) {
212  return ReaderUtil::CodepageToEncoding(atoi(encoding.c_str()));
213  }
214  }
215  return std::string();
216 }
217 
218 std::string ReaderUtil::GetEncoding(std::istream& filestream) {
219  INIReader ini(filestream);
220  if (ini.ParseError() != -1) {
221  std::string encoding = ini.Get("EasyRPG", "Encoding", std::string());
222  if (!encoding.empty()) {
223  return ReaderUtil::CodepageToEncoding(atoi(encoding.c_str()));
224  }
225  }
226  return std::string();
227 }
228 
// Picks a Windows codepage for the current system and returns the matching
// encoding name via CodepageToEncoding().
//
// NOTE(review): the line carrying the function signature is missing from this
// listing; the cross-reference index names a GetLocaleEncoding() returning
// std::string -- presumably this is its body, confirm against the real source.
//
// - Windows: uses GetACP().
// - Android: fixed to 1252.
// - Elsewhere: starts at 1252 and overrides it from the name of std::locale("").
230 #ifdef _WIN32
231  int codepage = GetACP();
232 #elif __ANDROID__
233  // No std::locale support in NDK
234  // Doesn't really matter because the Android version auto-detects via ICU
235  int codepage = 1252;
236 #else
237  int codepage = 1252;
238 
// NOTE(review): std::locale("") can throw std::runtime_error when the
// environment names a locale the system does not provide; nothing here
// catches it -- confirm callers tolerate that.
239  std::locale loc = std::locale("");
240  // Gets the language and culture part only
241  std::string loc_full = loc.name().substr(0, loc.name().find_first_of("@."));
242  // Gets the language part only
// When the name contains no '_' the whole name is kept (find_first_of returns
// npos), so a name such as "ja.UTF-8" matches none of the language tests below.
243  std::string loc_lang = loc.name().substr(0, loc.name().find_first_of("_"));
244 
// Language / culture to codepage mapping; unmatched locales keep 1252.
245  if (loc_lang == "th") codepage = 874;
246  else if (loc_lang == "ja") codepage = 932;
247  else if (loc_full == "zh_CN" ||
248  loc_full == "zh_SG") codepage = 936;
249  else if (loc_lang == "ko") codepage = 949;
250  else if (loc_full == "zh_TW" ||
251  loc_full == "zh_HK") codepage = 950;
252  else if (loc_lang == "cs" ||
253  loc_lang == "hu" ||
254  loc_lang == "pl" ||
255  loc_lang == "ro" ||
256  loc_lang == "hr" ||
257  loc_lang == "sk" ||
258  loc_lang == "sl") codepage = 1250;
259  else if (loc_lang == "ru") codepage = 1251;
260  else if (loc_lang == "ca" ||
261  loc_lang == "da" ||
262  loc_lang == "de" ||
263  loc_lang == "en" ||
264  loc_lang == "es" ||
265  loc_lang == "fi" ||
266  loc_lang == "fr" ||
267  loc_lang == "it" ||
268  loc_lang == "nl" ||
269  loc_lang == "nb" ||
270  loc_lang == "pt" ||
271  loc_lang == "sv" ||
272  loc_lang == "eu") codepage = 1252;
273  else if (loc_lang == "el") codepage = 1253;
274  else if (loc_lang == "tr") codepage = 1254;
275  else if (loc_lang == "he") codepage = 1255;
276  else if (loc_lang == "ar") codepage = 1256;
277  else if (loc_lang == "et" ||
278  loc_lang == "lt" ||
279  loc_lang == "lv") codepage = 1257;
280  else if (loc_lang == "vi") codepage = 1258;
281 #endif
282 
283  return CodepageToEncoding(codepage);
284 }
285 
286 std::string ReaderUtil::Recode(const std::string& str_to_encode, const std::string& source_encoding) {
287  return ReaderUtil::Recode(str_to_encode, source_encoding, "UTF-8");
288 }
289 
// Converts str_to_encode from src_enc to dst_enc, using ICU when
// LCF_SUPPORT_ICU is defined and iconv otherwise.
//
// @param str_to_encode Text to convert.
// @param src_enc       Source encoding name.
// @param dst_enc       Destination encoding name.
// @return The converted text. The input is returned unchanged when any
//         argument is empty. See the review notes below for how the two
//         backends differ on failure.
290 std::string ReaderUtil::Recode(const std::string& str_to_encode,
291  const std::string& src_enc,
292  const std::string& dst_enc) {
293 
// Nothing to convert, or no usable encoding pair: hand the input back as-is.
294  if (src_enc.empty() || dst_enc.empty() || str_to_encode.empty()) {
295  return str_to_encode;
296  }
297 
// An encoding name that atoi parses to a positive number takes the first
// branch of the conditional; otherwise the name is used as given.
// NOTE(review): the first branch of both conditionals is missing from this
// listing -- presumably it maps the number through CodepageToEncoding();
// confirm against the real source.
298  auto src_cp = atoi(src_enc.c_str());
299  const auto& src_enc_str = src_cp > 0
301  : src_enc;
302 
303  auto dst_cp = atoi(dst_enc.c_str());
304  const auto& dst_enc_str = dst_cp > 0
306  : dst_enc;
307 
308 #ifdef LCF_SUPPORT_ICU
309  auto status = U_ZERO_ERROR;
310  auto conv_from = ucnv_open(src_enc_str.c_str(), &status);
311 
// Any status other than U_ZERO_ERROR or U_AMBIGUOUS_ALIAS_WARNING is treated
// as a failure to open the converter.
312  if (status != U_ZERO_ERROR && status != U_AMBIGUOUS_ALIAS_WARNING) {
313  fprintf(stderr, "liblcf: ucnv_open() error for source encoding \"%s\": %s\n", src_enc_str.c_str(), u_errorName(status));
314  return std::string();
315  }
316  status = U_ZERO_ERROR;
// The guard is handed a callback that closes the source converter.
317  auto conv_from_sg = makeScopeGuard([&]() { ucnv_close(conv_from); });
318 
319  auto conv_to = ucnv_open(dst_enc_str.c_str(), &status);
320 
321  if (status != U_ZERO_ERROR && status != U_AMBIGUOUS_ALIAS_WARNING) {
322  fprintf(stderr, "liblcf: ucnv_open() error for dest encoding \"%s\": %s\n", dst_enc_str.c_str(), u_errorName(status));
323  return std::string();
324  }
325  auto conv_to_sg = makeScopeGuard([&]() { ucnv_close(conv_to); });
326  status = U_ZERO_ERROR;
327 
// Output buffer holds four bytes per input byte; the conversion advances dst
// past the bytes it wrote.
328  std::string result(str_to_encode.size() * 4, '\0');
329  auto* src = &str_to_encode.front();
330  auto* dst = &result.front();
331 
332  ucnv_convertEx(conv_to, conv_from,
333  &dst, dst + result.size(),
334  &src, src + str_to_encode.size(),
335  nullptr, nullptr, nullptr, nullptr,
336  true, true,
337  &status);
338 
339  if (U_FAILURE(status)) {
340  fprintf(stderr, "liblcf: ucnv_convertEx() error when encoding \"%s\": %s\n", str_to_encode.c_str(), u_errorName(status));
341  return std::string();
342  }
343 
// Trim the buffer down to the number of bytes actually written.
344  result.resize(dst - result.c_str());
345  result.shrink_to_fit();
346 
347  return result;
348 #else
349  iconv_t cd = iconv_open(dst_enc_str.c_str(), src_enc_str.c_str());
// NOTE(review): an unknown encoding pair returns the input unchanged here,
// whereas the ICU path above returns an empty string -- confirm this is
// intended.
350  if (cd == (iconv_t)-1)
351  return str_to_encode;
// iconv takes a non-const input pointer, hence the const_cast.
352  char *src = const_cast<char *>(str_to_encode.c_str());
353  size_t src_left = str_to_encode.size();
354  size_t dst_size = str_to_encode.size() * 5 + 10;
355  char *dst = new char[dst_size];
356  size_t dst_left = dst_size;
357 # ifdef ICONV_CONST
358  char ICONV_CONST *p = src;
359 # else
360  char *p = src;
361 # endif
362  char *q = dst;
363  size_t status = iconv(cd, &p, &src_left, &q, &dst_left);
364  iconv_close(cd);
// Both an iconv error and unconsumed input count as failure.
365  if (status == (size_t) -1 || src_left > 0) {
366  delete[] dst;
367  return std::string();
368  }
// NOTE(review): the result is built from a NUL-terminated buffer, so converted
// output that contains an embedded NUL byte is cut off at that byte.
369  *q++ = '\0';
370  std::string result(dst);
371  delete[] dst;
372  return result;
373 #endif
374 }
375 
376 std::string ReaderUtil::Normalize(const std::string &str) {
377 #ifdef LCF_SUPPORT_ICU
378  icu::UnicodeString uni = icu::UnicodeString(str.c_str(), "utf-8").toLower();
379  UErrorCode err = U_ZERO_ERROR;
380  std::string res;
381  const icu::Normalizer2* norm = icu::Normalizer2::getNFKCInstance(err);
382  if (U_FAILURE(err)) {
383  static bool err_reported = false;
384  if (!err_reported) {
385  fprintf(stderr, "Normalizer2::getNFKCInstance failed (%s). \"nrm\" is probably missing in the ICU data file. Unicode normalization will not work!\n", u_errorName(err));
386  err_reported = true;
387  }
388  uni.toUTF8String(res);
389  return res;
390  }
391  icu::UnicodeString f = norm->normalize(uni, err);
392  if (U_FAILURE(err)) {
393  uni.toUTF8String(res);
394  } else {
395  f.toUTF8String(res);
396  }
397  return res;
398 #else
399  std::string result = str;
400  std::transform(result.begin(), result.end(), result.begin(), tolower);
401  return result;
402 #endif
403 }
std::string menu_quit
Definition: rpg_terms.h:118
int ParseError() const
Definition: inireader.cpp:98
std::string order
Definition: rpg_terms.h:124
RPG::Database data
Definition: data.cpp:14
std::string wait_on
Definition: rpg_terms.h:125
std::string weapon
Definition: rpg_terms.h:140
std::string system_name
Definition: rpg_system.h:186
RPG::System & system
Definition: data.cpp:31
std::string DetectEncoding(std::istream &filestream)
Definition: reader_util.cpp:80
std::string row
Definition: rpg_terms.h:123
std::string CodepageToEncoding(int codepage)
Definition: reader_util.cpp:50
std::string Normalize(const std::string &str)
std::string spirit_points
Definition: rpg_terms.h:129
std::string Recode(const std::string &str_to_encode, const std::string &source_encoding)
std::string shield
Definition: rpg_terms.h:141
std::string file
Definition: rpg_terms.h:147
std::string spirit
Definition: rpg_terms.h:138
std::string exp_short
Definition: rpg_terms.h:131
std::string wait_off
Definition: rpg_terms.h:126
std::string normal_status
Definition: rpg_terms.h:130
std::string sp_cost
Definition: rpg_terms.h:135
std::string Get(const std::string &section, const std::string &name, const std::string &default_value) const
Definition: inireader.cpp:103
std::string gameover_name
Definition: rpg_system.h:185
std::string frame_name
Definition: rpg_system.h:229
std::string system2_name
Definition: rpg_system.h:187
std::string hp_short
Definition: rpg_terms.h:133
std::string new_game
Definition: rpg_terms.h:119
std::string lvl_short
Definition: rpg_terms.h:132
std::string menu_save
Definition: rpg_terms.h:117
std::vector< std::string > DetectEncodings(std::istream &filestream)
std::string no
Definition: rpg_terms.h:150
std::string GetLocaleEncoding()
std::string boat_name
Definition: rpg_system.h:178
std::string accessory
Definition: rpg_terms.h:144
std::string title_name
Definition: rpg_system.h:184
bool Load(const std::string &filename, const std::string &encoding)
Definition: ldb_reader.cpp:24
std::string GetEncoding(const std::string &ini_file)
std::string status
Definition: rpg_terms.h:122
std::string exit_game_message
Definition: rpg_terms.h:148
std::string health_points
Definition: rpg_terms.h:128
std::string yes
Definition: rpg_terms.h:149
std::string armor
Definition: rpg_terms.h:142
std::string exit_game
Definition: rpg_terms.h:121
std::string helmet
Definition: rpg_terms.h:143
RPG::Terms & terms
Definition: data.cpp:30
std::string ship_name
Definition: rpg_system.h:179
std::string sp_short
Definition: rpg_terms.h:134
std::string save_game_message
Definition: rpg_terms.h:145
std::string level
Definition: rpg_terms.h:127
ScopeGuard< F > makeScopeGuard(F &&f)
Definition: scope_guard.h:39
std::string agility
Definition: rpg_terms.h:139
std::string load_game
Definition: rpg_terms.h:120
std::string airship_name
Definition: rpg_system.h:180
std::string attack
Definition: rpg_terms.h:136
std::string battletest_background
Definition: rpg_system.h:220
std::string load_game_message
Definition: rpg_terms.h:146
std::string defense
Definition: rpg_terms.h:137