Audacious $Id: Doxyfile 4280 2007-03-21 04:39:00Z nenolod $
|
00001 /* Audacious 00002 * Copyright (C) 2005-2007 Audacious development team. 00003 * 00004 * This program is free software; you can redistribute it and/or modify 00005 * it under the terms of the GNU General Public License as published by 00006 * the Free Software Foundation; under version 3 of the License. 00007 * 00008 * This program is distributed in the hope that it will be useful, 00009 * but WITHOUT ANY WARRANTY; without even the implied warranty of 00010 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 00011 * GNU General Public License for more details. 00012 * 00013 * You should have received a copy of the GNU General Public License 00014 * along with this program. If not, see <http://www.gnu.org/licenses>. 00015 * 00016 * The Audacious team does not consider modular code linking to 00017 * Audacious or using our public API to be a derived work. 00018 */ 00019 00020 #include <string.h> 00021 #include <libaudcore/audstrings.h> 00022 00023 #include "audconfig.h" 00024 #include "chardet.h" 00025 #include "config.h" 00026 #include "i18n.h" 00027 #include "main.h" 00028 #include "debug.h" 00029 00030 #ifdef USE_CHARDET 00031 # include <libguess.h> 00032 #endif 00033 00034 gchar * 00035 cd_str_to_utf8(const gchar * str) 00036 { 00037 gchar *out_str; 00038 00039 if (str == NULL) 00040 return NULL; 00041 00042 /* Note: Currently, playlist calls this function repeatedly, even 00043 * if the string is already converted into utf-8. 00044 * chardet_to_utf8() would convert a valid utf-8 string into a 00045 * different utf-8 string, if fallback encodings were supplied and 00046 * the given string could be treated as a string in one of 00047 * fallback encodings. To avoid this, g_utf8_validate() had been 00048 * used at the top of evaluation. 00049 */ 00050 00051 /* Note 2: g_utf8_validate() has so called encapsulated utf-8 00052 * problem, thus chardet_to_utf8() took the place of that. 
00053 */ 00054 00055 /* Note 3: As introducing madplug, the problem of conversion from 00056 * ISO-8859-1 to UTF-8 arose. This may be coped with g_convert() 00057 * located near the end of chardet_to_utf8(), but it requires utf8 00058 * validation guard where g_utf8_validate() was. New 00059 * dfa_validate_utf8() employs libguess' DFA engine to validate 00060 * utf-8 and can properly distinguish examples of encapsulated 00061 * utf-8. It is considered to be safe to use as a guard. 00062 */ 00063 00064 /* Already UTF-8? */ 00065 #ifdef USE_CHARDET 00066 if (libguess_validate_utf8(str, strlen(str))) 00067 return g_strdup(str); 00068 #else 00069 if (g_utf8_validate(str, strlen(str), NULL)) 00070 return g_strdup(str); 00071 #endif 00072 00073 /* chardet encoding detector */ 00074 if ((out_str = cd_chardet_to_utf8(str, strlen(str), NULL, NULL, NULL)) != NULL) 00075 return out_str; 00076 00077 /* all else fails, we mask off character codes >= 128, replace with '?' */ 00078 return str_to_utf8_fallback(str); 00079 } 00080 00081 gchar * 00082 cd_chardet_to_utf8(const gchar * str, gssize len, gsize * arg_bytes_read, 00083 gsize * arg_bytes_write, GError ** error) 00084 { 00085 if (error) 00086 * error = NULL; 00087 00088 #ifdef USE_CHARDET 00089 gchar *det = NULL, *encoding = NULL; 00090 #endif 00091 gchar *ret = NULL; 00092 gsize *bytes_read, *bytes_write; 00093 gsize my_bytes_read, my_bytes_write; 00094 00095 bytes_read = arg_bytes_read != NULL ? arg_bytes_read : &my_bytes_read; 00096 bytes_write = arg_bytes_write != NULL ? 
arg_bytes_write : &my_bytes_write; 00097 00098 g_return_val_if_fail(str != NULL, NULL); 00099 00100 #ifdef USE_CHARDET 00101 if (libguess_validate_utf8(str, len)) 00102 #else 00103 if (g_utf8_validate(str, len, NULL)) 00104 #endif 00105 { 00106 if (len < 0) 00107 len = strlen (str); 00108 00109 ret = g_malloc (len + 1); 00110 memcpy (ret, str, len); 00111 ret[len] = 0; 00112 00113 if (arg_bytes_read != NULL) 00114 * arg_bytes_read = len; 00115 if (arg_bytes_write != NULL) 00116 * arg_bytes_write = len; 00117 00118 return ret; 00119 } 00120 #ifdef USE_CHARDET 00121 if (cfg.chardet_detector) 00122 det = cfg.chardet_detector; 00123 00124 if (det) 00125 { 00126 AUDDBG("guess encoding (%s) %s\n", det, str); 00127 encoding = (gchar *) libguess_determine_encoding(str, len, det); 00128 AUDDBG("encoding = %s\n", encoding); 00129 if (encoding == NULL) 00130 goto fallback; 00131 00132 ret = g_convert (str, len, "UTF-8", encoding, bytes_read, bytes_write, 00133 (error && * error) ? NULL : error); 00134 } 00135 00136 fallback: 00137 #endif 00138 00139 /* If detection failed or was not enabled, try fallbacks (if there are any) */ 00140 if (ret == NULL && cfg.chardet_fallback_s != NULL) 00141 { 00142 gchar **enc; 00143 for (enc = cfg.chardet_fallback_s; *enc != NULL; enc++) 00144 { 00145 ret = g_convert (str, len, "UTF-8", * enc, bytes_read, bytes_write, 00146 (error && * error) ? NULL : error); 00147 if (len == *bytes_read) 00148 break; 00149 else { 00150 g_free(ret); 00151 ret = NULL; 00152 } 00153 } 00154 } 00155 00156 /* First fallback: locale (duh!) */ 00157 if (ret == NULL) 00158 ret = g_locale_to_utf8 (str, len, bytes_read, bytes_write, 00159 (error && * error) ? NULL : error); 00160 00161 /* The final fallback is ISO-8859-1, if no other is specified or conversions fail */ 00162 if (ret == NULL) 00163 ret = g_convert (str, len, "UTF-8", "ISO-8859-1", bytes_read, 00164 bytes_write, (error && * error) ? 
NULL : error); 00165 00166 if (ret != NULL) 00167 { 00168 if (g_utf8_validate(ret, -1, NULL)) 00169 return ret; 00170 else 00171 { 00172 g_warning("g_utf8_validate() failed for converted string in cd_chardet_to_utf8: '%s'", ret); 00173 g_free(ret); 00174 return NULL; 00175 } 00176 } 00177 00178 return NULL; /* If we have no idea, return NULL. */ 00179 } 00180 00181 00182 void chardet_init(void) 00183 { 00184 str_to_utf8 = cd_str_to_utf8; 00185 chardet_to_utf8 = cd_chardet_to_utf8; 00186 }