Audacious $Id: Doxyfile 4280 2007-03-21 04:39:00Z nenolod $
chardet.c
Go to the documentation of this file.
00001 /*  Audacious
00002  *  Copyright (C) 2005-2007  Audacious development team.
00003  *
00004  *  This program is free software; you can redistribute it and/or modify
00005  *  it under the terms of the GNU General Public License as published by
00006  *  the Free Software Foundation; under version 3 of the License.
00007  *
00008  *  This program is distributed in the hope that it will be useful,
00009  *  but WITHOUT ANY WARRANTY; without even the implied warranty of
00010  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00011  *  GNU General Public License for more details.
00012  *
00013  *  You should have received a copy of the GNU General Public License
00014  *  along with this program.  If not, see <http://www.gnu.org/licenses>.
00015  *
00016  *  The Audacious team does not consider modular code linking to
00017  *  Audacious or using our public API to be a derived work.
00018  */
00019 
00020 #include <string.h>
00021 #include <libaudcore/audstrings.h>
00022 
00023 #include "audconfig.h"
00024 #include "config.h"
00025 #include "i18n.h"
00026 #include "debug.h"
00027 
00028 #ifdef USE_CHARDET
00029 #  include <libguess.h>
00030 #endif
00031 
00032 static gchar * cd_chardet_to_utf8 (const gchar * str, gssize len,
00033  gsize * arg_bytes_read, gsize * arg_bytes_write, GError ** error);
00034 
00035 static gchar * str_to_utf8_fallback (const gchar * str)
00036 {
00037     gchar * out = g_strconcat (str, _("  (invalid UTF-8)"), NULL);
00038 
00039     for (gchar * c = out; * c; c ++)
00040     {
00041         if (* c & 0x80)
00042             * c = '?';
00043     }
00044 
00045     return out;
00046 }
00047 
00048 static gchar * cd_str_to_utf8 (const gchar * str)
00049 {
00050     gchar *out_str;
00051 
00052     if (str == NULL)
00053         return NULL;
00054 
00055     /* Note: Currently, playlist calls this function repeatedly, even
00056      * if the string is already converted into utf-8.
00057      * chardet_to_utf8() would convert a valid utf-8 string into a
00058      * different utf-8 string, if fallback encodings were supplied and
00059      * the given string could be treated as a string in one of
00060      * fallback encodings. To avoid this, g_utf8_validate() had been
00061      * used at the top of evaluation.
00062      */
00063 
00064     /* Note 2: g_utf8_validate() has so called encapsulated utf-8
00065      * problem, thus chardet_to_utf8() took the place of that.
00066      */
00067 
00068     /* Note 3: As introducing madplug, the problem of conversion from
00069      * ISO-8859-1 to UTF-8 arose. This may be coped with g_convert()
00070      * located near the end of chardet_to_utf8(), but it requires utf8
00071      * validation guard where g_utf8_validate() was. New
00072      * dfa_validate_utf8() employs libguess' DFA engine to validate
00073      * utf-8 and can properly distinguish examples of encapsulated
00074      * utf-8. It is considered to be safe to use as a guard.
00075      */
00076 
00077     /* Already UTF-8? */
00078 #ifdef USE_CHARDET
00079     if (libguess_validate_utf8(str, strlen(str)))
00080         return g_strdup(str);
00081 #else
00082     if (g_utf8_validate(str, strlen(str), NULL))
00083         return g_strdup(str);
00084 #endif
00085 
00086     /* chardet encoding detector */
00087     if ((out_str = cd_chardet_to_utf8(str, strlen(str), NULL, NULL, NULL)) != NULL)
00088         return out_str;
00089 
00090     /* all else fails, we mask off character codes >= 128, replace with '?' */
00091     return str_to_utf8_fallback(str);
00092 }
00093 
00094 static gchar * cd_chardet_to_utf8 (const gchar * str, gssize len,
00095  gsize * arg_bytes_read, gsize * arg_bytes_write, GError ** error)
00096 {
00097     if (error)
00098         * error = NULL;
00099 
00100 #ifdef USE_CHARDET
00101     gchar *det = NULL, *encoding = NULL;
00102 #endif
00103     gchar *ret = NULL;
00104     gsize *bytes_read, *bytes_write;
00105     gsize my_bytes_read, my_bytes_write;
00106 
00107     bytes_read = arg_bytes_read != NULL ? arg_bytes_read : &my_bytes_read;
00108     bytes_write = arg_bytes_write != NULL ? arg_bytes_write : &my_bytes_write;
00109 
00110     g_return_val_if_fail(str != NULL, NULL);
00111 
00112 #ifdef USE_CHARDET
00113     if (libguess_validate_utf8(str, len))
00114 #else
00115     if (g_utf8_validate(str, len, NULL))
00116 #endif
00117     {
00118         if (len < 0)
00119             len = strlen (str);
00120 
00121         ret = g_malloc (len + 1);
00122         memcpy (ret, str, len);
00123         ret[len] = 0;
00124 
00125         if (arg_bytes_read != NULL)
00126             * arg_bytes_read = len;
00127         if (arg_bytes_write != NULL)
00128             * arg_bytes_write = len;
00129 
00130         return ret;
00131     }
00132 #ifdef USE_CHARDET
00133     if (cfg.chardet_detector)
00134         det = cfg.chardet_detector;
00135 
00136     if (det)
00137     {
00138         AUDDBG("guess encoding (%s) %s\n", det, str);
00139         encoding = (gchar *) libguess_determine_encoding(str, len, det);
00140         AUDDBG("encoding = %s\n", encoding);
00141         if (encoding == NULL)
00142             goto fallback;
00143 
00144         ret = g_convert (str, len, "UTF-8", encoding, bytes_read, bytes_write,
00145          (error && * error) ? NULL : error);
00146     }
00147 
00148 fallback:
00149 #endif
00150 
00151     /* If detection failed or was not enabled, try fallbacks (if there are any) */
00152     if (ret == NULL && cfg.chardet_fallback_s != NULL)
00153     {
00154         gchar **enc;
00155         for (enc = cfg.chardet_fallback_s; *enc != NULL; enc++)
00156         {
00157             ret = g_convert (str, len, "UTF-8", * enc, bytes_read, bytes_write,
00158              (error && * error) ? NULL : error);
00159             if (len == *bytes_read)
00160                 break;
00161             else {
00162                 g_free(ret);
00163                 ret = NULL;
00164             }
00165         }
00166     }
00167 
00168     /* First fallback: locale (duh!) */
00169     if (ret == NULL)
00170         ret = g_locale_to_utf8 (str, len, bytes_read, bytes_write,
00171          (error && * error) ? NULL : error);
00172 
00173     /* The final fallback is ISO-8859-1, if no other is specified or conversions fail */
00174     if (ret == NULL)
00175         ret = g_convert (str, len, "UTF-8", "ISO-8859-1", bytes_read,
00176          bytes_write, (error && * error) ? NULL : error);
00177 
00178     if (ret != NULL)
00179     {
00180         if (g_utf8_validate(ret, -1, NULL))
00181             return ret;
00182         else
00183         {
00184             g_warning("g_utf8_validate() failed for converted string in cd_chardet_to_utf8: '%s'", ret);
00185             g_free(ret);
00186             return NULL;
00187         }
00188     }
00189 
00190     return NULL; /* If we have no idea, return NULL. */
00191 }
00192 
00193 void chardet_init (void)
00194 {
00195     str_set_utf8_impl (cd_str_to_utf8, cd_chardet_to_utf8);
00196 }