PocketSphinx  0.6
src/libpocketsphinx/dict.c
00001 /* -*- c-basic-offset: 4; indent-tabs-mode: nil -*- */
00002 /* ====================================================================
00003  * Copyright (c) 1999-2004 Carnegie Mellon University.  All rights
00004  * reserved.
00005  *
00006  * Redistribution and use in source and binary forms, with or without
00007  * modification, are permitted provided that the following conditions
00008  * are met:
00009  *
00010  * 1. Redistributions of source code must retain the above copyright
00011  *    notice, this list of conditions and the following disclaimer. 
00012  *
00013  * 2. Redistributions in binary form must reproduce the above copyright
00014  *    notice, this list of conditions and the following disclaimer in
00015  *    the documentation and/or other materials provided with the
00016  *    distribution.
00017  *
00018  * This work was supported in part by funding from the Defense Advanced 
00019  * Research Projects Agency and the National Science Foundation of the 
00020  * United States of America, and the CMU Sphinx Speech Consortium.
00021  *
00022  * THIS SOFTWARE IS PROVIDED BY CARNEGIE MELLON UNIVERSITY ``AS IS'' AND 
00023  * ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, 
00024  * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
00025  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY
00026  * NOR ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
00027  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 
00028  * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 
00029  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 
00030  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 
00031  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 
00032  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
00033  *
00034  * ====================================================================
00035  *
00036  */
00037 
00038 /* System headers. */
00039 #include <string.h>
00040 
00041 /* SphinxBase headers. */
00042 #include <sphinxbase/pio.h>
00043 #include <sphinxbase/strfuncs.h>
00044 
00045 /* Local headers. */
00046 #include "dict.h"
00047 
00048 
/* Characters treated as field separators. */
#define DELIM   " \t\n"
/* One more than the largest CI phone ID. */
#define DEFAULT_NUM_PHONE       (MAX_S3CIPID+1)

/* When WIN32 is set, calls to snprintf are redirected to sprintf_s. */
#if WIN32
#define snprintf sprintf_s
#endif

/* Declared here; the table itself is defined in another translation unit. */
extern const char *const cmu6_lts_phone_table[];
00057 
00058 static s3cipid_t
00059 dict_ciphone_id(dict_t * d, const char *str)
00060 {
00061     if (d->nocase)
00062         return bin_mdef_ciphone_id_nocase(d->mdef, str);
00063     else
00064         return bin_mdef_ciphone_id(d->mdef, str);
00065 }
00066 
00067 
00068 const char *
00069 dict_ciphone_str(dict_t * d, s3wid_t wid, int32 pos)
00070 {
00071     assert(d != NULL);
00072     assert((wid >= 0) && (wid < d->n_word));
00073     assert((pos >= 0) && (pos < d->word[wid].pronlen));
00074 
00075     return bin_mdef_ciphone_str(d->mdef, d->word[wid].ciphone[pos]);
00076 }
00077 
00078 
00079 s3wid_t
00080 dict_add_word(dict_t * d, char const *word, s3cipid_t const * p, int32 np)
00081 {
00082     int32 len;
00083     dictword_t *wordp;
00084     s3wid_t newwid;
00085     char *wword;
00086 
00087     if (d->n_word >= d->max_words) {
00088         E_INFO("Reallocating to %d KiB for word entries\n",
00089                (d->max_words + S3DICT_INC_SZ) * sizeof(dictword_t) / 1024);
00090         d->word =
00091             (dictword_t *) ckd_realloc(d->word,
00092                                        (d->max_words +
00093                                         S3DICT_INC_SZ) * sizeof(dictword_t));
00094         d->max_words = d->max_words + S3DICT_INC_SZ;
00095         return BAD_S3WID;
00096     }
00097 
00098     wordp = d->word + d->n_word;
00099     wordp->word = (char *) ckd_salloc(word);    /* Freed in dict_free */
00100 
00101     /* Associate word string with d->n_word in hash table */
00102     if (hash_table_enter_int32(d->ht, wordp->word, d->n_word) != d->n_word) {
00103         ckd_free(wordp->word);
00104         wordp->word = NULL;
00105         return BAD_S3WID;
00106     }
00107 
00108     /* Fill in word entry, and set defaults */
00109     if (p && (np > 0)) {
00110         wordp->ciphone = (s3cipid_t *) ckd_malloc(np * sizeof(s3cipid_t));      /* Freed in dict_free */
00111         memcpy(wordp->ciphone, p, np * sizeof(s3cipid_t));
00112         wordp->pronlen = np;
00113     }
00114     else {
00115         wordp->ciphone = NULL;
00116         wordp->pronlen = 0;
00117     }
00118     wordp->alt = BAD_S3WID;
00119     wordp->basewid = d->n_word;
00120 
00121     /* Determine base/alt wids */
00122     wword = ckd_salloc(word);
00123     if ((len = dict_word2basestr(wword)) > 0) {
00124         int32 w;
00125 
00126         /* Truncated to a baseword string; find its ID */
00127         if (hash_table_lookup_int32(d->ht, wword, &w) < 0) {
00128             E_ERROR("Missing base word for: %s\n", word);
00129             ckd_free(wword);
00130             ckd_free(wordp->word);
00131             wordp->word = NULL;
00132             return BAD_S3WID;
00133         }
00134 
00135         /* Link into alt list */
00136         wordp->basewid = w;
00137         wordp->alt = d->word[w].alt;
00138         d->word[w].alt = d->n_word;
00139     }
00140     ckd_free(wword);
00141 
00142     newwid = d->n_word++;
00143 
00144     return newwid;
00145 }
00146 
00147 
00148 static int32
00149 dict_read(FILE * fp, dict_t * d)
00150 {
00151     lineiter_t *li;
00152     char **wptr;
00153     s3cipid_t *p;
00154     int32 lineno, nwd;
00155     s3wid_t w;
00156     int32 i, maxwd;
00157     size_t stralloc, phnalloc;
00158 
00159     maxwd = 512;
00160     p = (s3cipid_t *) ckd_calloc(maxwd + 4, sizeof(*p));
00161     wptr = (char **) ckd_calloc(maxwd, sizeof(char *)); /* Freed below */
00162 
00163     lineno = 0;
00164     stralloc = phnalloc = 0;
00165     for (li = lineiter_start(fp); li; li = lineiter_next(li)) {
00166         lineno++;
00167         if (0 == strncmp(li->buf, "##", 2)
00168             || 0 == strncmp(li->buf, ";;", 2))
00169             continue;
00170 
00171         if ((nwd = str2words(li->buf, wptr, maxwd)) < 0) {
00172             /* Increase size of p, wptr. */
00173             nwd = str2words(li->buf, NULL, 0);
00174             assert(nwd > maxwd); /* why else would it fail? */
00175             maxwd = nwd;
00176             p = (s3cipid_t *) ckd_realloc(p, (maxwd + 4) * sizeof(*p));
00177             wptr = (char **) ckd_realloc(wptr, maxwd * sizeof(*wptr));
00178         }
00179 
00180         if (nwd == 0)           /* Empty line */
00181             continue;
00182         /* wptr[0] is the word-string and wptr[1..nwd-1] the pronunciation sequence */
00183         if (nwd == 1) {
00184             E_ERROR("Line %d: No pronunciation for word '%s'; ignored\n",
00185                     lineno, wptr[0]);
00186             continue;
00187         }
00188 
00189 
00190         /* Convert pronunciation string to CI-phone-ids */
00191         for (i = 1; i < nwd; i++) {
00192             p[i - 1] = dict_ciphone_id(d, wptr[i]);
00193             if (NOT_S3CIPID(p[i - 1])) {
00194                 E_ERROR("Line %d: Phone '%s' is mising in the acoustic model; word '%s' ignored\n",
00195                         lineno, wptr[i], wptr[0]);
00196                 break;
00197             }
00198         }
00199 
00200         if (i == nwd) {         /* All CI-phones successfully converted to IDs */
00201             w = dict_add_word(d, wptr[0], p, nwd - 1);
00202             if (NOT_S3WID(w))
00203                 E_ERROR
00204                     ("Line %d: Failed to add the word '%s' (duplicate?); ignored\n",
00205                      lineno, wptr[0]);
00206             else {
00207                 stralloc += strlen(d->word[w].word);
00208                 phnalloc += d->word[w].pronlen * sizeof(s3cipid_t);
00209             }
00210         }
00211     }
00212     E_INFO("Allocated %d KiB for strings, %d KiB for phones\n",
00213            (int)stralloc / 1024, (int)phnalloc / 1024);
00214     ckd_free(p);
00215     ckd_free(wptr);
00216 
00217     return 0;
00218 }
00219 
00220 int
00221 dict_write(dict_t *dict, char const *filename, char const *format)
00222 {
00223     FILE *fh;
00224     int i;
00225 
00226     if ((fh = fopen(filename, "w")) == NULL) {
00227         E_ERROR_SYSTEM("Failed to open '%s'", filename);
00228         return -1;
00229     }
00230     for (i = 0; i < dict->n_word; ++i) {
00231         char *phones;
00232         int j, phlen;
00233         if (!dict_real_word(dict, i))
00234             continue;
00235         for (phlen = j = 0; j < dict_pronlen(dict, i); ++j)
00236             phlen += strlen(dict_ciphone_str(dict, i, j)) + 1;
00237         phones = ckd_calloc(1, phlen);
00238         for (j = 0; j < dict_pronlen(dict, i); ++j) {
00239             strcat(phones, dict_ciphone_str(dict, i, j));
00240             if (j != dict_pronlen(dict, i) - 1)
00241                 strcat(phones, " ");
00242         }
00243         fprintf(fh, "%-30s %s\n", dict_wordstr(dict, i), phones);
00244         ckd_free(phones);
00245     }
00246     fclose(fh);
00247     return 0;
00248 }
00249 
00250 
00251 dict_t *
00252 dict_init(cmd_ln_t *config, bin_mdef_t * mdef)
00253 {
00254     FILE *fp, *fp2;
00255     int32 n;
00256     lineiter_t *li;
00257     dict_t *d;
00258     s3cipid_t sil;
00259     char const *dictfile = NULL, *fillerfile = NULL;
00260 
00261     if (config) {
00262         dictfile = cmd_ln_str_r(config, "-dict");
00263         fillerfile = cmd_ln_str_r(config, "-fdict");
00264     }
00265 
00266     /*
00267      * First obtain #words in dictionary (for hash table allocation).
00268      * Reason: The PC NT system doesn't like to grow memory gradually.  Better to allocate
00269      * all the required memory in one go.
00270      */
00271     fp = NULL;
00272     n = 0;
00273     if (dictfile) {
00274         if ((fp = fopen(dictfile, "r")) == NULL)
00275             E_FATAL_SYSTEM("Failed to open dictionary file '%s' for reading", dictfile);
00276         for (li = lineiter_start(fp); li; li = lineiter_next(li)) {
00277             if (li->buf[0] != '#')
00278                 n++;
00279         }
00280         rewind(fp);
00281     }
00282 
00283     fp2 = NULL;
00284     if (fillerfile) {
00285         if ((fp2 = fopen(fillerfile, "r")) == NULL)
00286             E_FATAL_SYSTEM("Failed to open filler dictionary file '%s' for reading", fillerfile);
00287         for (li = lineiter_start(fp2); li; li = lineiter_next(li)) {
00288             if (li->buf[0] != '#')
00289                 n++;
00290         }
00291         rewind(fp2);
00292     }
00293 
00294     /*
00295      * Allocate dict entries.  HACK!!  Allow some extra entries for words not in file.
00296      * Also check for type size restrictions.
00297      */
00298     d = (dict_t *) ckd_calloc(1, sizeof(dict_t));       /* freed in dict_free() */
00299     d->refcnt = 1;
00300     d->max_words =
00301         (n + S3DICT_INC_SZ < MAX_S3WID) ? n + S3DICT_INC_SZ : MAX_S3WID;
00302     if (n >= MAX_S3WID)
00303         E_FATAL("Number of words in dictionaries (%d) exceeds limit (%d)\n", n,
00304                 MAX_S3WID);
00305 
00306     E_INFO("Allocating %d * %d bytes (%d KiB) for word entries\n",
00307            d->max_words, sizeof(dictword_t),
00308            d->max_words * sizeof(dictword_t) / 1024);
00309     d->word = (dictword_t *) ckd_calloc(d->max_words, sizeof(dictword_t));      /* freed in dict_free() */
00310     d->n_word = 0;
00311     if (mdef)
00312         d->mdef = bin_mdef_retain(mdef);
00313 
00314     /* Create new hash table for word strings; case-insensitive word strings */
00315     if (config && cmd_ln_exists_r(config, "-dictcase"))
00316         d->nocase = cmd_ln_boolean_r(config, "-dictcase");
00317     d->ht = hash_table_new(d->max_words, d->nocase);
00318 
00319     /* Digest main dictionary file */
00320     if (fp) {
00321         E_INFO("Reading main dictionary: %s\n", dictfile);
00322         dict_read(fp, d);
00323         fclose(fp);
00324         E_INFO("%d words read\n", d->n_word);
00325     }
00326 
00327     /* Now the filler dictionary file, if it exists */
00328     d->filler_start = d->n_word;
00329     if (fillerfile) {
00330         E_INFO("Reading filler dictionary: %s\n", fillerfile);
00331         dict_read(fp2, d);
00332         fclose(fp2);
00333         E_INFO("%d words read\n", d->n_word - d->filler_start);
00334     }
00335     if (mdef)
00336         sil = bin_mdef_silphone(mdef);
00337     else
00338         sil = 0;
00339     if (dict_wordid(d, S3_START_WORD) == BAD_S3WID) {
00340         dict_add_word(d, S3_START_WORD, &sil, 1);
00341     }
00342     if (dict_wordid(d, S3_FINISH_WORD) == BAD_S3WID) {
00343         dict_add_word(d, S3_FINISH_WORD, &sil, 1);
00344     }
00345     if (dict_wordid(d, S3_SILENCE_WORD) == BAD_S3WID) {
00346         dict_add_word(d, S3_SILENCE_WORD, &sil, 1);
00347     }
00348 
00349     d->filler_end = d->n_word - 1;
00350 
00351     /* Initialize distinguished word-ids */
00352     d->startwid = dict_wordid(d, S3_START_WORD);
00353     d->finishwid = dict_wordid(d, S3_FINISH_WORD);
00354     d->silwid = dict_wordid(d, S3_SILENCE_WORD);
00355 
00356     if ((d->filler_start > d->filler_end)
00357         || (!dict_filler_word(d, d->silwid)))
00358         E_FATAL("Word '%s' must occur (only) in filler dictionary\n",
00359                 S3_SILENCE_WORD);
00360 
00361     /* No check that alternative pronunciations for filler words are in filler range!! */
00362 
00363     return d;
00364 }
00365 
00366 
00367 s3wid_t
00368 dict_wordid(dict_t *d, const char *word)
00369 {
00370     int32 w;
00371 
00372     assert(d);
00373     assert(word);
00374 
00375     if (hash_table_lookup_int32(d->ht, word, &w) < 0)
00376         return (BAD_S3WID);
00377     return w;
00378 }
00379 
00380 
00381 int
00382 dict_filler_word(dict_t *d, s3wid_t w)
00383 {
00384     assert(d);
00385     assert((w >= 0) && (w < d->n_word));
00386 
00387     w = dict_basewid(d, w);
00388     if ((w == d->startwid) || (w == d->finishwid))
00389         return 0;
00390     if ((w >= d->filler_start) && (w <= d->filler_end))
00391         return 1;
00392     return 0;
00393 }
00394 
00395 int
00396 dict_real_word(dict_t *d, s3wid_t w)
00397 {
00398     assert(d);
00399     assert((w >= 0) && (w < d->n_word));
00400 
00401     w = dict_basewid(d, w);
00402     if ((w == d->startwid) || (w == d->finishwid))
00403         return 0;
00404     if ((w >= d->filler_start) && (w <= d->filler_end))
00405         return 0;
00406     return 1;
00407 }
00408 
00409 
/*
 * If `word` has the form "<baseword>(...)", truncate it in place at the
 * last '(' and return the length of the base word.
 *
 * Returns -1 and leaves `word` untouched when it does not end in ')',
 * has no '(' after its first character, or is too short to hold a
 * non-empty base word (this includes the empty string).
 */
int32
dict_word2basestr(char *word)
{
    size_t len = strlen(word);
    size_t i;

    /* The shortest string with a non-empty base is "x()" */
    if (len < 3 || word[len - 1] != ')')
        return -1;

    /* Scan backwards for the '('; index 0 is excluded so the base is never empty */
    for (i = len - 2; (i > 0) && (word[i] != '('); --i)
        ;

    if (i == 0)
        return -1;

    /* The word is of the form <baseword>(...); strip from left-paren */
    word[i] = '\0';
    return (int32) i;
}
00428 
00429 dict_t *
00430 dict_retain(dict_t *d)
00431 {
00432     ++d->refcnt;
00433     return d;
00434 }
00435 
00436 int
00437 dict_free(dict_t * d)
00438 {
00439     int i;
00440     dictword_t *word;
00441 
00442     if (d == NULL)
00443         return 0;
00444     if (--d->refcnt > 0)
00445         return d->refcnt;
00446 
00447     /* First Step, free all memory allocated for each word */
00448     for (i = 0; i < d->n_word; i++) {
00449         word = (dictword_t *) & (d->word[i]);
00450         if (word->word)
00451             ckd_free((void *) word->word);
00452         if (word->ciphone)
00453             ckd_free((void *) word->ciphone);
00454     }
00455 
00456     if (d->word)
00457         ckd_free((void *) d->word);
00458     if (d->ht)
00459         hash_table_free(d->ht);
00460     if (d->mdef)
00461         bin_mdef_free(d->mdef);
00462     ckd_free((void *) d);
00463 
00464     return 0;
00465 }
00466 
00467 void
00468 dict_report(dict_t * d)
00469 {
00470     E_INFO_NOFN("Initialization of dict_t, report:\n");
00471     E_INFO_NOFN("Max word: %d\n", d->max_words);
00472     E_INFO_NOFN("No of word: %d\n", d->n_word);
00473     E_INFO_NOFN("\n");
00474 }