/* PocketSphinx 0.6 */
00001 /* -*- c-basic-offset: 4; indent-tabs-mode: nil -*- */ 00002 /* ==================================================================== 00003 * Copyright (c) 2008 Carnegie Mellon University. All rights 00004 * reserved. 00005 * 00006 * Redistribution and use in source and binary forms, with or without 00007 * modification, are permitted provided that the following conditions 00008 * are met: 00009 * 00010 * 1. Redistributions of source code must retain the above copyright 00011 * notice, this list of conditions and the following disclaimer. 00012 * 00013 * 2. Redistributions in binary form must reproduce the above copyright 00014 * notice, this list of conditions and the following disclaimer in 00015 * the documentation and/or other materials provided with the 00016 * distribution. 00017 * 00018 * This work was supported in part by funding from the Defense Advanced 00019 * Research Projects Agency and the National Science Foundation of the 00020 * United States of America, and the CMU Sphinx Speech Consortium. 00021 * 00022 * THIS SOFTWARE IS PROVIDED BY CARNEGIE MELLON UNIVERSITY ``AS IS'' AND 00023 * ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, 00024 * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 00025 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY 00026 * NOR ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 00027 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 00028 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 00029 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 00030 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 00031 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 00032 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
00033 * 00034 * ==================================================================== 00035 * 00036 */ 00037 00038 /* System headers. */ 00039 #include <stdio.h> 00040 #include <assert.h> 00041 00042 /* SphinxBase headers. */ 00043 #include <sphinxbase/err.h> 00044 #include <sphinxbase/strfuncs.h> 00045 #include <sphinxbase/filename.h> 00046 #include <sphinxbase/pio.h> 00047 00048 /* Local headers. */ 00049 #include "cmdln_macro.h" 00050 #include "pocketsphinx_internal.h" 00051 #include "ps_lattice_internal.h" 00052 #include "phone_loop_search.h" 00053 #include "fsg_search_internal.h" 00054 #include "ngram_search.h" 00055 #include "ngram_search_fwdtree.h" 00056 #include "ngram_search_fwdflat.h" 00057 00058 static const arg_t ps_args_def[] = { 00059 POCKETSPHINX_OPTIONS, 00060 CMDLN_EMPTY_OPTION 00061 }; 00062 00063 /* I'm not sure what the portable way to do this is. */ 00064 static int 00065 file_exists(const char *path) 00066 { 00067 FILE *tmp; 00068 00069 tmp = fopen(path, "rb"); 00070 if (tmp) fclose(tmp); 00071 return (tmp != NULL); 00072 } 00073 00074 static int 00075 hmmdir_exists(const char *path) 00076 { 00077 FILE *tmp; 00078 char *mdef = string_join(path, "/mdef", NULL); 00079 00080 tmp = fopen(mdef, "rb"); 00081 if (tmp) fclose(tmp); 00082 ckd_free(mdef); 00083 return (tmp != NULL); 00084 } 00085 00086 static void 00087 ps_add_file(ps_decoder_t *ps, const char *arg, 00088 const char *hmmdir, const char *file) 00089 { 00090 char *tmp = string_join(hmmdir, "/", file, NULL); 00091 00092 if (cmd_ln_str_r(ps->config, arg) == NULL && file_exists(tmp)) 00093 cmd_ln_set_str_r(ps->config, arg, tmp); 00094 ckd_free(tmp); 00095 } 00096 00097 static void 00098 ps_init_defaults(ps_decoder_t *ps) 00099 { 00100 char const *hmmdir, *lmfile, *dictfile; 00101 00102 /* Disable memory mapping on Blackfin (FIXME: should be uClinux in general). 
*/ 00103 #ifdef __ADSPBLACKFIN__ 00104 E_INFO("Will not use mmap() on uClinux/Blackfin."); 00105 cmd_ln_set_boolean_r(ps->config, "-mmap", FALSE); 00106 #endif 00107 00108 #ifdef MODELDIR 00109 /* Set default acoustic and language models. */ 00110 hmmdir = cmd_ln_str_r(ps->config, "-hmm"); 00111 lmfile = cmd_ln_str_r(ps->config, "-lm"); 00112 dictfile = cmd_ln_str_r(ps->config, "-dict"); 00113 if (hmmdir == NULL && hmmdir_exists(MODELDIR "/hmm/en_US/hub4wsj_sc_8k")) { 00114 hmmdir = MODELDIR "/hmm/en_US/hub4wsj_sc_8k"; 00115 cmd_ln_set_str_r(ps->config, "-hmm", hmmdir); 00116 } 00117 if (lmfile == NULL && !cmd_ln_str_r(ps->config, "-fsg") 00118 && !cmd_ln_str_r(ps->config, "-jsgf") 00119 && file_exists(MODELDIR "/lm/en_US/hub4.5000.DMP")) { 00120 lmfile = MODELDIR "/lm/en_US/hub4.5000.DMP"; 00121 cmd_ln_set_str_r(ps->config, "-lm", lmfile); 00122 } 00123 if (dictfile == NULL && file_exists(MODELDIR "/lm/en_US/cmu07a.dic")) { 00124 dictfile = MODELDIR "/lm/en_US/cmu07a.dic"; 00125 cmd_ln_set_str_r(ps->config, "-dict", dictfile); 00126 } 00127 00128 /* Expand acoustic and language model filenames relative to installation path. 
*/ 00129 if (hmmdir && !path_is_absolute(hmmdir) && !hmmdir_exists(hmmdir)) { 00130 char *tmphmm = string_join(MODELDIR "/hmm/", hmmdir, NULL); 00131 if (hmmdir_exists(tmphmm)) { 00132 cmd_ln_set_str_r(ps->config, "-hmm", tmphmm); 00133 } else { 00134 E_ERROR("Failed to find mdef file inside the model folder specified with -hmm '%s'\n", hmmdir); 00135 } 00136 ckd_free(tmphmm); 00137 } 00138 if (lmfile && !path_is_absolute(lmfile) && !file_exists(lmfile)) { 00139 char *tmplm = string_join(MODELDIR "/lm/", lmfile, NULL); 00140 cmd_ln_set_str_r(ps->config, "-lm", tmplm); 00141 ckd_free(tmplm); 00142 } 00143 if (dictfile && !path_is_absolute(dictfile) && !file_exists(dictfile)) { 00144 char *tmpdict = string_join(MODELDIR "/lm/", dictfile, NULL); 00145 cmd_ln_set_str_r(ps->config, "-dict", tmpdict); 00146 ckd_free(tmpdict); 00147 } 00148 #endif 00149 00150 /* Get acoustic model filenames and add them to the command-line */ 00151 if ((hmmdir = cmd_ln_str_r(ps->config, "-hmm")) != NULL) { 00152 ps_add_file(ps, "-mdef", hmmdir, "mdef"); 00153 ps_add_file(ps, "-mean", hmmdir, "means"); 00154 ps_add_file(ps, "-var", hmmdir, "variances"); 00155 ps_add_file(ps, "-tmat", hmmdir, "transition_matrices"); 00156 ps_add_file(ps, "-mixw", hmmdir, "mixture_weights"); 00157 ps_add_file(ps, "-sendump", hmmdir, "sendump"); 00158 ps_add_file(ps, "-fdict", hmmdir, "noisedict"); 00159 ps_add_file(ps, "-lda", hmmdir, "feature_transform"); 00160 ps_add_file(ps, "-featparams", hmmdir, "feat.params"); 00161 ps_add_file(ps, "-senmgau", hmmdir, "senmgau"); 00162 } 00163 } 00164 00165 static void 00166 ps_free_searches(ps_decoder_t *ps) 00167 { 00168 gnode_t *gn; 00169 00170 if (ps->searches == NULL) 00171 return; 00172 00173 for (gn = ps->searches; gn; gn = gnode_next(gn)) 00174 ps_search_free(gnode_ptr(gn)); 00175 glist_free(ps->searches); 00176 ps->searches = NULL; 00177 ps->search = NULL; 00178 } 00179 00180 static ps_search_t * 00181 ps_find_search(ps_decoder_t *ps, char const *name) 00182 { 
00183 gnode_t *gn; 00184 00185 for (gn = ps->searches; gn; gn = gnode_next(gn)) { 00186 if (0 == strcmp(ps_search_name(gnode_ptr(gn)), name)) 00187 return (ps_search_t *)gnode_ptr(gn); 00188 } 00189 return NULL; 00190 } 00191 00192 int 00193 ps_reinit(ps_decoder_t *ps, cmd_ln_t *config) 00194 { 00195 char const *lmfile, *lmctl = NULL; 00196 00197 if (config && config != ps->config) { 00198 cmd_ln_free_r(ps->config); 00199 ps->config = config; 00200 } 00201 #ifndef _WIN32_WCE 00202 /* Set up logging. */ 00203 if (cmd_ln_str_r(ps->config, "-logfn")) 00204 err_set_logfile(cmd_ln_str_r(ps->config, "-logfn")); 00205 #endif 00206 err_set_debug_level(cmd_ln_int32_r(ps->config, "-debug")); 00207 ps->mfclogdir = cmd_ln_str_r(ps->config, "-mfclogdir"); 00208 ps->rawlogdir = cmd_ln_str_r(ps->config, "-rawlogdir"); 00209 ps->senlogdir = cmd_ln_str_r(ps->config, "-senlogdir"); 00210 00211 /* Fill in some default arguments. */ 00212 ps_init_defaults(ps); 00213 00214 /* Free old searches (do this before other reinit) */ 00215 ps_free_searches(ps); 00216 00217 /* Free old acmod. */ 00218 acmod_free(ps->acmod); 00219 ps->acmod = NULL; 00220 00221 /* Free old dictionary (must be done after the two things above) */ 00222 dict_free(ps->dict); 00223 ps->dict = NULL; 00224 00225 00226 /* Logmath computation (used in acmod and search) */ 00227 if (ps->lmath == NULL 00228 || (logmath_get_base(ps->lmath) != 00229 (float64)cmd_ln_float32_r(ps->config, "-logbase"))) { 00230 if (ps->lmath) 00231 logmath_free(ps->lmath); 00232 ps->lmath = logmath_init 00233 ((float64)cmd_ln_float32_r(ps->config, "-logbase"), 0, 00234 cmd_ln_boolean_r(ps->config, "-bestpath")); 00235 } 00236 00237 /* Acoustic model (this is basically everything that 00238 * uttproc.c, senscr.c, and others used to do) */ 00239 if ((ps->acmod = acmod_init(ps->config, ps->lmath, NULL, NULL)) == NULL) 00240 return -1; 00241 /* Make the acmod's feature buffer growable if we are doing two-pass search. 
*/ 00242 if (cmd_ln_boolean_r(ps->config, "-fwdflat") 00243 && cmd_ln_boolean_r(ps->config, "-fwdtree")) 00244 acmod_set_grow(ps->acmod, TRUE); 00245 00246 if ((ps->pl_window = cmd_ln_int32_r(ps->config, "-pl_window"))) { 00247 /* Initialize an auxiliary phone loop search, which will run in 00248 * "parallel" with FSG or N-Gram search. */ 00249 if ((ps->phone_loop = phone_loop_search_init(ps->config, 00250 ps->acmod, ps->dict)) == NULL) 00251 return -1; 00252 ps->searches = glist_add_ptr(ps->searches, ps->phone_loop); 00253 } 00254 00255 /* Dictionary and triphone mappings (depends on acmod). */ 00256 /* FIXME: pass config, change arguments, implement LTS, etc. */ 00257 if ((ps->dict = dict_init(ps->config, ps->acmod->mdef)) == NULL) 00258 return -1; 00259 00260 /* Determine whether we are starting out in FSG or N-Gram search mode. */ 00261 if (cmd_ln_str_r(ps->config, "-fsg") || cmd_ln_str_r(ps->config, "-jsgf")) { 00262 ps_search_t *fsgs; 00263 00264 if ((ps->d2p = dict2pid_build(ps->acmod->mdef, ps->dict)) == NULL) 00265 return -1; 00266 if ((fsgs = fsg_search_init(ps->config, ps->acmod, ps->dict, ps->d2p)) == NULL) 00267 return -1; 00268 fsgs->pls = ps->phone_loop; 00269 ps->searches = glist_add_ptr(ps->searches, fsgs); 00270 ps->search = fsgs; 00271 } 00272 else if ((lmfile = cmd_ln_str_r(ps->config, "-lm")) 00273 || (lmctl = cmd_ln_str_r(ps->config, "-lmctl"))) { 00274 ps_search_t *ngs; 00275 00276 if ((ps->d2p = dict2pid_build(ps->acmod->mdef, ps->dict)) == NULL) 00277 return -1; 00278 if ((ngs = ngram_search_init(ps->config, ps->acmod, ps->dict, ps->d2p)) == NULL) 00279 return -1; 00280 ngs->pls = ps->phone_loop; 00281 ps->searches = glist_add_ptr(ps->searches, ngs); 00282 ps->search = ngs; 00283 } 00284 /* Otherwise, we will initialize the search whenever the user 00285 * decides to load an FSG or a language model. 
*/ 00286 else { 00287 if ((ps->d2p = dict2pid_build(ps->acmod->mdef, ps->dict)) == NULL) 00288 return -1; 00289 } 00290 00291 /* Initialize performance timer. */ 00292 ps->perf.name = "decode"; 00293 ptmr_init(&ps->perf); 00294 00295 return 0; 00296 } 00297 00298 ps_decoder_t * 00299 ps_init(cmd_ln_t *config) 00300 { 00301 ps_decoder_t *ps; 00302 00303 ps = ckd_calloc(1, sizeof(*ps)); 00304 ps->refcount = 1; 00305 if (ps_reinit(ps, config) < 0) { 00306 ps_free(ps); 00307 return NULL; 00308 } 00309 return ps; 00310 } 00311 00312 arg_t const * 00313 ps_args(void) 00314 { 00315 return ps_args_def; 00316 } 00317 00318 ps_decoder_t * 00319 ps_retain(ps_decoder_t *ps) 00320 { 00321 ++ps->refcount; 00322 return ps; 00323 } 00324 00325 int 00326 ps_free(ps_decoder_t *ps) 00327 { 00328 gnode_t *gn; 00329 00330 if (ps == NULL) 00331 return 0; 00332 if (--ps->refcount > 0) 00333 return ps->refcount; 00334 for (gn = ps->searches; gn; gn = gnode_next(gn)) 00335 ps_search_free(gnode_ptr(gn)); 00336 glist_free(ps->searches); 00337 dict_free(ps->dict); 00338 dict2pid_free(ps->d2p); 00339 acmod_free(ps->acmod); 00340 logmath_free(ps->lmath); 00341 cmd_ln_free_r(ps->config); 00342 ckd_free(ps->uttid); 00343 ckd_free(ps); 00344 return 0; 00345 } 00346 00347 char const * 00348 ps_get_uttid(ps_decoder_t *ps) 00349 { 00350 return ps->uttid; 00351 } 00352 00353 cmd_ln_t * 00354 ps_get_config(ps_decoder_t *ps) 00355 { 00356 return ps->config; 00357 } 00358 00359 logmath_t * 00360 ps_get_logmath(ps_decoder_t *ps) 00361 { 00362 return ps->lmath; 00363 } 00364 00365 fe_t * 00366 ps_get_fe(ps_decoder_t *ps) 00367 { 00368 return ps->acmod->fe; 00369 } 00370 00371 feat_t * 00372 ps_get_feat(ps_decoder_t *ps) 00373 { 00374 return ps->acmod->fcb; 00375 } 00376 00377 ps_mllr_t * 00378 ps_update_mllr(ps_decoder_t *ps, ps_mllr_t *mllr) 00379 { 00380 return acmod_update_mllr(ps->acmod, mllr); 00381 } 00382 00383 ngram_model_t * 00384 ps_get_lmset(ps_decoder_t *ps) 00385 { 00386 if (ps->search == NULL 
00387 || 0 != strcmp(ps_search_name(ps->search), "ngram")) 00388 return NULL; 00389 return ((ngram_search_t *)ps->search)->lmset; 00390 } 00391 00392 ngram_model_t * 00393 ps_update_lmset(ps_decoder_t *ps, ngram_model_t *lmset) 00394 { 00395 ngram_search_t *ngs; 00396 ps_search_t *search; 00397 00398 /* Look for N-Gram search. */ 00399 search = ps_find_search(ps, "ngram"); 00400 if (search == NULL) { 00401 /* Initialize N-Gram search. */ 00402 search = ngram_search_init(ps->config, ps->acmod, ps->dict, ps->d2p); 00403 if (search == NULL) 00404 return NULL; 00405 search->pls = ps->phone_loop; 00406 ps->searches = glist_add_ptr(ps->searches, search); 00407 ngs = (ngram_search_t *)search; 00408 } 00409 else { 00410 ngs = (ngram_search_t *)search; 00411 /* Free any previous lmset if this is a new one. */ 00412 if (ngs->lmset != NULL && ngs->lmset != lmset) 00413 ngram_model_free(ngs->lmset); 00414 ngs->lmset = lmset; 00415 /* Tell N-Gram search to update its view of the world. */ 00416 if (ps_search_reinit(search, ps->dict, ps->d2p) < 0) 00417 return NULL; 00418 } 00419 ps->search = search; 00420 return ngs->lmset; 00421 } 00422 00423 fsg_set_t * 00424 ps_get_fsgset(ps_decoder_t *ps) 00425 { 00426 if (ps->search == NULL 00427 || 0 != strcmp(ps_search_name(ps->search), "fsg")) 00428 return NULL; 00429 return (fsg_set_t *)ps->search; 00430 } 00431 00432 fsg_set_t * 00433 ps_update_fsgset(ps_decoder_t *ps) 00434 { 00435 ps_search_t *search; 00436 00437 /* Look for FSG search. */ 00438 search = ps_find_search(ps, "fsg"); 00439 if (search == NULL) { 00440 /* Initialize FSG search. */ 00441 search = fsg_search_init(ps->config, 00442 ps->acmod, ps->dict, ps->d2p); 00443 search->pls = ps->phone_loop; 00444 ps->searches = glist_add_ptr(ps->searches, search); 00445 } 00446 else { 00447 /* Tell FSG search to update its view of the world. 
*/ 00448 if (ps_search_reinit(search, ps->dict, ps->d2p) < 0) 00449 return NULL; 00450 } 00451 ps->search = search; 00452 return (fsg_set_t *)search; 00453 } 00454 00455 int 00456 ps_load_dict(ps_decoder_t *ps, char const *dictfile, 00457 char const *fdictfile, char const *format) 00458 { 00459 cmd_ln_t *newconfig; 00460 dict2pid_t *d2p; 00461 dict_t *dict; 00462 gnode_t *gn; 00463 int rv; 00464 00465 /* Create a new scratch config to load this dict (so existing one 00466 * won't be affected if it fails) */ 00467 newconfig = cmd_ln_init(NULL, ps_args(), TRUE, NULL); 00468 cmd_ln_set_boolean_r(newconfig, "-dictcase", 00469 cmd_ln_boolean_r(ps->config, "-dictcase")); 00470 cmd_ln_set_str_r(newconfig, "-dict", dictfile); 00471 if (fdictfile) 00472 cmd_ln_set_str_r(newconfig, "-fdict", fdictfile); 00473 else 00474 cmd_ln_set_str_r(newconfig, "-fdict", 00475 cmd_ln_str_r(ps->config, "-fdict")); 00476 00477 /* Try to load it. */ 00478 if ((dict = dict_init(newconfig, ps->acmod->mdef)) == NULL) { 00479 cmd_ln_free_r(newconfig); 00480 return -1; 00481 } 00482 00483 /* Reinit the dict2pid. */ 00484 if ((d2p = dict2pid_build(ps->acmod->mdef, dict)) == NULL) { 00485 cmd_ln_free_r(newconfig); 00486 return -1; 00487 } 00488 00489 /* Success! Update the existing config to reflect new dicts and 00490 * drop everything into place. */ 00491 cmd_ln_free_r(newconfig); 00492 cmd_ln_set_str_r(ps->config, "-dict", dictfile); 00493 if (fdictfile) 00494 cmd_ln_set_str_r(ps->config, "-fdict", fdictfile); 00495 dict_free(ps->dict); 00496 ps->dict = dict; 00497 dict2pid_free(ps->d2p); 00498 ps->d2p = d2p; 00499 00500 /* And tell all searches to reconfigure themselves. 
*/ 00501 for (gn = ps->searches; gn; gn = gnode_next(gn)) { 00502 ps_search_t *search = gnode_ptr(gn); 00503 if ((rv = ps_search_reinit(search, dict, d2p)) < 0) 00504 return rv; 00505 } 00506 00507 return 0; 00508 } 00509 00510 int 00511 ps_save_dict(ps_decoder_t *ps, char const *dictfile, 00512 char const *format) 00513 { 00514 return dict_write(ps->dict, dictfile, format); 00515 } 00516 00517 int 00518 ps_add_word(ps_decoder_t *ps, 00519 char const *word, 00520 char const *phones, 00521 int update) 00522 { 00523 int32 wid, lmwid; 00524 ngram_model_t *lmset; 00525 s3cipid_t *pron; 00526 char **phonestr, *tmp; 00527 int np, i, rv; 00528 00529 /* Parse phones into an array of phone IDs. */ 00530 tmp = ckd_salloc(phones); 00531 np = str2words(tmp, NULL, 0); 00532 phonestr = ckd_calloc(np, sizeof(*phonestr)); 00533 str2words(tmp, phonestr, np); 00534 pron = ckd_calloc(np, sizeof(*pron)); 00535 for (i = 0; i < np; ++i) { 00536 pron[i] = bin_mdef_ciphone_id(ps->acmod->mdef, phonestr[i]); 00537 if (pron[i] == -1) { 00538 E_ERROR("Unknown phone %s in phone string %s\n", 00539 phonestr[i], tmp); 00540 ckd_free(phonestr); 00541 ckd_free(tmp); 00542 ckd_free(pron); 00543 return -1; 00544 } 00545 } 00546 /* No longer needed. */ 00547 ckd_free(phonestr); 00548 ckd_free(tmp); 00549 00550 /* Add it to the dictionary. */ 00551 if ((wid = dict_add_word(ps->dict, word, pron, np)) == -1) { 00552 ckd_free(pron); 00553 return -1; 00554 } 00555 /* No longer needed. */ 00556 ckd_free(pron); 00557 00558 /* Now we also have to add it to dict2pid. */ 00559 dict2pid_add_word(ps->d2p, wid); 00560 00561 if ((lmset = ps_get_lmset(ps)) != NULL) { 00562 /* Add it to the LM set (meaning, the current LM). In a perfect 00563 * world, this would result in the same WID, but because of the 00564 * weird way that word IDs are handled, it doesn't. 
*/ 00565 if ((lmwid = ngram_model_add_word(lmset, word, 1.0)) 00566 == NGRAM_INVALID_WID) 00567 return -1; 00568 } 00569 00570 /* Rebuild the widmap and search tree if requested. */ 00571 if (update) { 00572 if ((rv = ps_search_reinit(ps->search, ps->dict, ps->d2p) < 0)) 00573 return rv; 00574 } 00575 return wid; 00576 } 00577 00578 int 00579 ps_decode_raw(ps_decoder_t *ps, FILE *rawfh, 00580 char const *uttid, long maxsamps) 00581 { 00582 long total, pos; 00583 00584 ps_start_utt(ps, uttid); 00585 /* If this file is seekable or maxsamps is specified, then decode 00586 * the whole thing at once. */ 00587 if (maxsamps != -1 || (pos = ftell(rawfh)) >= 0) { 00588 int16 *data; 00589 00590 if (maxsamps == -1) { 00591 long endpos; 00592 fseek(rawfh, 0, SEEK_END); 00593 endpos = ftell(rawfh); 00594 fseek(rawfh, pos, SEEK_SET); 00595 maxsamps = endpos - pos; 00596 } 00597 data = ckd_calloc(maxsamps, sizeof(*data)); 00598 total = fread(data, sizeof(*data), maxsamps, rawfh); 00599 ps_process_raw(ps, data, total, FALSE, TRUE); 00600 ckd_free(data); 00601 } 00602 else { 00603 /* Otherwise decode it in a stream. 
*/ 00604 total = 0; 00605 while (!feof(rawfh)) { 00606 int16 data[256]; 00607 size_t nread; 00608 00609 nread = fread(data, sizeof(*data), sizeof(data)/sizeof(*data), rawfh); 00610 ps_process_raw(ps, data, nread, FALSE, FALSE); 00611 total += nread; 00612 } 00613 } 00614 ps_end_utt(ps); 00615 return total; 00616 } 00617 00618 int 00619 ps_start_utt(ps_decoder_t *ps, char const *uttid) 00620 { 00621 int rv; 00622 00623 if (ps->search == NULL) { 00624 E_ERROR("No search module is selected, did you forget to " 00625 "specify a language model or grammar?\n"); 00626 return -1; 00627 } 00628 00629 ptmr_reset(&ps->perf); 00630 ptmr_start(&ps->perf); 00631 00632 if (uttid) { 00633 ckd_free(ps->uttid); 00634 ps->uttid = ckd_salloc(uttid); 00635 } 00636 else { 00637 char nuttid[16]; 00638 ckd_free(ps->uttid); 00639 sprintf(nuttid, "%09u", ps->uttno); 00640 ps->uttid = ckd_salloc(nuttid); 00641 ++ps->uttno; 00642 } 00643 /* Remove any residual word lattice and hypothesis. */ 00644 ps_lattice_free(ps->search->dag); 00645 ps->search->dag = NULL; 00646 ps->search->last_link = NULL; 00647 ps->search->post = 0; 00648 ckd_free(ps->search->hyp_str); 00649 ps->search->hyp_str = NULL; 00650 00651 if ((rv = acmod_start_utt(ps->acmod)) < 0) 00652 return rv; 00653 00654 /* Start logging features and audio if requested. 
*/ 00655 if (ps->mfclogdir) { 00656 char *logfn = string_join(ps->mfclogdir, "/", 00657 ps->uttid, ".mfc", NULL); 00658 FILE *mfcfh; 00659 E_INFO("Writing MFCC log file: %s\n", logfn); 00660 if ((mfcfh = fopen(logfn, "wb")) == NULL) { 00661 E_ERROR_SYSTEM("Failed to open MFCC log file %s", logfn); 00662 ckd_free(logfn); 00663 return -1; 00664 } 00665 ckd_free(logfn); 00666 acmod_set_mfcfh(ps->acmod, mfcfh); 00667 } 00668 if (ps->rawlogdir) { 00669 char *logfn = string_join(ps->rawlogdir, "/", 00670 ps->uttid, ".raw", NULL); 00671 FILE *rawfh; 00672 E_INFO("Writing raw audio log file: %s\n", logfn); 00673 if ((rawfh = fopen(logfn, "wb")) == NULL) { 00674 E_ERROR_SYSTEM("Failed to open raw audio log file %s", logfn); 00675 ckd_free(logfn); 00676 return -1; 00677 } 00678 ckd_free(logfn); 00679 acmod_set_rawfh(ps->acmod, rawfh); 00680 } 00681 if (ps->senlogdir) { 00682 char *logfn = string_join(ps->senlogdir, "/", 00683 ps->uttid, ".sen", NULL); 00684 FILE *senfh; 00685 E_INFO("Writing senone score log file: %s\n", logfn); 00686 if ((senfh = fopen(logfn, "wb")) == NULL) { 00687 E_ERROR_SYSTEM("Failed to open senone score log file %s", logfn); 00688 ckd_free(logfn); 00689 return -1; 00690 } 00691 ckd_free(logfn); 00692 acmod_set_senfh(ps->acmod, senfh); 00693 } 00694 00695 /* Start auxiliary phone loop search. 
*/ 00696 if (ps->phone_loop) 00697 ps_search_start(ps->phone_loop); 00698 00699 return ps_search_start(ps->search); 00700 } 00701 00702 static int 00703 ps_search_forward(ps_decoder_t *ps) 00704 { 00705 int nfr; 00706 00707 nfr = 0; 00708 while (ps->acmod->n_feat_frame > 0) { 00709 int k; 00710 if (ps->phone_loop) 00711 if ((k = ps_search_step(ps->phone_loop, ps->acmod->output_frame)) < 0) 00712 return k; 00713 if (ps->acmod->output_frame >= ps->pl_window) 00714 if ((k = ps_search_step(ps->search, 00715 ps->acmod->output_frame - ps->pl_window)) < 0) 00716 return k; 00717 acmod_advance(ps->acmod); 00718 ++ps->n_frame; 00719 ++nfr; 00720 } 00721 return nfr; 00722 } 00723 00724 int 00725 ps_decode_senscr(ps_decoder_t *ps, FILE *senfh, 00726 char const *uttid) 00727 { 00728 int nfr, n_searchfr; 00729 00730 ps_start_utt(ps, uttid); 00731 n_searchfr = 0; 00732 acmod_set_insenfh(ps->acmod, senfh); 00733 while ((nfr = acmod_read_scores(ps->acmod)) > 0) { 00734 if ((nfr = ps_search_forward(ps)) < 0) { 00735 ps_end_utt(ps); 00736 return nfr; 00737 } 00738 n_searchfr += nfr; 00739 } 00740 ps_end_utt(ps); 00741 acmod_set_insenfh(ps->acmod, NULL); 00742 00743 return n_searchfr; 00744 } 00745 00746 int 00747 ps_process_raw(ps_decoder_t *ps, 00748 int16 const *data, 00749 size_t n_samples, 00750 int no_search, 00751 int full_utt) 00752 { 00753 int n_searchfr = 0; 00754 00755 if (no_search) 00756 acmod_set_grow(ps->acmod, TRUE); 00757 00758 while (n_samples) { 00759 int nfr; 00760 00761 /* Process some data into features. 
*/ 00762 if ((nfr = acmod_process_raw(ps->acmod, &data, 00763 &n_samples, full_utt)) < 0) 00764 return nfr; 00765 00766 /* Score and search as much data as possible */ 00767 if (no_search) 00768 continue; 00769 if ((nfr = ps_search_forward(ps)) < 0) 00770 return nfr; 00771 n_searchfr += nfr; 00772 } 00773 00774 return n_searchfr; 00775 } 00776 00777 int 00778 ps_process_cep(ps_decoder_t *ps, 00779 mfcc_t **data, 00780 int32 n_frames, 00781 int no_search, 00782 int full_utt) 00783 { 00784 int n_searchfr = 0; 00785 00786 if (no_search) 00787 acmod_set_grow(ps->acmod, TRUE); 00788 00789 while (n_frames) { 00790 int nfr; 00791 00792 /* Process some data into features. */ 00793 if ((nfr = acmod_process_cep(ps->acmod, &data, 00794 &n_frames, full_utt)) < 0) 00795 return nfr; 00796 00797 /* Score and search as much data as possible */ 00798 if (no_search) 00799 continue; 00800 if ((nfr = ps_search_forward(ps)) < 0) 00801 return nfr; 00802 n_searchfr += nfr; 00803 } 00804 00805 return n_searchfr; 00806 } 00807 00808 int 00809 ps_end_utt(ps_decoder_t *ps) 00810 { 00811 int rv, i; 00812 00813 acmod_end_utt(ps->acmod); 00814 00815 /* Search any remaining frames. */ 00816 if ((rv = ps_search_forward(ps)) < 0) { 00817 ptmr_stop(&ps->perf); 00818 return rv; 00819 } 00820 /* Finish phone loop search. */ 00821 if (ps->phone_loop) { 00822 if ((rv = ps_search_finish(ps->phone_loop)) < 0) { 00823 ptmr_stop(&ps->perf); 00824 return rv; 00825 } 00826 } 00827 /* Search any frames remaining in the lookahead window. */ 00828 for (i = ps->acmod->output_frame - ps->pl_window; 00829 i < ps->acmod->output_frame; ++i) 00830 ps_search_step(ps->search, i); 00831 /* Finish main search. */ 00832 if ((rv = ps_search_finish(ps->search)) < 0) { 00833 ptmr_stop(&ps->perf); 00834 return rv; 00835 } 00836 ptmr_stop(&ps->perf); 00837 00838 /* Log a backtrace if requested. 
*/ 00839 if (cmd_ln_boolean_r(ps->config, "-backtrace")) { 00840 char const *uttid, *hyp; 00841 ps_seg_t *seg; 00842 int32 score; 00843 00844 hyp = ps_get_hyp(ps, &score, &uttid); 00845 E_INFO("%s: %s (%d)\n", uttid, hyp, score); 00846 E_INFO_NOFN("%-20s %-5s %-5s %-5s %-10s %-10s %-3s\n", 00847 "word", "start", "end", "pprob", "ascr", "lscr", "lback"); 00848 for (seg = ps_seg_iter(ps, &score); seg; 00849 seg = ps_seg_next(seg)) { 00850 char const *word; 00851 int sf, ef; 00852 int32 post, lscr, ascr, lback; 00853 00854 word = ps_seg_word(seg); 00855 ps_seg_frames(seg, &sf, &ef); 00856 post = ps_seg_prob(seg, &ascr, &lscr, &lback); 00857 E_INFO_NOFN("%-20s %-5d %-5d %-1.3f %-10d %-10d %-3d\n", 00858 word, sf, ef, logmath_exp(ps_get_logmath(ps), post), ascr, lscr, lback); 00859 } 00860 } 00861 return rv; 00862 } 00863 00864 char const * 00865 ps_get_hyp(ps_decoder_t *ps, int32 *out_best_score, char const **out_uttid) 00866 { 00867 char const *hyp; 00868 00869 ptmr_start(&ps->perf); 00870 hyp = ps_search_hyp(ps->search, out_best_score); 00871 if (out_uttid) 00872 *out_uttid = ps->uttid; 00873 ptmr_stop(&ps->perf); 00874 return hyp; 00875 } 00876 00877 int32 00878 ps_get_prob(ps_decoder_t *ps, char const **out_uttid) 00879 { 00880 int32 prob; 00881 00882 ptmr_start(&ps->perf); 00883 prob = ps_search_prob(ps->search); 00884 if (out_uttid) 00885 *out_uttid = ps->uttid; 00886 ptmr_stop(&ps->perf); 00887 return prob; 00888 } 00889 00890 ps_seg_t * 00891 ps_seg_iter(ps_decoder_t *ps, int32 *out_best_score) 00892 { 00893 ps_seg_t *itor; 00894 00895 ptmr_start(&ps->perf); 00896 itor = ps_search_seg_iter(ps->search, out_best_score); 00897 ptmr_stop(&ps->perf); 00898 return itor; 00899 } 00900 00901 ps_seg_t * 00902 ps_seg_next(ps_seg_t *seg) 00903 { 00904 return ps_search_seg_next(seg); 00905 } 00906 00907 char const * 00908 ps_seg_word(ps_seg_t *seg) 00909 { 00910 return seg->word; 00911 } 00912 00913 void 00914 ps_seg_frames(ps_seg_t *seg, int *out_sf, int *out_ef) 00915 { 
00916 if (out_sf) *out_sf = seg->sf; 00917 if (out_ef) *out_ef = seg->ef; 00918 } 00919 00920 int32 00921 ps_seg_prob(ps_seg_t *seg, int32 *out_ascr, int32 *out_lscr, int32 *out_lback) 00922 { 00923 if (out_ascr) *out_ascr = seg->ascr; 00924 if (out_lscr) *out_lscr = seg->lscr; 00925 if (out_lback) *out_lback = seg->lback; 00926 return seg->prob; 00927 } 00928 00929 void 00930 ps_seg_free(ps_seg_t *seg) 00931 { 00932 ps_search_seg_free(seg); 00933 } 00934 00935 ps_lattice_t * 00936 ps_get_lattice(ps_decoder_t *ps) 00937 { 00938 return ps_search_lattice(ps->search); 00939 } 00940 00941 ps_nbest_t * 00942 ps_nbest(ps_decoder_t *ps, int sf, int ef, 00943 char const *ctx1, char const *ctx2) 00944 { 00945 ps_lattice_t *dag; 00946 ngram_model_t *lmset; 00947 ps_astar_t *nbest; 00948 float32 lwf; 00949 int32 w1, w2; 00950 00951 if (ps->search == NULL) 00952 return NULL; 00953 if ((dag = ps_get_lattice(ps)) == NULL) 00954 return NULL; 00955 00956 /* FIXME: This is all quite specific to N-Gram search. Either we 00957 * should make N-best a method for each search module or it needs 00958 * to be abstracted to work for N-Gram and FSG. */ 00959 if (0 != strcmp(ps_search_name(ps->search), "ngram")) { 00960 lmset = NULL; 00961 lwf = 1.0f; 00962 } 00963 else { 00964 lmset = ((ngram_search_t *)ps->search)->lmset; 00965 lwf = ((ngram_search_t *)ps->search)->bestpath_fwdtree_lw_ratio; 00966 } 00967 00968 w1 = ctx1 ? dict_wordid(ps_search_dict(ps->search), ctx1) : -1; 00969 w2 = ctx2 ? 
dict_wordid(ps_search_dict(ps->search), ctx2) : -1; 00970 nbest = ps_astar_start(dag, lmset, lwf, sf, ef, w1, w2); 00971 00972 return (ps_nbest_t *)nbest; 00973 } 00974 00975 void 00976 ps_nbest_free(ps_nbest_t *nbest) 00977 { 00978 ps_astar_finish(nbest); 00979 } 00980 00981 ps_nbest_t * 00982 ps_nbest_next(ps_nbest_t *nbest) 00983 { 00984 ps_latpath_t *next; 00985 00986 next = ps_astar_next(nbest); 00987 if (next == NULL) { 00988 ps_nbest_free(nbest); 00989 return NULL; 00990 } 00991 return nbest; 00992 } 00993 00994 char const * 00995 ps_nbest_hyp(ps_nbest_t *nbest, int32 *out_score) 00996 { 00997 if (nbest->top == NULL) 00998 return NULL; 00999 if (out_score) *out_score = nbest->top->score; 01000 return ps_astar_hyp(nbest, nbest->top); 01001 } 01002 01003 ps_seg_t * 01004 ps_nbest_seg(ps_nbest_t *nbest, int32 *out_score) 01005 { 01006 if (nbest->top == NULL) 01007 return NULL; 01008 if (out_score) *out_score = nbest->top->score; 01009 return ps_astar_seg_iter(nbest, nbest->top, 1.0); 01010 } 01011 01012 int 01013 ps_get_n_frames(ps_decoder_t *ps) 01014 { 01015 return ps->acmod->output_frame + 1; 01016 } 01017 01018 void 01019 ps_get_utt_time(ps_decoder_t *ps, double *out_nspeech, 01020 double *out_ncpu, double *out_nwall) 01021 { 01022 int32 frate; 01023 01024 frate = cmd_ln_int32_r(ps->config, "-frate"); 01025 *out_nspeech = (double)ps->acmod->output_frame / frate; 01026 *out_ncpu = ps->perf.t_cpu; 01027 *out_nwall = ps->perf.t_elapsed; 01028 } 01029 01030 void 01031 ps_get_all_time(ps_decoder_t *ps, double *out_nspeech, 01032 double *out_ncpu, double *out_nwall) 01033 { 01034 int32 frate; 01035 01036 frate = cmd_ln_int32_r(ps->config, "-frate"); 01037 *out_nspeech = (double)ps->n_frame / frate; 01038 *out_ncpu = ps->perf.t_tot_cpu; 01039 *out_nwall = ps->perf.t_tot_elapsed; 01040 } 01041 01042 void 01043 ps_search_init(ps_search_t *search, ps_searchfuncs_t *vt, 01044 cmd_ln_t *config, acmod_t *acmod, dict_t *dict, 01045 dict2pid_t *d2p) 01046 { 01047 
search->vt = vt; 01048 search->config = config; 01049 search->acmod = acmod; 01050 if (d2p) 01051 search->d2p = dict2pid_retain(d2p); 01052 else 01053 search->d2p = NULL; 01054 if (dict) { 01055 search->dict = dict_retain(dict); 01056 search->start_wid = dict_startwid(dict); 01057 search->finish_wid = dict_finishwid(dict); 01058 search->silence_wid = dict_silwid(dict); 01059 search->n_words = dict_size(dict); 01060 } 01061 else { 01062 search->dict = NULL; 01063 search->start_wid = search->finish_wid = search->silence_wid = -1; 01064 search->n_words = 0; 01065 } 01066 } 01067 01068 void 01069 ps_search_base_reinit(ps_search_t *search, dict_t *dict, 01070 dict2pid_t *d2p) 01071 { 01072 dict_free(search->dict); 01073 dict2pid_free(search->d2p); 01074 /* FIXME: _retain() should just return NULL if passed NULL. */ 01075 if (dict) { 01076 search->dict = dict_retain(dict); 01077 search->start_wid = dict_startwid(dict); 01078 search->finish_wid = dict_finishwid(dict); 01079 search->silence_wid = dict_silwid(dict); 01080 search->n_words = dict_size(dict); 01081 } 01082 else { 01083 search->dict = NULL; 01084 search->start_wid = search->finish_wid = search->silence_wid = -1; 01085 search->n_words = 0; 01086 } 01087 if (d2p) 01088 search->d2p = dict2pid_retain(d2p); 01089 else 01090 search->d2p = NULL; 01091 } 01092 01093 01094 void 01095 ps_search_deinit(ps_search_t *search) 01096 { 01097 /* FIXME: We will have refcounting on acmod, config, etc, at which 01098 * point we will free them here too. */ 01099 dict_free(search->dict); 01100 dict2pid_free(search->d2p); 01101 ckd_free(search->hyp_str); 01102 ps_lattice_free(search->dag); 01103 }