PocketSphinx
0.6
|
00001 /* -*- c-basic-offset: 4; indent-tabs-mode: nil -*- */ 00002 /* ==================================================================== 00003 * Copyright (c) 2008 Carnegie Mellon University. All rights 00004 * reserved. 00005 * 00006 * Redistribution and use in source and binary forms, with or without 00007 * modification, are permitted provided that the following conditions 00008 * are met: 00009 * 00010 * 1. Redistributions of source code must retain the above copyright 00011 * notice, this list of conditions and the following disclaimer. 00012 * 00013 * 2. Redistributions in binary form must reproduce the above copyright 00014 * notice, this list of conditions and the following disclaimer in 00015 * the documentation and/or other materials provided with the 00016 * distribution. 00017 * 00018 * This work was supported in part by funding from the Defense Advanced 00019 * Research Projects Agency and the National Science Foundation of the 00020 * United States of America, and the CMU Sphinx Speech Consortium. 00021 * 00022 * THIS SOFTWARE IS PROVIDED BY CARNEGIE MELLON UNIVERSITY ``AS IS'' AND 00023 * ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, 00024 * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 00025 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY 00026 * NOR ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 00027 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 00028 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 00029 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 00030 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 00031 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 00032 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 00033 * 00034 * ==================================================================== 00035 * 00036 */ 00037 00042 /* System headers. */ 00043 #include <string.h> 00044 #include <assert.h> 00045 00046 /* SphinxBase headers. */ 00047 #include <sphinxbase/ckd_alloc.h> 00048 #include <sphinxbase/listelem_alloc.h> 00049 #include <sphinxbase/err.h> 00050 00051 /* Local headers. */ 00052 #include "pocketsphinx_internal.h" 00053 #include "ps_lattice_internal.h" 00054 #include "ngram_search.h" 00055 #include "ngram_search_fwdtree.h" 00056 #include "ngram_search_fwdflat.h" 00057 00058 static int ngram_search_start(ps_search_t *search); 00059 static int ngram_search_step(ps_search_t *search, int frame_idx); 00060 static int ngram_search_finish(ps_search_t *search); 00061 static int ngram_search_reinit(ps_search_t *search, dict_t *dict, dict2pid_t *d2p); 00062 static char const *ngram_search_hyp(ps_search_t *search, int32 *out_score); 00063 static int32 ngram_search_prob(ps_search_t *search); 00064 static ps_seg_t *ngram_search_seg_iter(ps_search_t *search, int32 *out_score); 00065 00066 static ps_searchfuncs_t ngram_funcs = { 00067 /* name: */ "ngram", 00068 /* start: */ ngram_search_start, 00069 /* step: */ ngram_search_step, 00070 /* finish: */ ngram_search_finish, 00071 /* reinit: */ ngram_search_reinit, 00072 /* free: */ ngram_search_free, 00073 /* lattice: */ ngram_search_lattice, 00074 /* hyp: */ ngram_search_hyp, 00075 /* prob: */ ngram_search_prob, 00076 /* seg_iter: */ ngram_search_seg_iter, 00077 }; 00078 00079 static void 00080 ngram_search_update_widmap(ngram_search_t *ngs) 00081 { 00082 const char **words; 00083 int32 i, n_words; 00084 00085 /* It's okay to include fillers since they won't be in the LM */ 00086 n_words = ps_search_n_words(ngs); 00087 words = ckd_calloc(n_words, sizeof(*words)); 00088 /* This will include alternates, again, that's okay since they aren't in the LM */ 00089 for (i = 0; i < n_words; ++i) 00090 words[i] = (const char *)dict_wordstr(ps_search_dict(ngs), i); 00091 ngram_model_set_map_words(ngs->lmset, words, n_words); 00092 ckd_free(words); 00093 } 00094 00095 static void 00096 ngram_search_calc_beams(ngram_search_t *ngs) 00097 { 00098 cmd_ln_t *config; 00099 acmod_t *acmod; 00100 00101 config = ps_search_config(ngs); 00102 acmod = ps_search_acmod(ngs); 00103 00104 /* Log beam widths. */ 00105 ngs->beam = logmath_log(acmod->lmath, cmd_ln_float64_r(config, "-beam"))>>SENSCR_SHIFT; 00106 ngs->wbeam = logmath_log(acmod->lmath, cmd_ln_float64_r(config, "-wbeam"))>>SENSCR_SHIFT; 00107 ngs->pbeam = logmath_log(acmod->lmath, cmd_ln_float64_r(config, "-pbeam"))>>SENSCR_SHIFT; 00108 ngs->lpbeam = logmath_log(acmod->lmath, cmd_ln_float64_r(config, "-lpbeam"))>>SENSCR_SHIFT; 00109 ngs->lponlybeam = logmath_log(acmod->lmath, cmd_ln_float64_r(config, "-lponlybeam"))>>SENSCR_SHIFT; 00110 ngs->fwdflatbeam = logmath_log(acmod->lmath, cmd_ln_float64_r(config, "-fwdflatbeam"))>>SENSCR_SHIFT; 00111 ngs->fwdflatwbeam = logmath_log(acmod->lmath, cmd_ln_float64_r(config, "-fwdflatwbeam"))>>SENSCR_SHIFT; 00112 00113 /* Absolute pruning parameters. */ 00114 ngs->maxwpf = cmd_ln_int32_r(config, "-maxwpf"); 00115 ngs->maxhmmpf = cmd_ln_int32_r(config, "-maxhmmpf"); 00116 00117 /* Various penalties which may or may not be useful. */ 00118 ngs->wip = logmath_log(acmod->lmath, cmd_ln_float32_r(config, "-wip")) >>SENSCR_SHIFT; 00119 ngs->nwpen = logmath_log(acmod->lmath, cmd_ln_float32_r(config, "-nwpen")) >>SENSCR_SHIFT; 00120 ngs->pip = logmath_log(acmod->lmath, cmd_ln_float32_r(config, "-pip")) >>SENSCR_SHIFT; 00121 ngs->silpen = ngs->pip 00122 + (logmath_log(acmod->lmath, cmd_ln_float32_r(config, "-silprob"))>>SENSCR_SHIFT); 00123 ngs->fillpen = ngs->pip 00124 + (logmath_log(acmod->lmath, cmd_ln_float32_r(config, "-fillprob"))>>SENSCR_SHIFT); 00125 00126 /* Language weight ratios for fwdflat and bestpath search. */ 00127 ngs->fwdflat_fwdtree_lw_ratio = 00128 cmd_ln_float32_r(config, "-fwdflatlw") 00129 / cmd_ln_float32_r(config, "-lw"); 00130 ngs->bestpath_fwdtree_lw_ratio = 00131 cmd_ln_float32_r(config, "-bestpathlw") 00132 / cmd_ln_float32_r(config, "-lw"); 00133 00134 /* Acoustic score scale for posterior probabilities. */ 00135 ngs->ascale = 1.0 / cmd_ln_float32_r(config, "-ascale"); 00136 } 00137 00138 ps_search_t * 00139 ngram_search_init(cmd_ln_t *config, 00140 acmod_t *acmod, 00141 dict_t *dict, 00142 dict2pid_t *d2p) 00143 { 00144 ngram_search_t *ngs; 00145 const char *path; 00146 00147 ngs = ckd_calloc(1, sizeof(*ngs)); 00148 ps_search_init(&ngs->base, &ngram_funcs, config, acmod, dict, d2p); 00149 ngs->hmmctx = hmm_context_init(bin_mdef_n_emit_state(acmod->mdef), 00150 acmod->tmat->tp, NULL, acmod->mdef->sseq); 00151 if (ngs->hmmctx == NULL) { 00152 ps_search_free(ps_search_base(ngs)); 00153 return NULL; 00154 } 00155 ngs->chan_alloc = listelem_alloc_init(sizeof(chan_t)); 00156 ngs->root_chan_alloc = listelem_alloc_init(sizeof(root_chan_t)); 00157 ngs->latnode_alloc = listelem_alloc_init(sizeof(ps_latnode_t)); 00158 00159 /* Calculate various beam widths and such. */ 00160 ngram_search_calc_beams(ngs); 00161 00162 /* Allocate a billion different tables for stuff. */ 00163 ngs->word_chan = ckd_calloc(dict_size(dict), 00164 sizeof(*ngs->word_chan)); 00165 ngs->word_lat_idx = ckd_calloc(dict_size(dict), 00166 sizeof(*ngs->word_lat_idx)); 00167 ngs->word_active = bitvec_alloc(dict_size(dict)); 00168 ngs->last_ltrans = ckd_calloc(dict_size(dict), 00169 sizeof(*ngs->last_ltrans)); 00170 00171 /* FIXME: All these structures need to be made dynamic with 00172 * garbage collection. */ 00173 ngs->bp_table_size = cmd_ln_int32_r(config, "-latsize"); 00174 ngs->bp_table = ckd_calloc(ngs->bp_table_size, 00175 sizeof(*ngs->bp_table)); 00176 /* FIXME: This thing is frickin' huge. */ 00177 ngs->bscore_stack_size = ngs->bp_table_size * 20; 00178 ngs->bscore_stack = ckd_calloc(ngs->bscore_stack_size, 00179 sizeof(*ngs->bscore_stack)); 00180 ngs->n_frame_alloc = 256; 00181 ngs->bp_table_idx = ckd_calloc(ngs->n_frame_alloc + 1, 00182 sizeof(*ngs->bp_table_idx)); 00183 ++ngs->bp_table_idx; /* Make bptableidx[-1] valid */ 00184 00185 /* Allocate active word list array */ 00186 ngs->active_word_list = ckd_calloc_2d(2, dict_size(dict), 00187 sizeof(**ngs->active_word_list)); 00188 00189 /* Load language model(s) */ 00190 if ((path = cmd_ln_str_r(config, "-lmctl"))) { 00191 ngs->lmset = ngram_model_set_read(config, path, acmod->lmath); 00192 if (ngs->lmset == NULL) { 00193 E_ERROR("Failed to read language model control file: %s\n", 00194 path); 00195 goto error_out; 00196 } 00197 /* Set the default language model if needed. */ 00198 if ((path = cmd_ln_str_r(config, "-lmname"))) { 00199 ngram_model_set_select(ngs->lmset, path); 00200 } 00201 } 00202 else if ((path = cmd_ln_str_r(config, "-lm"))) { 00203 static const char *name = "default"; 00204 ngram_model_t *lm; 00205 00206 lm = ngram_model_read(config, path, NGRAM_AUTO, acmod->lmath); 00207 if (lm == NULL) { 00208 E_ERROR("Failed to read language model file: %s\n", path); 00209 goto error_out; 00210 } 00211 ngs->lmset = ngram_model_set_init(config, 00212 &lm, (char **)&name, 00213 NULL, 1); 00214 if (ngs->lmset == NULL) { 00215 E_ERROR("Failed to initialize language model set\n"); 00216 goto error_out; 00217 } 00218 } 00219 if (ngs->lmset != NULL 00220 && ngram_wid(ngs->lmset, S3_FINISH_WORD) == ngram_unknown_wid(ngs->lmset)) { 00221 E_ERROR("Language model/set does not contain </s>, recognition will fail\n"); 00222 goto error_out; 00223 } 00224 00225 /* Create word mappings. */ 00226 ngram_search_update_widmap(ngs); 00227 00228 /* Initialize fwdtree, fwdflat, bestpath modules if necessary. */ 00229 if (cmd_ln_boolean_r(config, "-fwdtree")) { 00230 ngram_fwdtree_init(ngs); 00231 ngs->fwdtree = TRUE; 00232 ngs->fwdtree_perf.name = "fwdtree"; 00233 ptmr_init(&ngs->fwdtree_perf); 00234 } 00235 if (cmd_ln_boolean_r(config, "-fwdflat")) { 00236 ngram_fwdflat_init(ngs); 00237 ngs->fwdflat = TRUE; 00238 ngs->fwdflat_perf.name = "fwdflat"; 00239 ptmr_init(&ngs->fwdflat_perf); 00240 } 00241 if (cmd_ln_boolean_r(config, "-bestpath")) { 00242 ngs->bestpath = TRUE; 00243 ngs->bestpath_perf.name = "bestpath"; 00244 ptmr_init(&ngs->bestpath_perf); 00245 } 00246 00247 return (ps_search_t *)ngs; 00248 00249 error_out: 00250 ngram_search_free((ps_search_t *)ngs); 00251 return NULL; 00252 } 00253 00254 static int 00255 ngram_search_reinit(ps_search_t *search, dict_t *dict, dict2pid_t *d2p) 00256 { 00257 ngram_search_t *ngs = (ngram_search_t *)search; 00258 int old_n_words; 00259 int rv = 0; 00260 00261 /* Update the number of words. */ 00262 old_n_words = search->n_words; 00263 if (old_n_words != dict_size(dict)) { 00264 search->n_words = dict_size(dict); 00265 /* Reallocate these temporary arrays. */ 00266 ckd_free(ngs->word_lat_idx); 00267 ckd_free(ngs->word_active); 00268 ckd_free(ngs->last_ltrans); 00269 ckd_free_2d(ngs->active_word_list); 00270 ngs->word_lat_idx = ckd_calloc(search->n_words, sizeof(*ngs->word_lat_idx)); 00271 ngs->word_active = bitvec_alloc(search->n_words); 00272 ngs->last_ltrans = ckd_calloc(search->n_words, sizeof(*ngs->last_ltrans)); 00273 ngs->active_word_list 00274 = ckd_calloc_2d(2, search->n_words, 00275 sizeof(**ngs->active_word_list)); 00276 } 00277 00278 /* Free old dict2pid, dict */ 00279 ps_search_base_reinit(search, dict, d2p); 00280 00281 if (ngs->lmset == NULL) 00282 return; 00283 00284 /* Update beam widths. */ 00285 ngram_search_calc_beams(ngs); 00286 00287 /* Update word mappings. */ 00288 ngram_search_update_widmap(ngs); 00289 00290 /* Now rebuild lextrees. */ 00291 if (ngs->fwdtree) { 00292 if ((rv = ngram_fwdtree_reinit(ngs)) < 0) 00293 return rv; 00294 } 00295 if (ngs->fwdflat) { 00296 if ((rv = ngram_fwdflat_reinit(ngs)) < 0) 00297 return rv; 00298 } 00299 00300 return rv; 00301 } 00302 00303 void 00304 ngram_search_free(ps_search_t *search) 00305 { 00306 ngram_search_t *ngs = (ngram_search_t *)search; 00307 00308 ps_search_deinit(search); 00309 if (ngs->fwdtree) 00310 ngram_fwdtree_deinit(ngs); 00311 if (ngs->fwdflat) 00312 ngram_fwdflat_deinit(ngs); 00313 if (ngs->bestpath) { 00314 double n_speech = (double)ngs->n_tot_frame 00315 / cmd_ln_int32_r(ps_search_config(ngs), "-frate"); 00316 00317 E_INFO("TOTAL bestpath %.2f CPU %.3f xRT\n", 00318 ngs->bestpath_perf.t_tot_cpu, 00319 ngs->bestpath_perf.t_tot_cpu / n_speech); 00320 E_INFO("TOTAL bestpath %.2f wall %.3f xRT\n", 00321 ngs->bestpath_perf.t_tot_elapsed, 00322 ngs->bestpath_perf.t_tot_elapsed / n_speech); 00323 } 00324 00325 hmm_context_free(ngs->hmmctx); 00326 listelem_alloc_free(ngs->chan_alloc); 00327 listelem_alloc_free(ngs->root_chan_alloc); 00328 listelem_alloc_free(ngs->latnode_alloc); 00329 ngram_model_free(ngs->lmset); 00330 00331 ckd_free(ngs->word_chan); 00332 ckd_free(ngs->word_lat_idx); 00333 bitvec_free(ngs->word_active); 00334 ckd_free(ngs->bp_table); 00335 ckd_free(ngs->bscore_stack); 00336 if (ngs->bp_table_idx != NULL) 00337 ckd_free(ngs->bp_table_idx - 1); 00338 ckd_free_2d(ngs->active_word_list); 00339 ckd_free(ngs->last_ltrans); 00340 ckd_free(ngs); 00341 } 00342 00343 int 00344 ngram_search_mark_bptable(ngram_search_t *ngs, int frame_idx) 00345 { 00346 if (frame_idx >= ngs->n_frame_alloc) { 00347 ngs->n_frame_alloc *= 2; 00348 ngs->bp_table_idx = ckd_realloc(ngs->bp_table_idx - 1, 00349 (ngs->n_frame_alloc + 1) 00350 * sizeof(*ngs->bp_table_idx)); 00351 if (ngs->frm_wordlist) { 00352 ngs->frm_wordlist = ckd_realloc(ngs->frm_wordlist, 00353 ngs->n_frame_alloc 00354 * sizeof(*ngs->frm_wordlist)); 00355 } 00356 ++ngs->bp_table_idx; /* Make bptableidx[-1] valid */ 00357 } 00358 ngs->bp_table_idx[frame_idx] = ngs->bpidx; 00359 return ngs->bpidx; 00360 } 00361 00362 static void 00363 set_real_wid(ngram_search_t *ngs, int32 bp) 00364 { 00365 bptbl_t *ent, *prev; 00366 00367 assert(bp != NO_BP); 00368 ent = ngs->bp_table + bp; 00369 if (ent->bp == NO_BP) 00370 prev = NULL; 00371 else 00372 prev = ngs->bp_table + ent->bp; 00373 00374 /* Propagate lm state for fillers, rotate it for words. */ 00375 if (dict_filler_word(ps_search_dict(ngs), ent->wid)) { 00376 if (prev != NULL) { 00377 ent->real_wid = prev->real_wid; 00378 ent->prev_real_wid = prev->prev_real_wid; 00379 } 00380 else { 00381 ent->real_wid = dict_basewid(ps_search_dict(ngs), 00382 ent->wid); 00383 ent->prev_real_wid = BAD_S3WID; 00384 } 00385 } 00386 else { 00387 ent->real_wid = dict_basewid(ps_search_dict(ngs), ent->wid); 00388 if (prev != NULL) 00389 ent->prev_real_wid = prev->real_wid; 00390 else 00391 ent->prev_real_wid = BAD_S3WID; 00392 } 00393 } 00394 00395 void 00396 ngram_search_save_bp(ngram_search_t *ngs, int frame_idx, 00397 int32 w, int32 score, int32 path, int32 rc) 00398 { 00399 int32 bp; 00400 00401 /* Look for an existing exit for this word in this frame. The 00402 * only reason one would exist is from a different right context 00403 * triphone, but of course that happens quite frequently. */ 00404 bp = ngs->word_lat_idx[w]; 00405 if (bp != NO_BP) { 00406 /* Keep only the best scoring one, we will reconstruct the 00407 * others from the right context scores - usually the history 00408 * is not lost. */ 00409 if (ngs->bp_table[bp].score WORSE_THAN score) { 00410 assert(path != bp); /* Pathological. */ 00411 if (ngs->bp_table[bp].bp != path) { 00412 int32 bplh[2], newlh[2]; 00413 /* But, sometimes, the history *is* lost. If we wanted to 00414 * do exact language model scoring we'd have to preserve 00415 * these alternate histories. */ 00416 E_DEBUG(2,("Updating path history %d => %d frame %d\n", 00417 ngs->bp_table[bp].bp, path, frame_idx)); 00418 bplh[0] = ngs->bp_table[bp].bp == -1 00419 ? -1 : ngs->bp_table[ngs->bp_table[bp].bp].prev_real_wid; 00420 bplh[1] = ngs->bp_table[bp].bp == -1 00421 ? -1 : ngs->bp_table[ngs->bp_table[bp].bp].real_wid; 00422 newlh[0] = path == -1 00423 ? -1 : ngs->bp_table[path].prev_real_wid; 00424 newlh[1] = path == -1 00425 ? -1 : ngs->bp_table[path].real_wid; 00426 /* Actually it's worth checking how often the actual 00427 * language model state changes. */ 00428 if (bplh[0] != newlh[0] || bplh[1] != newlh[1]) { 00429 /* It's fairly rare that the actual language model 00430 * state changes, but it does happen some 00431 * times. */ 00432 E_DEBUG(1, ("Updating language model state %s,%s => %s,%s frame %d\n", 00433 dict_wordstr(ps_search_dict(ngs), bplh[0]), 00434 dict_wordstr(ps_search_dict(ngs), bplh[1]), 00435 dict_wordstr(ps_search_dict(ngs), newlh[0]), 00436 dict_wordstr(ps_search_dict(ngs), newlh[1]), 00437 frame_idx)); 00438 set_real_wid(ngs, bp); 00439 } 00440 ngs->bp_table[bp].bp = path; 00441 } 00442 ngs->bp_table[bp].score = score; 00443 } 00444 /* But do keep track of scores for all right contexts, since 00445 * we need them to determine the starting path scores for any 00446 * successors of this word exit. */ 00447 if (ngs->bp_table[bp].s_idx != -1) 00448 ngs->bscore_stack[ngs->bp_table[bp].s_idx + rc] = score; 00449 } 00450 else { 00451 int32 i, rcsize; 00452 bptbl_t *be; 00453 00454 /* This might happen if recognition fails. */ 00455 if (ngs->bpidx == NO_BP) { 00456 E_ERROR("No entries in backpointer table!"); 00457 return; 00458 } 00459 00460 /* Expand the backpointer tables if necessary. */ 00461 if (ngs->bpidx >= ngs->bp_table_size) { 00462 ngs->bp_table_size *= 2; 00463 ngs->bp_table = ckd_realloc(ngs->bp_table, 00464 ngs->bp_table_size 00465 * sizeof(*ngs->bp_table)); 00466 E_INFO("Resized backpointer table to %d entries\n", ngs->bp_table_size); 00467 } 00468 if (ngs->bss_head >= ngs->bscore_stack_size 00469 - bin_mdef_n_ciphone(ps_search_acmod(ngs)->mdef)) { 00470 ngs->bscore_stack_size *= 2; 00471 ngs->bscore_stack = ckd_realloc(ngs->bscore_stack, 00472 ngs->bscore_stack_size 00473 * sizeof(*ngs->bscore_stack)); 00474 E_INFO("Resized score stack to %d entries\n", ngs->bscore_stack_size); 00475 } 00476 00477 ngs->word_lat_idx[w] = ngs->bpidx; 00478 be = &(ngs->bp_table[ngs->bpidx]); 00479 be->wid = w; 00480 be->frame = frame_idx; 00481 be->bp = path; 00482 be->score = score; 00483 be->s_idx = ngs->bss_head; 00484 be->valid = TRUE; 00485 assert(path != ngs->bpidx); 00486 00487 /* DICT2PID */ 00488 /* Get diphone ID for final phone and number of ssids corresponding to it. */ 00489 be->last_phone = dict_last_phone(ps_search_dict(ngs),w); 00490 if (dict_is_single_phone(ps_search_dict(ngs), w)) { 00491 be->last2_phone = -1; 00492 be->s_idx = -1; 00493 rcsize = 0; 00494 } 00495 else { 00496 be->last2_phone = dict_second_last_phone(ps_search_dict(ngs),w); 00497 rcsize = dict2pid_rssid(ps_search_dict2pid(ngs), 00498 be->last_phone, be->last2_phone)->n_ssid; 00499 } 00500 /* Allocate some space on the bscore_stack for all of these triphones. */ 00501 for (i = 0; i < rcsize; ++i) 00502 ngs->bscore_stack[ngs->bss_head + i] = WORST_SCORE; 00503 if (rcsize) 00504 ngs->bscore_stack[ngs->bss_head + rc] = score; 00505 set_real_wid(ngs, ngs->bpidx); 00506 00507 ngs->bpidx++; 00508 ngs->bss_head += rcsize; 00509 } 00510 } 00511 00512 int 00513 ngram_search_find_exit(ngram_search_t *ngs, int frame_idx, int32 *out_best_score) 00514 { 00515 /* End of backpointers for this frame. */ 00516 int end_bpidx; 00517 int best_exit, bp; 00518 int32 best_score; 00519 00520 /* No hypothesis means no exit node! */ 00521 if (ngs->n_frame == 0) 00522 return NO_BP; 00523 00524 if (frame_idx == -1 || frame_idx >= ngs->n_frame) 00525 frame_idx = ngs->n_frame - 1; 00526 end_bpidx = ngs->bp_table_idx[frame_idx]; 00527 00528 best_score = WORST_SCORE; 00529 best_exit = NO_BP; 00530 00531 /* Scan back to find a frame with some backpointers in it. */ 00532 while (frame_idx >= 0 && ngs->bp_table_idx[frame_idx] == end_bpidx) 00533 --frame_idx; 00534 /* This is NOT an error, it just means there is no hypothesis yet. */ 00535 if (frame_idx < 0) 00536 return NO_BP; 00537 00538 /* Now find the entry for </s> OR the best scoring entry. */ 00539 assert(end_bpidx < ngs->bp_table_size); 00540 for (bp = ngs->bp_table_idx[frame_idx]; bp < end_bpidx; ++bp) { 00541 if (ngs->bp_table[bp].wid == ps_search_finish_wid(ngs) 00542 || ngs->bp_table[bp].score BETTER_THAN best_score) { 00543 best_score = ngs->bp_table[bp].score; 00544 best_exit = bp; 00545 } 00546 if (ngs->bp_table[bp].wid == ps_search_finish_wid(ngs)) 00547 break; 00548 } 00549 00550 if (out_best_score) *out_best_score = best_score; 00551 return best_exit; 00552 } 00553 00554 char const * 00555 ngram_search_bp_hyp(ngram_search_t *ngs, int bpidx) 00556 { 00557 ps_search_t *base = ps_search_base(ngs); 00558 char *c; 00559 size_t len; 00560 int bp; 00561 00562 if (bpidx == NO_BP) 00563 return NULL; 00564 00565 bp = bpidx; 00566 len = 0; 00567 while (bp != NO_BP) { 00568 bptbl_t *be = &ngs->bp_table[bp]; 00569 bp = be->bp; 00570 if (dict_real_word(ps_search_dict(ngs), be->wid)) 00571 len += strlen(dict_basestr(ps_search_dict(ngs), be->wid)) + 1; 00572 } 00573 00574 ckd_free(base->hyp_str); 00575 if (len == 0) { 00576 base->hyp_str = NULL; 00577 return base->hyp_str; 00578 } 00579 base->hyp_str = ckd_calloc(1, len); 00580 00581 bp = bpidx; 00582 c = base->hyp_str + len - 1; 00583 while (bp != NO_BP) { 00584 bptbl_t *be = &ngs->bp_table[bp]; 00585 size_t len; 00586 00587 bp = be->bp; 00588 if (dict_real_word(ps_search_dict(ngs), be->wid)) { 00589 len = strlen(dict_basestr(ps_search_dict(ngs), be->wid)); 00590 c -= len; 00591 memcpy(c, dict_basestr(ps_search_dict(ngs), be->wid), len); 00592 if (c > base->hyp_str) { 00593 --c; 00594 *c = ' '; 00595 } 00596 } 00597 } 00598 00599 return base->hyp_str; 00600 } 00601 00602 void 00603 ngram_search_alloc_all_rc(ngram_search_t *ngs, int32 w) 00604 { 00605 chan_t *hmm, *thmm; 00606 xwdssid_t *rssid; 00607 int32 i, tmatid, ciphone; 00608 00609 /* DICT2PID */ 00610 /* Get pointer to array of triphones for final diphone. */ 00611 assert(!dict_is_single_phone(ps_search_dict(ngs), w)); 00612 ciphone = dict_last_phone(ps_search_dict(ngs),w); 00613 rssid = dict2pid_rssid(ps_search_dict2pid(ngs), 00614 ciphone, 00615 dict_second_last_phone(ps_search_dict(ngs),w)); 00616 tmatid = bin_mdef_pid2tmatid(ps_search_acmod(ngs)->mdef, ciphone); 00617 hmm = ngs->word_chan[w]; 00618 if ((hmm == NULL) || (hmm_nonmpx_ssid(&hmm->hmm) != rssid->ssid[0])) { 00619 hmm = listelem_malloc(ngs->chan_alloc); 00620 hmm->next = ngs->word_chan[w]; 00621 ngs->word_chan[w] = hmm; 00622 00623 hmm->info.rc_id = 0; 00624 hmm->ciphone = ciphone; 00625 hmm_init(ngs->hmmctx, &hmm->hmm, FALSE, rssid->ssid[0], tmatid); 00626 E_DEBUG(3,("allocated rc_id 0 ssid %d ciphone %d lc %d word %s\n", 00627 rssid->ssid[0], hmm->ciphone, 00628 dict_second_last_phone(ps_search_dict(ngs),w), 00629 dict_wordstr(ps_search_dict(ngs),w))); 00630 } 00631 for (i = 1; i < rssid->n_ssid; ++i) { 00632 if ((hmm->next == NULL) || (hmm_nonmpx_ssid(&hmm->next->hmm) != rssid->ssid[i])) { 00633 thmm = listelem_malloc(ngs->chan_alloc); 00634 thmm->next = hmm->next; 00635 hmm->next = thmm; 00636 hmm = thmm; 00637 00638 hmm->info.rc_id = i; 00639 hmm->ciphone = ciphone; 00640 hmm_init(ngs->hmmctx, &hmm->hmm, FALSE, rssid->ssid[i], tmatid); 00641 E_DEBUG(3,("allocated rc_id %d ssid %d ciphone %d lc %d word %s\n", 00642 i, rssid->ssid[i], hmm->ciphone, 00643 dict_second_last_phone(ps_search_dict(ngs),w), 00644 dict_wordstr(ps_search_dict(ngs),w))); 00645 } 00646 else 00647 hmm = hmm->next; 00648 } 00649 } 00650 00651 void 00652 ngram_search_free_all_rc(ngram_search_t *ngs, int32 w) 00653 { 00654 chan_t *hmm, *thmm; 00655 00656 for (hmm = ngs->word_chan[w]; hmm; hmm = thmm) { 00657 thmm = hmm->next; 00658 hmm_deinit(&hmm->hmm); 00659 listelem_free(ngs->chan_alloc, hmm); 00660 } 00661 ngs->word_chan[w] = NULL; 00662 } 00663 00664 int32 00665 ngram_search_exit_score(ngram_search_t *ngs, bptbl_t *pbe, int rcphone) 00666 { 00667 /* DICT2PID */ 00668 /* Get the mapping from right context phone ID to index in the 00669 * right context table and the bscore_stack. */ 00670 if (pbe->last2_phone == -1) { 00671 /* No right context for single phone predecessor words. */ 00672 return pbe->score; 00673 } 00674 else { 00675 xwdssid_t *rssid; 00676 /* Find the index for the last diphone of the previous word + 00677 * the first phone of the current word. */ 00678 rssid = dict2pid_rssid(ps_search_dict2pid(ngs), 00679 pbe->last_phone, pbe->last2_phone); 00680 /* This may be WORST_SCORE, which means that there was no exit 00681 * with rcphone as right context. */ 00682 return ngs->bscore_stack[pbe->s_idx + rssid->cimap[rcphone]]; 00683 } 00684 } 00685 00686 /* 00687 * Compute acoustic and LM scores for a BPTable entry (segment). 00688 */ 00689 void 00690 ngram_compute_seg_score(ngram_search_t *ngs, bptbl_t *be, float32 lwf, 00691 int32 *out_ascr, int32 *out_lscr) 00692 { 00693 bptbl_t *pbe; 00694 int32 start_score; 00695 00696 /* Start of utterance. */ 00697 if (be->bp == NO_BP) { 00698 *out_ascr = be->score; 00699 *out_lscr = 0; 00700 return; 00701 } 00702 00703 /* Otherwise, calculate lscr and ascr. */ 00704 pbe = ngs->bp_table + be->bp; 00705 start_score = ngram_search_exit_score(ngs, pbe, 00706 dict_first_phone(ps_search_dict(ngs),be->wid)); 00707 assert(start_score BETTER_THAN WORST_SCORE); 00708 00709 /* FIXME: These result in positive acoustic scores when filler 00710 words have non-filler pronunciations. That whole business 00711 is still pretty much broken but at least it doesn't 00712 segfault. */ 00713 if (be->wid == ps_search_silence_wid(ngs)) { 00714 *out_lscr = ngs->silpen; 00715 } 00716 else if (dict_filler_word(ps_search_dict(ngs), be->wid)) { 00717 *out_lscr = ngs->fillpen; 00718 } 00719 else { 00720 int32 n_used; 00721 *out_lscr = ngram_tg_score(ngs->lmset, 00722 be->real_wid, 00723 pbe->real_wid, 00724 pbe->prev_real_wid, 00725 &n_used)>>SENSCR_SHIFT; 00726 *out_lscr = *out_lscr * lwf; 00727 } 00728 *out_ascr = be->score - start_score - *out_lscr; 00729 } 00730 00731 static int 00732 ngram_search_start(ps_search_t *search) 00733 { 00734 ngram_search_t *ngs = (ngram_search_t *)search; 00735 00736 ngs->done = FALSE; 00737 ngram_model_flush(ngs->lmset); 00738 if (ngs->fwdtree) 00739 ngram_fwdtree_start(ngs); 00740 else if (ngs->fwdflat) 00741 ngram_fwdflat_start(ngs); 00742 else 00743 return -1; 00744 return 0; 00745 } 00746 00747 static int 00748 ngram_search_step(ps_search_t *search, int frame_idx) 00749 { 00750 ngram_search_t *ngs = (ngram_search_t *)search; 00751 00752 if (ngs->fwdtree) 00753 return ngram_fwdtree_search(ngs, frame_idx); 00754 else if (ngs->fwdflat) 00755 return ngram_fwdflat_search(ngs, frame_idx); 00756 else 00757 return -1; 00758 } 00759 00760 void 00761 dump_bptable(ngram_search_t *ngs) 00762 { 00763 int i; 00764 E_INFO("Backpointer table (%d entries):\n", ngs->bpidx); 00765 for (i = 0; i < ngs->bpidx; ++i) { 00766 bptbl_t *bpe = ngs->bp_table + i; 00767 int j, rcsize; 00768 00769 E_INFO_NOFN("%-5d %-10s start %-3d end %-3d score %-8d bp %-3d real_wid %-5d prev_real_wid %-5d", 00770 i, dict_wordstr(ps_search_dict(ngs), bpe->wid), 00771 (bpe->bp == -1 00772 ? 0 : ngs->bp_table[bpe->bp].frame + 1), 00773 bpe->frame, bpe->score, bpe->bp, 00774 bpe->real_wid, bpe->prev_real_wid); 00775 00776 if (bpe->last2_phone == -1) 00777 rcsize = 0; 00778 else 00779 rcsize = dict2pid_rssid(ps_search_dict2pid(ngs), 00780 bpe->last_phone, bpe->last2_phone)->n_ssid; 00781 if (rcsize) { 00782 E_INFOCONT("\tbss"); 00783 for (j = 0; j < rcsize; ++j) 00784 if (ngs->bscore_stack[bpe->s_idx + j] != WORST_SCORE) 00785 E_INFOCONT(" %d", bpe->score - ngs->bscore_stack[bpe->s_idx + j]); 00786 } 00787 E_INFOCONT("\n"); 00788 } 00789 } 00790 00791 static int 00792 ngram_search_finish(ps_search_t *search) 00793 { 00794 ngram_search_t *ngs = (ngram_search_t *)search; 00795 00796 ngs->n_tot_frame += ngs->n_frame; 00797 if (ngs->fwdtree) { 00798 ngram_fwdtree_finish(ngs); 00799 /* dump_bptable(ngs); */ 00800 00801 /* Now do fwdflat search in its entirety, if requested. */ 00802 if (ngs->fwdflat) { 00803 int i; 00804 /* Rewind the acoustic model. */ 00805 if (acmod_rewind(ps_search_acmod(ngs)) < 0) 00806 return -1; 00807 /* Now redo search. */ 00808 ngram_fwdflat_start(ngs); 00809 i = 0; 00810 while (ps_search_acmod(ngs)->n_feat_frame > 0) { 00811 int nfr; 00812 if ((nfr = ngram_fwdflat_search(ngs, i)) < 0) 00813 return nfr; 00814 acmod_advance(ps_search_acmod(ngs)); 00815 ++i; 00816 } 00817 ngram_fwdflat_finish(ngs); 00818 /* And now, we should have a result... */ 00819 /* dump_bptable(ngs); */ 00820 } 00821 } 00822 else if (ngs->fwdflat) { 00823 ngram_fwdflat_finish(ngs); 00824 } 00825 00826 /* Mark the current utterance as done. */ 00827 ngs->done = TRUE; 00828 return 0; 00829 } 00830 00831 static ps_latlink_t * 00832 ngram_search_bestpath(ps_search_t *search, int32 *out_score, int backward) 00833 { 00834 ngram_search_t *ngs = (ngram_search_t *)search; 00835 00836 if (search->last_link == NULL) { 00837 search->last_link = ps_lattice_bestpath(search->dag, ngs->lmset, 00838 ngs->bestpath_fwdtree_lw_ratio, 00839 ngs->ascale); 00840 if (search->last_link == NULL) 00841 return NULL; 00842 /* Also calculate betas so we can fill in the posterior 00843 * probability field in the segmentation. */ 00844 if (search->post == 0) 00845 search->post = ps_lattice_posterior(search->dag, ngs->lmset, 00846 ngs->ascale); 00847 } 00848 if (out_score) 00849 *out_score = search->last_link->path_scr + search->dag->final_node_ascr; 00850 return search->last_link; 00851 } 00852 00853 static char const * 00854 ngram_search_hyp(ps_search_t *search, int32 *out_score) 00855 { 00856 ngram_search_t *ngs = (ngram_search_t *)search; 00857 00858 /* Only do bestpath search if the utterance is complete. */ 00859 if (ngs->bestpath && ngs->done) { 00860 ps_lattice_t *dag; 00861 ps_latlink_t *link; 00862 char const *hyp; 00863 double n_speech; 00864 00865 ptmr_reset(&ngs->bestpath_perf); 00866 ptmr_start(&ngs->bestpath_perf); 00867 if ((dag = ngram_search_lattice(search)) == NULL) 00868 return NULL; 00869 if ((link = ngram_search_bestpath(search, out_score, FALSE)) == NULL) 00870 return NULL; 00871 hyp = ps_lattice_hyp(dag, link); 00872 ptmr_stop(&ngs->bestpath_perf); 00873 n_speech = (double)dag->n_frames 00874 / cmd_ln_int32_r(ps_search_config(ngs), "-frate"); 00875 E_INFO("bestpath %.2f CPU %.3f xRT\n", 00876 ngs->bestpath_perf.t_cpu, 00877 ngs->bestpath_perf.t_cpu / n_speech); 00878 E_INFO("bestpath %.2f wall %.3f xRT\n", 00879 ngs->bestpath_perf.t_elapsed, 00880 ngs->bestpath_perf.t_elapsed / n_speech); 00881 return hyp; 00882 } 00883 else { 00884 int32 bpidx; 00885 00886 /* fwdtree and fwdflat use same backpointer table. */ 00887 bpidx = ngram_search_find_exit(ngs, -1, out_score); 00888 if (bpidx != NO_BP) 00889 return ngram_search_bp_hyp(ngs, bpidx); 00890 } 00891 00892 return NULL; 00893 } 00894 00895 static void 00896 ngram_search_bp2itor(ps_seg_t *seg, int bp) 00897 { 00898 ngram_search_t *ngs = (ngram_search_t *)seg->search; 00899 bptbl_t *be, *pbe; 00900 00901 be = &ngs->bp_table[bp]; 00902 pbe = be->bp == -1 ? NULL : &ngs->bp_table[be->bp]; 00903 seg->word = dict_wordstr(ps_search_dict(ngs), be->wid); 00904 seg->ef = be->frame; 00905 seg->sf = pbe ? pbe->frame + 1 : 0; 00906 seg->prob = 0; /* Bogus value... */ 00907 /* Compute acoustic and LM scores for this segment. */ 00908 if (pbe == NULL) { 00909 seg->ascr = be->score; 00910 seg->lscr = 0; 00911 seg->lback = 0; 00912 } 00913 else { 00914 int32 start_score; 00915 00916 /* Find ending path score of previous word. */ 00917 start_score = ngram_search_exit_score(ngs, pbe, 00918 dict_first_phone(ps_search_dict(ngs), be->wid)); 00919 assert(start_score BETTER_THAN WORST_SCORE); 00920 if (be->wid == ps_search_silence_wid(ngs)) { 00921 seg->lscr = ngs->silpen; 00922 } 00923 else if (dict_filler_word(ps_search_dict(ngs), be->wid)) { 00924 seg->lscr = ngs->fillpen; 00925 } 00926 else { 00927 seg->lscr = ngram_tg_score(ngs->lmset, 00928 be->real_wid, 00929 pbe->real_wid, 00930 pbe->prev_real_wid, 00931 &seg->lback)>>SENSCR_SHIFT; 00932 seg->lscr = (int32)(seg->lscr * seg->lwf); 00933 } 00934 seg->ascr = be->score - start_score - seg->lscr; 00935 } 00936 } 00937 00938 static void 00939 ngram_bp_seg_free(ps_seg_t *seg) 00940 { 00941 bptbl_seg_t *itor = (bptbl_seg_t *)seg; 00942 00943 ckd_free(itor->bpidx); 00944 ckd_free(itor); 00945 } 00946 00947 static ps_seg_t * 00948 ngram_bp_seg_next(ps_seg_t *seg) 00949 { 00950 bptbl_seg_t *itor = (bptbl_seg_t *)seg; 00951 00952 if (++itor->cur == itor->n_bpidx) { 00953 ngram_bp_seg_free(seg); 00954 return NULL; 00955 } 00956 00957 ngram_search_bp2itor(seg, itor->bpidx[itor->cur]); 00958 return seg; 00959 } 00960 00961 static ps_segfuncs_t ngram_bp_segfuncs = { 00962 /* seg_next */ ngram_bp_seg_next, 00963 /* seg_free */ ngram_bp_seg_free 00964 }; 00965 00966 static ps_seg_t * 00967 ngram_search_bp_iter(ngram_search_t *ngs, int bpidx, float32 lwf) 00968 { 00969 bptbl_seg_t *itor; 00970 int bp, cur; 00971 00972 /* Calling this an "iterator" is a bit of a misnomer since we have 00973 * to get the entire backtrace in order to produce it. On the 00974 * other hand, all we actually need is the bptbl IDs, and we can 00975 * allocate a fixed-size array of them. */ 00976 itor = ckd_calloc(1, sizeof(*itor)); 00977 itor->base.vt = &ngram_bp_segfuncs; 00978 itor->base.search = ps_search_base(ngs); 00979 itor->base.lwf = lwf; 00980 itor->n_bpidx = 0; 00981 bp = bpidx; 00982 while (bp != NO_BP) { 00983 bptbl_t *be = &ngs->bp_table[bp]; 00984 bp = be->bp; 00985 ++itor->n_bpidx; 00986 } 00987 if (itor->n_bpidx == 0) { 00988 ckd_free(itor); 00989 return NULL; 00990 } 00991 itor->bpidx = ckd_calloc(itor->n_bpidx, sizeof(*itor->bpidx)); 00992 cur = itor->n_bpidx - 1; 00993 bp = bpidx; 00994 while (bp != NO_BP) { 00995 bptbl_t *be = &ngs->bp_table[bp]; 00996 itor->bpidx[cur] = bp; 00997 bp = be->bp; 00998 --cur; 00999 } 01000 01001 /* Fill in relevant fields for first element. */ 01002 ngram_search_bp2itor((ps_seg_t *)itor, itor->bpidx[0]); 01003 01004 return (ps_seg_t *)itor; 01005 } 01006 01007 static ps_seg_t * 01008 ngram_search_seg_iter(ps_search_t *search, int32 *out_score) 01009 { 01010 ngram_search_t *ngs = (ngram_search_t *)search; 01011 01012 /* Only do bestpath search if the utterance is done. */ 01013 if (ngs->bestpath && ngs->done) { 01014 ps_lattice_t *dag; 01015 ps_latlink_t *link; 01016 double n_speech; 01017 ps_seg_t *itor; 01018 01019 ptmr_reset(&ngs->bestpath_perf); 01020 ptmr_start(&ngs->bestpath_perf); 01021 if ((dag = ngram_search_lattice(search)) == NULL) 01022 return NULL; 01023 if ((link = ngram_search_bestpath(search, out_score, TRUE)) == NULL) 01024 return NULL; 01025 itor = ps_lattice_seg_iter(dag, link, 01026 ngs->bestpath_fwdtree_lw_ratio); 01027 ptmr_stop(&ngs->bestpath_perf); 01028 n_speech = (double)dag->n_frames 01029 / cmd_ln_int32_r(ps_search_config(ngs), "-frate"); 01030 E_INFO("bestpath %.2f CPU %.3f xRT\n", 01031 ngs->bestpath_perf.t_cpu, 01032 ngs->bestpath_perf.t_cpu / n_speech); 01033 E_INFO("bestpath %.2f wall %.3f xRT\n", 01034 ngs->bestpath_perf.t_elapsed, 01035 ngs->bestpath_perf.t_elapsed / n_speech); 01036 return itor; 01037 } 01038 else { 01039 int32 bpidx; 01040 01041 /* fwdtree and fwdflat use same backpointer table. */ 01042 bpidx = ngram_search_find_exit(ngs, -1, out_score); 01043 return ngram_search_bp_iter(ngs, bpidx, 01044 /* but different language weights... */ 01045 (ngs->done && ngs->fwdflat) 01046 ? ngs->fwdflat_fwdtree_lw_ratio : 1.0); 01047 } 01048 01049 return NULL; 01050 } 01051 01052 static int32 01053 ngram_search_prob(ps_search_t *search) 01054 { 01055 ngram_search_t *ngs = (ngram_search_t *)search; 01056 01057 /* Only do bestpath search if the utterance is done. */ 01058 if (ngs->bestpath && ngs->done) { 01059 ps_lattice_t *dag; 01060 ps_latlink_t *link; 01061 01062 if ((dag = ngram_search_lattice(search)) == NULL) 01063 return 0; 01064 if ((link = ngram_search_bestpath(search, NULL, TRUE)) == NULL) 01065 return 0; 01066 return search->post; 01067 } 01068 else { 01069 /* FIXME: Give some kind of good estimate here, eventually. */ 01070 return 0; 01071 } 01072 } 01073 01074 static void 01075 create_dag_nodes(ngram_search_t *ngs, ps_lattice_t *dag) 01076 { 01077 bptbl_t *bp_ptr; 01078 int32 i; 01079 01080 for (i = 0, bp_ptr = ngs->bp_table; i < ngs->bpidx; ++i, ++bp_ptr) { 01081 int32 sf, ef, wid; 01082 ps_latnode_t *node; 01083 01084 /* Skip invalid backpointers (these result from -maxwpf pruning) */ 01085 if (!bp_ptr->valid) 01086 continue; 01087 01088 sf = (bp_ptr->bp < 0) ? 0 : ngs->bp_table[bp_ptr->bp].frame + 1; 01089 ef = bp_ptr->frame; 01090 wid = bp_ptr->wid; 01091 01092 assert(ef < dag->n_frames); 01093 /* Skip non-final </s> entries. */ 01094 if ((wid == ps_search_finish_wid(ngs)) && (ef < dag->n_frames - 1)) 01095 continue; 01096 01097 /* Skip if word not in LM */ 01098 if ((!dict_filler_word(ps_search_dict(ngs), wid)) 01099 && (!ngram_model_set_known_wid(ngs->lmset, 01100 dict_basewid(ps_search_dict(ngs), wid)))) 01101 continue; 01102 01103 /* See if bptbl entry <wid,sf> already in lattice */ 01104 for (node = dag->nodes; node; node = node->next) { 01105 if ((node->wid == wid) && (node->sf == sf)) 01106 break; 01107 } 01108 01109 /* For the moment, store bptbl indices in node.{fef,lef} */ 01110 if (node) 01111 node->lef = i; 01112 else { 01113 /* New node; link to head of list */ 01114 node = listelem_malloc(dag->latnode_alloc); 01115 node->wid = wid; 01116 node->sf = sf; /* This is a frame index. */ 01117 node->fef = node->lef = i; /* These are backpointer indices (argh) */ 01118 node->reachable = FALSE; 01119 node->entries = NULL; 01120 node->exits = NULL; 01121 01122 /* NOTE: This creates the list of nodes in reverse 01123 * topological order, i.e. a node always precedes its 01124 * antecedents in this list. */ 01125 node->next = dag->nodes; 01126 dag->nodes = node; 01127 ++dag->n_nodes; 01128 } 01129 } 01130 } 01131 01132 static ps_latnode_t * 01133 find_start_node(ngram_search_t *ngs, ps_lattice_t *dag) 01134 { 01135 ps_latnode_t *node; 01136 01137 /* Find start node <s>.0 */ 01138 for (node = dag->nodes; node; node = node->next) { 01139 if ((node->wid == ps_search_start_wid(ngs)) && (node->sf == 0)) 01140 break; 01141 } 01142 if (!node) { 01143 /* This is probably impossible. */ 01144 E_ERROR("Couldn't find <s> in first frame\n"); 01145 return NULL; 01146 } 01147 return node; 01148 } 01149 01150 static ps_latnode_t * 01151 find_end_node(ngram_search_t *ngs, ps_lattice_t *dag, float32 lwf) 01152 { 01153 ps_latnode_t *node; 01154 int32 ef, bestbp, bp, bestscore; 01155 01156 /* Find final node </s>.last_frame; nothing can follow this node */ 01157 for (node = dag->nodes; node; node = node->next) { 01158 int32 lef = ngs->bp_table[node->lef].frame; 01159 if ((node->wid == ps_search_finish_wid(ngs)) 01160 && (lef == dag->n_frames - 1)) 01161 break; 01162 } 01163 if (node != NULL) 01164 return node; 01165 01166 /* It is quite likely that no </s> exited in the last frame. So, 01167 * find the node corresponding to the best exit. */ 01168 /* Find the last frame containing a word exit. */ 01169 for (ef = dag->n_frames - 1; 01170 ef >= 0 && ngs->bp_table_idx[ef] == ngs->bpidx; 01171 --ef); 01172 if (ef < 0) { 01173 E_ERROR("Empty backpointer table: can not build DAG.\n"); 01174 return NULL; 01175 } 01176 01177 /* Find best word exit in that frame. */ 01178 bestscore = WORST_SCORE; 01179 bestbp = NO_BP; 01180 for (bp = ngs->bp_table_idx[ef]; bp < ngs->bp_table_idx[ef + 1]; ++bp) { 01181 int32 n_used, l_scr, wid, prev_wid; 01182 wid = ngs->bp_table[bp].real_wid; 01183 prev_wid = ngs->bp_table[bp].prev_real_wid; 01184 /* Always prefer </s>, of which there will only be one per frame. */ 01185 if (wid == ps_search_finish_wid(ngs)) { 01186 bestbp = bp; 01187 break; 01188 } 01189 l_scr = ngram_tg_score(ngs->lmset, ps_search_finish_wid(ngs), 01190 wid, prev_wid, &n_used) >>SENSCR_SHIFT; 01191 l_scr = l_scr * lwf; 01192 if (ngs->bp_table[bp].score + l_scr BETTER_THAN bestscore) { 01193 bestscore = ngs->bp_table[bp].score + l_scr; 01194 bestbp = bp; 01195 } 01196 } 01197 if (bestbp == NO_BP) { 01198 E_ERROR("No word exits found in last frame (%d), assuming no recognition\n", ef); 01199 return NULL; 01200 } 01201 E_INFO("</s> not found in last frame, using %s.%d instead\n", 01202 dict_basestr(ps_search_dict(ngs), ngs->bp_table[bestbp].wid), ef); 01203 01204 /* Now find the node that corresponds to it. */ 01205 for (node = dag->nodes; node; node = node->next) { 01206 if (node->lef == bestbp) 01207 return node; 01208 } 01209 01210 /* FIXME: This seems to happen a lot! */ 01211 E_ERROR("Failed to find DAG node corresponding to %s\n", 01212 dict_basestr(ps_search_dict(ngs), ngs->bp_table[bestbp].wid)); 01213 return NULL; 01214 } 01215 01216 /* 01217 * Build lattice from bptable. 01218 */ 01219 ps_lattice_t * 01220 ngram_search_lattice(ps_search_t *search) 01221 { 01222 int32 i, score, ascr, lscr; 01223 ps_latnode_t *node, *from, *to; 01224 ngram_search_t *ngs; 01225 ps_lattice_t *dag; 01226 int min_endfr, nlink; 01227 float lwf; 01228 01229 ngs = (ngram_search_t *)search; 01230 min_endfr = cmd_ln_int32_r(ps_search_config(search), "-min_endfr"); 01231 01232 /* If the best score is WORST_SCORE or worse, there is no way to 01233 * make a lattice. */ 01234 if (ngs->best_score == WORST_SCORE || ngs->best_score WORSE_THAN WORST_SCORE) 01235 return NULL; 01236 01237 /* Check to see if a lattice has previously been created over the 01238 * same number of frames, and reuse it if so. */ 01239 if (search->dag && search->dag->n_frames == ngs->n_frame) 01240 return search->dag; 01241 01242 /* Nope, create a new one. */ 01243 ps_lattice_free(search->dag); 01244 search->dag = NULL; 01245 dag = ps_lattice_init_search(search, ngs->n_frame); 01246 /* Compute these such that they agree with the fwdtree language weight. */ 01247 lwf = ngs->fwdflat ? ngs->fwdflat_fwdtree_lw_ratio : 1.0; 01248 create_dag_nodes(ngs, dag); 01249 if ((dag->start = find_start_node(ngs, dag)) == NULL) 01250 goto error_out; 01251 if ((dag->end = find_end_node(ngs, dag, ngs->bestpath_fwdtree_lw_ratio)) == NULL) 01252 goto error_out; 01253 E_INFO("lattice start node %s.%d end node %s.%d\n", 01254 dict_wordstr(search->dict, dag->start->wid), dag->start->sf, 01255 dict_wordstr(search->dict, dag->end->wid), dag->end->sf); 01256 01257 ngram_compute_seg_score(ngs, ngs->bp_table + dag->end->lef, lwf, 01258 &dag->final_node_ascr, &lscr); 01259 01260 /* 01261 * At this point, dag->nodes is ordered such that nodes earlier in 01262 * the list can follow (in time) those later in the list, but not 01263 * vice versa (see above - also note that adjacency is purely 01264 * determined by time which is why we can make this claim). Now 01265 * create precedence links and simultanesously mark all nodes that 01266 * can reach dag->end. (All nodes are reached from dag->start 01267 * simply by definition - they were created that way). 01268 * 01269 * Note that this also means that any nodes before dag->end in the 01270 * list can be discarded, meaning that dag->end will always be 01271 * equal to dag->nodes (FIXME: except when loading from a file but 01272 * we can fix that...) 01273 */ 01274 i = 0; 01275 while (dag->nodes && dag->nodes != dag->end) { 01276 ps_latnode_t *next = dag->nodes->next; 01277 listelem_free(dag->latnode_alloc, dag->nodes); 01278 dag->nodes = next; 01279 ++i; 01280 } 01281 E_INFO("Eliminated %d nodes before end node\n", i); 01282 dag->end->reachable = TRUE; 01283 nlink = 0; 01284 for (to = dag->end; to; to = to->next) { 01285 int fef, lef; 01286 01287 /* Skip if not reachable; it will never be reachable from dag->end */ 01288 if (!to->reachable) 01289 continue; 01290 01291 /* Prune nodes with too few endpoints - heuristic 01292 borrowed from Sphinx3 */ 01293 fef = ngs->bp_table[to->fef].frame; 01294 lef = ngs->bp_table[to->lef].frame; 01295 if (to != dag->end && lef - fef < min_endfr) { 01296 to->reachable = FALSE; 01297 continue; 01298 } 01299 01300 /* Find predecessors of to : from->fef+1 <= to->sf <= from->lef+1 */ 01301 for (from = to->next; from; from = from->next) { 01302 bptbl_t *from_bpe; 01303 01304 fef = ngs->bp_table[from->fef].frame; 01305 lef = ngs->bp_table[from->lef].frame; 01306 01307 if ((to->sf <= fef) || (to->sf > lef + 1)) 01308 continue; 01309 if (lef - fef < min_endfr) { 01310 assert(!from->reachable); 01311 continue; 01312 } 01313 01314 /* Find bptable entry for "from" that exactly precedes "to" */ 01315 i = from->fef; 01316 from_bpe = ngs->bp_table + i; 01317 for (; i <= from->lef; i++, from_bpe++) { 01318 if (from_bpe->wid != from->wid) 01319 continue; 01320 if (from_bpe->frame >= to->sf - 1) 01321 break; 01322 } 01323 01324 if ((i > from->lef) || (from_bpe->frame != to->sf - 1)) 01325 continue; 01326 01327 /* Find acoustic score from.sf->to.sf-1 with right context = to */ 01328 /* This gives us from_bpe's best acoustic score. */ 01329 ngram_compute_seg_score(ngs, from_bpe, lwf, 01330 &ascr, &lscr); 01331 /* Now find the exact path score for from->to, including 01332 * the appropriate final triphone. In fact this might not 01333 * exist. */ 01334 score = ngram_search_exit_score(ngs, from_bpe, 01335 dict_first_phone(ps_search_dict(ngs), to->wid)); 01336 /* Does not exist. Can't create a link here. */ 01337 if (score == WORST_SCORE) 01338 continue; 01339 /* Adjust the arc score to match the correct triphone. */ 01340 else 01341 score = ascr + (score - from_bpe->score); 01342 if (score BETTER_THAN 0) { 01343 /* Scores must be negative, or Bad Things will happen. 01344 In general, they are, except in corner cases 01345 involving filler words. We don't want to throw any 01346 links away so we'll keep these, but with some 01347 arbitrarily improbable but recognizable score. */ 01348 ps_lattice_link(dag, from, to, -424242, from_bpe->frame); 01349 ++nlink; 01350 from->reachable = TRUE; 01351 } 01352 else if (score BETTER_THAN WORST_SCORE) { 01353 ps_lattice_link(dag, from, to, score, from_bpe->frame); 01354 ++nlink; 01355 from->reachable = TRUE; 01356 } 01357 } 01358 } 01359 01360 /* There must be at least one path between dag->start and dag->end */ 01361 if (!dag->start->reachable) { 01362 E_ERROR("End node of lattice isolated; unreachable\n"); 01363 goto error_out; 01364 } 01365 01366 for (node = dag->nodes; node; node = node->next) { 01367 /* Change node->{fef,lef} from bptbl indices to frames. */ 01368 node->fef = ngs->bp_table[node->fef].frame; 01369 node->lef = ngs->bp_table[node->lef].frame; 01370 /* Find base wid for nodes. */ 01371 node->basewid = dict_basewid(search->dict, node->wid); 01372 } 01373 01374 /* Link nodes with alternate pronunciations at the same timepoint. */ 01375 for (node = dag->nodes; node; node = node->next) { 01376 ps_latnode_t *alt; 01377 /* Scan forward to find the next alternate, then stop. */ 01378 for (alt = node->next; alt && alt->sf == node->sf; alt = alt->next) { 01379 if (alt->basewid == node->basewid) { 01380 alt->alt = node->alt; 01381 node->alt = alt; 01382 break; 01383 } 01384 } 01385 } 01386 E_INFO("Lattice has %d nodes, %d links\n", dag->n_nodes, nlink); 01387 01388 /* Minor hack: If the final node is a filler word and not </s>, 01389 * then set its base word ID to </s>, so that the language model 01390 * scores won't be screwed up. */ 01391 if (dict_filler_word(ps_search_dict(ngs), dag->end->wid)) 01392 dag->end->basewid = ps_search_finish_wid(ngs); 01393 01394 /* Free nodes unreachable from dag->end and their links */ 01395 ps_lattice_delete_unreachable(dag); 01396 01397 /* Build links around silence and filler words, since they do not 01398 * exist in the language model. */ 01399 ps_lattice_bypass_fillers(dag, ngs->silpen, ngs->fillpen); 01400 01401 search->dag = dag; 01402 return dag; 01403 01404 error_out: 01405 ps_lattice_free(dag); 01406 return NULL; 01407 }