PocketSphinx
0.6
|
00001 /* -*- c-basic-offset: 4; indent-tabs-mode: nil -*- */ 00002 /* ==================================================================== 00003 * Copyright (c) 2008 Carnegie Mellon University. All rights 00004 * reserved. 00005 * 00006 * Redistribution and use in source and binary forms, with or without 00007 * modification, are permitted provided that the following conditions 00008 * are met: 00009 * 00010 * 1. Redistributions of source code must retain the above copyright 00011 * notice, this list of conditions and the following disclaimer. 00012 * 00013 * 2. Redistributions in binary form must reproduce the above copyright 00014 * notice, this list of conditions and the following disclaimer in 00015 * the documentation and/or other materials provided with the 00016 * distribution. 00017 * 00018 * This work was supported in part by funding from the Defense Advanced 00019 * Research Projects Agency and the National Science Foundation of the 00020 * United States of America, and the CMU Sphinx Speech Consortium. 00021 * 00022 * THIS SOFTWARE IS PROVIDED BY CARNEGIE MELLON UNIVERSITY ``AS IS'' AND 00023 * ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, 00024 * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 00025 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY 00026 * NOR ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 00027 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 00028 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 00029 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 00030 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 00031 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 00032 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 00033 * 00034 * ==================================================================== 00035 * 00036 */ 00037 00042 /* System headers. */ 00043 #include <string.h> 00044 #include <assert.h> 00045 00046 /* SphinxBase headers. */ 00047 #include <sphinxbase/ckd_alloc.h> 00048 #include <sphinxbase/listelem_alloc.h> 00049 #include <sphinxbase/err.h> 00050 00051 /* Local headers. */ 00052 #include "ngram_search_fwdtree.h" 00053 #include "phone_loop_search.h" 00054 00055 /* Turn this on to dump channels for debugging */ 00056 #define __CHAN_DUMP__ 0 00057 #if __CHAN_DUMP__ 00058 #define chan_v_eval(chan) hmm_dump_vit_eval(&(chan)->hmm, stderr) 00059 #else 00060 #define chan_v_eval(chan) hmm_vit_eval(&(chan)->hmm) 00061 #endif 00062 00063 /* 00064 * Allocate that part of the search channel tree structure that is independent of the 00065 * LM in use. 00066 */ 00067 static void 00068 init_search_tree(ngram_search_t *ngs) 00069 { 00070 int32 w, ndiph, i, n_words, n_ci; 00071 dict_t *dict = ps_search_dict(ngs); 00072 bitvec_t *dimap; 00073 00074 n_words = ps_search_n_words(ngs); 00075 ngs->homophone_set = ckd_calloc(n_words, sizeof(*ngs->homophone_set)); 00076 00077 /* Find #single phone words, and #unique first diphones (#root channels) in dict. */ 00078 ndiph = 0; 00079 ngs->n_1ph_words = 0; 00080 n_ci = bin_mdef_n_ciphone(ps_search_acmod(ngs)->mdef); 00081 /* Allocate a bitvector with flags for each possible diphone. */ 00082 dimap = bitvec_alloc(n_ci * n_ci); 00083 for (w = 0; w < n_words; w++) { 00084 if (!dict_real_word(dict, w)) 00085 continue; 00086 if (dict_is_single_phone(dict, w)) 00087 ++ngs->n_1ph_words; 00088 else { 00089 int ph0, ph1; 00090 ph0 = dict_first_phone(dict, w); 00091 ph1 = dict_second_phone(dict, w); 00092 /* Increment ndiph the first time we see a diphone. */ 00093 if (bitvec_is_clear(dimap, ph0 * n_ci + ph1)) { 00094 bitvec_set(dimap, ph0 * n_ci + ph1); 00095 ++ndiph; 00096 } 00097 } 00098 } 00099 E_INFO("%d unique initial diphones\n", ndiph); 00100 bitvec_free(dimap); 00101 00102 /* Add remaining dict words (</s>, <s>, <sil>, noise words) to single-phone words */ 00103 ngs->n_1ph_words += dict_num_fillers(dict) + 2; 00104 ngs->n_root_chan_alloc = ndiph + 1; 00105 /* Verify that these are all *actually* single-phone words, 00106 * otherwise really bad things will happen to us. */ 00107 for (w = 0; w < n_words; ++w) { 00108 if (dict_real_word(dict, w)) 00109 continue; 00110 if (!dict_is_single_phone(dict, w)) { 00111 E_WARN("Filler word %d = %s has more than one phone, ignoring it.\n", 00112 w, dict_wordstr(dict, w)); 00113 --ngs->n_1ph_words; 00114 } 00115 } 00116 00117 /* Allocate and initialize root channels */ 00118 ngs->root_chan = 00119 ckd_calloc(ngs->n_root_chan_alloc, sizeof(*ngs->root_chan)); 00120 for (i = 0; i < ngs->n_root_chan_alloc; i++) { 00121 hmm_init(ngs->hmmctx, &ngs->root_chan[i].hmm, TRUE, -1, -1); 00122 ngs->root_chan[i].penult_phn_wid = -1; 00123 ngs->root_chan[i].next = NULL; 00124 } 00125 00126 /* Permanently allocate and initialize channels for single-phone 00127 * words (1/word). */ 00128 ngs->rhmm_1ph = ckd_calloc(ngs->n_1ph_words, sizeof(*ngs->rhmm_1ph)); 00129 i = 0; 00130 for (w = 0; w < n_words; w++) { 00131 if (!dict_is_single_phone(dict, w)) 00132 continue; 00133 /* Use SIL as right context for these. */ 00134 ngs->rhmm_1ph[i].ci2phone = bin_mdef_silphone(ps_search_acmod(ngs)->mdef); 00135 ngs->rhmm_1ph[i].ciphone = dict_first_phone(dict, w); 00136 hmm_init(ngs->hmmctx, &ngs->rhmm_1ph[i].hmm, TRUE, 00137 bin_mdef_pid2ssid(ps_search_acmod(ngs)->mdef, ngs->rhmm_1ph[i].ciphone), 00138 bin_mdef_pid2tmatid(ps_search_acmod(ngs)->mdef, ngs->rhmm_1ph[i].ciphone)); 00139 ngs->rhmm_1ph[i].next = NULL; 00140 00141 ngs->word_chan[w] = (chan_t *) &(ngs->rhmm_1ph[i]); 00142 i++; 00143 } 00144 00145 ngs->single_phone_wid = ckd_calloc(ngs->n_1ph_words, 00146 sizeof(*ngs->single_phone_wid)); 00147 E_INFO("%d root, %d non-root channels, %d single-phone words\n", 00148 ngs->n_root_chan, ngs->n_nonroot_chan, ngs->n_1ph_words); 00149 } 00150 00151 /* 00152 * One-time initialization of internal channels in HMM tree. 00153 */ 00154 static void 00155 init_nonroot_chan(ngram_search_t *ngs, chan_t * hmm, int32 ph, int32 ci, int32 tmatid) 00156 { 00157 hmm->next = NULL; 00158 hmm->alt = NULL; 00159 hmm->info.penult_phn_wid = -1; 00160 hmm->ciphone = ci; 00161 hmm_init(ngs->hmmctx, &hmm->hmm, FALSE, ph, tmatid); 00162 } 00163 00164 /* 00165 * Allocate and initialize search channel-tree structure. 00166 * At this point, all the root-channels have been allocated and partly initialized 00167 * (as per init_search_tree()), and channels for all the single-phone words have been 00168 * allocated and initialized. None of the interior channels of search-trees have 00169 * been allocated. 00170 * This routine may be called on every utterance, after reinit_search_tree() clears 00171 * the search tree created for the previous utterance. Meant for reconfiguring the 00172 * search tree to suit the currently active LM. 00173 */ 00174 static void 00175 create_search_tree(ngram_search_t *ngs) 00176 { 00177 chan_t *hmm; 00178 root_chan_t *rhmm; 00179 int32 w, i, j, p, ph, tmatid; 00180 int32 n_words; 00181 dict_t *dict = ps_search_dict(ngs); 00182 dict2pid_t *d2p = ps_search_dict2pid(ngs); 00183 00184 n_words = ps_search_n_words(ngs); 00185 00186 E_INFO("Creating search tree\n"); 00187 00188 for (w = 0; w < n_words; w++) 00189 ngs->homophone_set[w] = -1; 00190 00191 E_INFO("before: %d root, %d non-root channels, %d single-phone words\n", 00192 ngs->n_root_chan, ngs->n_nonroot_chan, ngs->n_1ph_words); 00193 00194 ngs->n_1ph_LMwords = 0; 00195 ngs->n_root_chan = 0; 00196 ngs->n_nonroot_chan = 0; 00197 00198 for (w = 0; w < n_words; w++) { 00199 int ciphone, ci2phone; 00200 00201 /* Ignore dictionary words not in LM */ 00202 if (!ngram_model_set_known_wid(ngs->lmset, dict_basewid(dict, w))) 00203 continue; 00204 00205 /* Handle single-phone words individually; not in channel tree */ 00206 if (dict_is_single_phone(dict, w)) { 00207 E_DEBUG(1,("single_phone_wid[%d] = %s\n", 00208 ngs->n_1ph_LMwords, dict_wordstr(dict, w))); 00209 ngs->single_phone_wid[ngs->n_1ph_LMwords++] = w; 00210 continue; 00211 } 00212 00213 /* Find a root channel matching the initial diphone, or 00214 * allocate one if not found. */ 00215 ciphone = dict_first_phone(dict, w); 00216 ci2phone = dict_second_phone(dict, w); 00217 for (i = 0; i < ngs->n_root_chan; ++i) { 00218 if (ngs->root_chan[i].ciphone == ciphone 00219 && ngs->root_chan[i].ci2phone == ci2phone) 00220 break; 00221 } 00222 if (i == ngs->n_root_chan) { 00223 rhmm = &(ngs->root_chan[ngs->n_root_chan]); 00224 rhmm->hmm.tmatid = bin_mdef_pid2tmatid(ps_search_acmod(ngs)->mdef, ciphone); 00225 /* Begin with CI phone? Not sure this makes a difference... */ 00226 hmm_mpx_ssid(&rhmm->hmm, 0) = 00227 bin_mdef_pid2ssid(ps_search_acmod(ngs)->mdef, ciphone); 00228 rhmm->ciphone = ciphone; 00229 rhmm->ci2phone = ci2phone; 00230 ngs->n_root_chan++; 00231 } 00232 else 00233 rhmm = &(ngs->root_chan[i]); 00234 00235 E_DEBUG(3,("word %s rhmm %d\n", dict_wordstr(dict, w), rhmm - ngs->root_chan)); 00236 /* Now, rhmm = root channel for w. Go on to remaining phones */ 00237 if (dict_pronlen(dict, w) == 2) { 00238 /* Next phone is the last; not kept in tree; add w to penult_phn_wid set */ 00239 if ((j = rhmm->penult_phn_wid) < 0) 00240 rhmm->penult_phn_wid = w; 00241 else { 00242 for (; ngs->homophone_set[j] >= 0; j = ngs->homophone_set[j]); 00243 ngs->homophone_set[j] = w; 00244 } 00245 } 00246 else { 00247 /* Add remaining phones, except the last, to tree */ 00248 ph = dict2pid_internal(d2p, w, 1); 00249 tmatid = bin_mdef_pid2tmatid(ps_search_acmod(ngs)->mdef, dict_pron(dict, w, 1)); 00250 hmm = rhmm->next; 00251 if (hmm == NULL) { 00252 rhmm->next = hmm = listelem_malloc(ngs->chan_alloc); 00253 init_nonroot_chan(ngs, hmm, ph, dict_pron(dict, w, 1), tmatid); 00254 ngs->n_nonroot_chan++; 00255 } 00256 else { 00257 chan_t *prev_hmm = NULL; 00258 00259 for (; hmm && (hmm_nonmpx_ssid(&hmm->hmm) != ph); hmm = hmm->alt) 00260 prev_hmm = hmm; 00261 if (!hmm) { /* thanks, rkm! */ 00262 prev_hmm->alt = hmm = listelem_malloc(ngs->chan_alloc); 00263 init_nonroot_chan(ngs, hmm, ph, dict_pron(dict, w, 1), tmatid); 00264 ngs->n_nonroot_chan++; 00265 } 00266 } 00267 E_DEBUG(3,("phone %s = %d\n", 00268 bin_mdef_ciphone_str(ps_search_acmod(ngs)->mdef, 00269 dict_second_phone(dict, w)), ph)); 00270 for (p = 2; p < dict_pronlen(dict, w) - 1; p++) { 00271 ph = dict2pid_internal(d2p, w, p); 00272 tmatid = bin_mdef_pid2tmatid(ps_search_acmod(ngs)->mdef, dict_pron(dict, w, p)); 00273 if (!hmm->next) { 00274 hmm->next = listelem_malloc(ngs->chan_alloc); 00275 hmm = hmm->next; 00276 init_nonroot_chan(ngs, hmm, ph, dict_pron(dict, w, p), tmatid); 00277 ngs->n_nonroot_chan++; 00278 } 00279 else { 00280 chan_t *prev_hmm = NULL; 00281 00282 for (hmm = hmm->next; hmm && (hmm_nonmpx_ssid(&hmm->hmm) != ph); 00283 hmm = hmm->alt) 00284 prev_hmm = hmm; 00285 if (!hmm) { /* thanks, rkm! */ 00286 prev_hmm->alt = hmm = listelem_malloc(ngs->chan_alloc); 00287 init_nonroot_chan(ngs, hmm, ph, dict_pron(dict, w, p), tmatid); 00288 ngs->n_nonroot_chan++; 00289 } 00290 } 00291 E_DEBUG(3,("phone %s = %d\n", 00292 bin_mdef_ciphone_str(ps_search_acmod(ngs)->mdef, 00293 dict_pron(dict, w, p)), ph)); 00294 } 00295 00296 /* All but last phone of w in tree; add w to hmm->info.penult_phn_wid set */ 00297 if ((j = hmm->info.penult_phn_wid) < 0) 00298 hmm->info.penult_phn_wid = w; 00299 else { 00300 for (; ngs->homophone_set[j] >= 0; j = ngs->homophone_set[j]); 00301 ngs->homophone_set[j] = w; 00302 } 00303 } 00304 } 00305 00306 ngs->n_1ph_words = ngs->n_1ph_LMwords; 00307 00308 /* Add filler words to the array of 1ph words. */ 00309 for (w = 0; w < n_words; ++w) { 00310 /* Skip anything that doesn't actually have a single phone. */ 00311 if (!dict_is_single_phone(dict, w)) 00312 continue; 00313 /* Also skip "real words" and things that are in the LM. */ 00314 if (dict_real_word(dict, w)) 00315 continue; 00316 if (ngram_model_set_known_wid(ngs->lmset, dict_basewid(dict, w))) 00317 continue; 00318 E_DEBUG(1,("single_phone_wid[%d] = %s\n", 00319 ngs->n_1ph_words, dict_wordstr(dict, w))); 00320 ngs->single_phone_wid[ngs->n_1ph_words++] = w; 00321 } 00322 00323 if (ngs->n_nonroot_chan >= ngs->max_nonroot_chan) { 00324 /* Give some room for channels for new words added dynamically at run time */ 00325 ngs->max_nonroot_chan = ngs->n_nonroot_chan + 128; 00326 E_INFO("after: max nonroot chan increased to %d\n", ngs->max_nonroot_chan); 00327 00328 /* Free old active channel list array if any and allocate new one */ 00329 if (ngs->active_chan_list) 00330 ckd_free_2d(ngs->active_chan_list); 00331 ngs->active_chan_list = ckd_calloc_2d(2, ngs->max_nonroot_chan, 00332 sizeof(**ngs->active_chan_list)); 00333 } 00334 00335 if (!ngs->n_root_chan) 00336 E_ERROR("No word from the language model has pronunciation in the dictionary\n"); 00337 00338 E_INFO("after: %d root, %d non-root channels, %d single-phone words\n", 00339 ngs->n_root_chan, ngs->n_nonroot_chan, ngs->n_1ph_words); 00340 } 00341 00342 static void 00343 reinit_search_subtree(ngram_search_t *ngs, chan_t * hmm) 00344 { 00345 chan_t *child, *sibling; 00346 00347 /* First free all children under hmm */ 00348 for (child = hmm->next; child; child = sibling) { 00349 sibling = child->alt; 00350 reinit_search_subtree(ngs, child); 00351 } 00352 00353 /* Now free hmm */ 00354 hmm_deinit(&hmm->hmm); 00355 listelem_free(ngs->chan_alloc, hmm); 00356 } 00357 00358 /* 00359 * Delete search tree by freeing all interior channels within search tree and 00360 * restoring root channel state to the init state (i.e., just after init_search_tree()). 00361 */ 00362 static void 00363 reinit_search_tree(ngram_search_t *ngs) 00364 { 00365 int32 i; 00366 chan_t *hmm, *sibling; 00367 00368 for (i = 0; i < ngs->n_root_chan; i++) { 00369 hmm = ngs->root_chan[i].next; 00370 00371 while (hmm) { 00372 sibling = hmm->alt; 00373 reinit_search_subtree(ngs, hmm); 00374 hmm = sibling; 00375 } 00376 00377 ngs->root_chan[i].penult_phn_wid = -1; 00378 ngs->root_chan[i].next = NULL; 00379 } 00380 ngs->n_nonroot_chan = 0; 00381 } 00382 00383 void 00384 ngram_fwdtree_init(ngram_search_t *ngs) 00385 { 00386 /* Allocate bestbp_rc, lastphn_cand, last_ltrans */ 00387 ngs->bestbp_rc = ckd_calloc(bin_mdef_n_ciphone(ps_search_acmod(ngs)->mdef), 00388 sizeof(*ngs->bestbp_rc)); 00389 ngs->lastphn_cand = ckd_calloc(ps_search_n_words(ngs), 00390 sizeof(*ngs->lastphn_cand)); 00391 init_search_tree(ngs); 00392 create_search_tree(ngs); 00393 } 00394 00395 static void 00396 deinit_search_tree(ngram_search_t *ngs) 00397 { 00398 int i, w, n_words; 00399 00400 n_words = ps_search_n_words(ngs); 00401 for (i = 0; i < ngs->n_root_chan_alloc; i++) { 00402 hmm_deinit(&ngs->root_chan[i].hmm); 00403 } 00404 if (ngs->rhmm_1ph) { 00405 for (i = w = 0; w < n_words; ++w) { 00406 if (!dict_is_single_phone(ps_search_dict(ngs), w)) 00407 continue; 00408 hmm_deinit(&ngs->rhmm_1ph[i].hmm); 00409 ++i; 00410 } 00411 ckd_free(ngs->rhmm_1ph); 00412 ngs->rhmm_1ph = NULL; 00413 } 00414 ngs->n_root_chan = 0; 00415 ngs->n_root_chan_alloc = 0; 00416 ckd_free(ngs->root_chan); 00417 ngs->root_chan = NULL; 00418 ckd_free(ngs->single_phone_wid); 00419 ngs->single_phone_wid = NULL; 00420 ckd_free(ngs->homophone_set); 00421 ngs->homophone_set = NULL; 00422 } 00423 00424 void 00425 ngram_fwdtree_deinit(ngram_search_t *ngs) 00426 { 00427 double n_speech = (double)ngs->n_tot_frame 00428 / cmd_ln_int32_r(ps_search_config(ngs), "-frate"); 00429 00430 E_INFO("TOTAL fwdtree %.2f CPU %.3f xRT\n", 00431 ngs->fwdtree_perf.t_tot_cpu, 00432 ngs->fwdtree_perf.t_tot_cpu / n_speech); 00433 E_INFO("TOTAL fwdtree %.2f wall %.3f xRT\n", 00434 ngs->fwdtree_perf.t_tot_elapsed, 00435 ngs->fwdtree_perf.t_tot_elapsed / n_speech); 00436 00437 /* Reset non-root channels. */ 00438 reinit_search_tree(ngs); 00439 /* Free the search tree. */ 00440 deinit_search_tree(ngs); 00441 /* Free other stuff. */ 00442 ngs->max_nonroot_chan = 0; 00443 ckd_free_2d(ngs->active_chan_list); 00444 ngs->active_chan_list = NULL; 00445 ckd_free(ngs->cand_sf); 00446 ngs->cand_sf = NULL; 00447 ckd_free(ngs->bestbp_rc); 00448 ngs->bestbp_rc = NULL; 00449 ckd_free(ngs->lastphn_cand); 00450 ngs->lastphn_cand = NULL; 00451 } 00452 00453 int 00454 ngram_fwdtree_reinit(ngram_search_t *ngs) 00455 { 00456 /* Reset non-root channels. */ 00457 reinit_search_tree(ngs); 00458 /* Free the search tree. */ 00459 deinit_search_tree(ngs); 00460 /* Reallocate things that depend on the number of words. */ 00461 ckd_free(ngs->lastphn_cand); 00462 ngs->lastphn_cand = ckd_calloc(ps_search_n_words(ngs), 00463 sizeof(*ngs->lastphn_cand)); 00464 ckd_free(ngs->word_chan); 00465 ngs->word_chan = ckd_calloc(ps_search_n_words(ngs), 00466 sizeof(*ngs->word_chan)); 00467 /* Rebuild the search tree. */ 00468 init_search_tree(ngs); 00469 create_search_tree(ngs); 00470 return 0; 00471 } 00472 00473 void 00474 ngram_fwdtree_start(ngram_search_t *ngs) 00475 { 00476 ps_search_t *base = (ps_search_t *)ngs; 00477 int32 i, w, n_words; 00478 root_chan_t *rhmm; 00479 00480 n_words = ps_search_n_words(ngs); 00481 00482 /* Reset utterance statistics. */ 00483 memset(&ngs->st, 0, sizeof(ngs->st)); 00484 ptmr_reset(&ngs->fwdtree_perf); 00485 ptmr_start(&ngs->fwdtree_perf); 00486 00487 /* Reset backpointer table. */ 00488 ngs->bpidx = 0; 00489 ngs->bss_head = 0; 00490 00491 /* Reset word lattice. */ 00492 for (i = 0; i < n_words; ++i) 00493 ngs->word_lat_idx[i] = NO_BP; 00494 00495 /* Reset active HMM and word lists. */ 00496 ngs->n_active_chan[0] = ngs->n_active_chan[1] = 0; 00497 ngs->n_active_word[0] = ngs->n_active_word[1] = 0; 00498 00499 /* Reset scores. */ 00500 ngs->best_score = 0; 00501 ngs->renormalized = 0; 00502 00503 /* Reset other stuff. */ 00504 for (i = 0; i < n_words; i++) 00505 ngs->last_ltrans[i].sf = -1; 00506 ngs->n_frame = 0; 00507 00508 /* Clear the hypothesis string. */ 00509 ckd_free(base->hyp_str); 00510 base->hyp_str = NULL; 00511 00512 /* Reset the permanently allocated single-phone words, since they 00513 * may have junk left over in them from FWDFLAT. */ 00514 for (i = 0; i < ngs->n_1ph_words; i++) { 00515 w = ngs->single_phone_wid[i]; 00516 rhmm = (root_chan_t *) ngs->word_chan[w]; 00517 hmm_clear(&rhmm->hmm); 00518 } 00519 00520 /* Start search with <s>; word_chan[<s>] is permanently allocated */ 00521 rhmm = (root_chan_t *) ngs->word_chan[dict_startwid(ps_search_dict(ngs))]; 00522 hmm_clear(&rhmm->hmm); 00523 hmm_enter(&rhmm->hmm, 0, NO_BP, 0); 00524 } 00525 00526 /* 00527 * Mark the active senones for all senones belonging to channels that are active in the 00528 * current frame. 00529 */ 00530 static void 00531 compute_sen_active(ngram_search_t *ngs, int frame_idx) 00532 { 00533 root_chan_t *rhmm; 00534 chan_t *hmm, **acl; 00535 int32 i, w, *awl; 00536 00537 acmod_clear_active(ps_search_acmod(ngs)); 00538 00539 /* Flag active senones for root channels */ 00540 for (i = ngs->n_root_chan, rhmm = ngs->root_chan; i > 0; --i, rhmm++) { 00541 if (hmm_frame(&rhmm->hmm) == frame_idx) 00542 acmod_activate_hmm(ps_search_acmod(ngs), &rhmm->hmm); 00543 } 00544 00545 /* Flag active senones for nonroot channels in HMM tree */ 00546 i = ngs->n_active_chan[frame_idx & 0x1]; 00547 acl = ngs->active_chan_list[frame_idx & 0x1]; 00548 for (hmm = *(acl++); i > 0; --i, hmm = *(acl++)) { 00549 acmod_activate_hmm(ps_search_acmod(ngs), &hmm->hmm); 00550 } 00551 00552 /* Flag active senones for individual word channels */ 00553 i = ngs->n_active_word[frame_idx & 0x1]; 00554 awl = ngs->active_word_list[frame_idx & 0x1]; 00555 for (w = *(awl++); i > 0; --i, w = *(awl++)) { 00556 for (hmm = ngs->word_chan[w]; hmm; hmm = hmm->next) { 00557 acmod_activate_hmm(ps_search_acmod(ngs), &hmm->hmm); 00558 } 00559 } 00560 for (i = 0; i < ngs->n_1ph_words; i++) { 00561 w = ngs->single_phone_wid[i]; 00562 rhmm = (root_chan_t *) ngs->word_chan[w]; 00563 00564 if (hmm_frame(&rhmm->hmm) == frame_idx) 00565 acmod_activate_hmm(ps_search_acmod(ngs), &rhmm->hmm); 00566 } 00567 } 00568 00569 static void 00570 renormalize_scores(ngram_search_t *ngs, int frame_idx, int32 norm) 00571 { 00572 root_chan_t *rhmm; 00573 chan_t *hmm, **acl; 00574 int32 i, w, *awl; 00575 00576 /* Renormalize root channels */ 00577 for (i = ngs->n_root_chan, rhmm = ngs->root_chan; i > 0; --i, rhmm++) { 00578 if (hmm_frame(&rhmm->hmm) == frame_idx) { 00579 hmm_normalize(&rhmm->hmm, norm); 00580 } 00581 } 00582 00583 /* Renormalize nonroot channels in HMM tree */ 00584 i = ngs->n_active_chan[frame_idx & 0x1]; 00585 acl = ngs->active_chan_list[frame_idx & 0x1]; 00586 for (hmm = *(acl++); i > 0; --i, hmm = *(acl++)) { 00587 hmm_normalize(&hmm->hmm, norm); 00588 } 00589 00590 /* Renormalize individual word channels */ 00591 i = ngs->n_active_word[frame_idx & 0x1]; 00592 awl = ngs->active_word_list[frame_idx & 0x1]; 00593 for (w = *(awl++); i > 0; --i, w = *(awl++)) { 00594 for (hmm = ngs->word_chan[w]; hmm; hmm = hmm->next) { 00595 hmm_normalize(&hmm->hmm, norm); 00596 } 00597 } 00598 for (i = 0; i < ngs->n_1ph_words; i++) { 00599 w = ngs->single_phone_wid[i]; 00600 rhmm = (root_chan_t *) ngs->word_chan[w]; 00601 if (hmm_frame(&rhmm->hmm) == frame_idx) { 00602 hmm_normalize(&rhmm->hmm, norm); 00603 } 00604 } 00605 00606 ngs->renormalized = TRUE; 00607 } 00608 00609 static int32 00610 eval_root_chan(ngram_search_t *ngs, int frame_idx) 00611 { 00612 root_chan_t *rhmm; 00613 int32 i, bestscore; 00614 00615 bestscore = WORST_SCORE; 00616 for (i = ngs->n_root_chan, rhmm = ngs->root_chan; i > 0; --i, rhmm++) { 00617 if (hmm_frame(&rhmm->hmm) == frame_idx) { 00618 int32 score = chan_v_eval(rhmm); 00619 if (score BETTER_THAN bestscore) 00620 bestscore = score; 00621 ++ngs->st.n_root_chan_eval; 00622 } 00623 } 00624 return (bestscore); 00625 } 00626 00627 static int32 00628 eval_nonroot_chan(ngram_search_t *ngs, int frame_idx) 00629 { 00630 chan_t *hmm, **acl; 00631 int32 i, bestscore; 00632 00633 i = ngs->n_active_chan[frame_idx & 0x1]; 00634 acl = ngs->active_chan_list[frame_idx & 0x1]; 00635 bestscore = WORST_SCORE; 00636 ngs->st.n_nonroot_chan_eval += i; 00637 00638 for (hmm = *(acl++); i > 0; --i, hmm = *(acl++)) { 00639 int32 score = chan_v_eval(hmm); 00640 assert(hmm_frame(&hmm->hmm) == frame_idx); 00641 if (score BETTER_THAN bestscore) 00642 bestscore = score; 00643 } 00644 00645 return bestscore; 00646 } 00647 00648 static int32 00649 eval_word_chan(ngram_search_t *ngs, int frame_idx) 00650 { 00651 root_chan_t *rhmm; 00652 chan_t *hmm; 00653 int32 i, w, bestscore, *awl, j, k; 00654 00655 k = 0; 00656 bestscore = WORST_SCORE; 00657 awl = ngs->active_word_list[frame_idx & 0x1]; 00658 00659 i = ngs->n_active_word[frame_idx & 0x1]; 00660 for (w = *(awl++); i > 0; --i, w = *(awl++)) { 00661 assert(bitvec_is_set(ngs->word_active, w)); 00662 bitvec_clear(ngs->word_active, w); 00663 assert(ngs->word_chan[w] != NULL); 00664 00665 for (hmm = ngs->word_chan[w]; hmm; hmm = hmm->next) { 00666 int32 score; 00667 00668 assert(hmm_frame(&hmm->hmm) == frame_idx); 00669 score = chan_v_eval(hmm); 00670 /*printf("eval word chan %d score %d\n", w, score); */ 00671 00672 if (score BETTER_THAN bestscore) 00673 bestscore = score; 00674 00675 k++; 00676 } 00677 } 00678 00679 /* Similarly for statically allocated single-phone words */ 00680 j = 0; 00681 for (i = 0; i < ngs->n_1ph_words; i++) { 00682 int32 score; 00683 00684 w = ngs->single_phone_wid[i]; 00685 rhmm = (root_chan_t *) ngs->word_chan[w]; 00686 if (hmm_frame(&rhmm->hmm) < frame_idx) 00687 continue; 00688 00689 score = chan_v_eval(rhmm); 00690 /* printf("eval 1ph word chan %d score %d\n", w, score); */ 00691 if (score BETTER_THAN bestscore && w != ps_search_finish_wid(ngs)) 00692 bestscore = score; 00693 00694 j++; 00695 } 00696 00697 ngs->st.n_last_chan_eval += k + j; 00698 ngs->st.n_nonroot_chan_eval += k + j; 00699 ngs->st.n_word_lastchan_eval += 00700 ngs->n_active_word[frame_idx & 0x1] + j; 00701 00702 return bestscore; 00703 } 00704 00705 static int32 00706 evaluate_channels(ngram_search_t *ngs, int16 const *senone_scores, int frame_idx) 00707 { 00708 int32 bs; 00709 00710 hmm_context_set_senscore(ngs->hmmctx, senone_scores); 00711 ngs->best_score = eval_root_chan(ngs, frame_idx); 00712 if ((bs = eval_nonroot_chan(ngs, frame_idx)) BETTER_THAN ngs->best_score) 00713 ngs->best_score = bs; 00714 if ((bs = eval_word_chan(ngs, frame_idx)) BETTER_THAN ngs->best_score) 00715 ngs->best_score = bs; 00716 ngs->last_phone_best_score = bs; 00717 00718 return ngs->best_score; 00719 } 00720 00721 /* 00722 * Prune currently active root channels for next frame. Also, perform exit 00723 * transitions out of them and activate successors. 00724 * score[] of pruned root chan set to WORST_SCORE elsewhere. 00725 */ 00726 static void 00727 prune_root_chan(ngram_search_t *ngs, int frame_idx) 00728 { 00729 root_chan_t *rhmm; 00730 chan_t *hmm; 00731 int32 i, nf, w; 00732 int32 thresh, newphone_thresh, lastphn_thresh, newphone_score; 00733 chan_t **nacl; /* next active list */ 00734 lastphn_cand_t *candp; 00735 phone_loop_search_t *pls; 00736 00737 nf = frame_idx + 1; 00738 thresh = ngs->best_score + ngs->dynamic_beam; 00739 newphone_thresh = ngs->best_score + ngs->pbeam; 00740 lastphn_thresh = ngs->best_score + ngs->lpbeam; 00741 nacl = ngs->active_chan_list[nf & 0x1]; 00742 pls = (phone_loop_search_t *)ps_search_lookahead(ngs); 00743 00744 for (i = 0, rhmm = ngs->root_chan; i < ngs->n_root_chan; i++, rhmm++) { 00745 E_DEBUG(3,("Root channel %d frame %d score %d thresh %d\n", 00746 i, hmm_frame(&rhmm->hmm), hmm_bestscore(&rhmm->hmm), thresh)); 00747 /* First check if this channel was active in current frame */ 00748 if (hmm_frame(&rhmm->hmm) < frame_idx) 00749 continue; 00750 00751 if (hmm_bestscore(&rhmm->hmm) BETTER_THAN thresh) { 00752 hmm_frame(&rhmm->hmm) = nf; /* rhmm will be active in next frame */ 00753 E_DEBUG(3,("Preserving root channel %d score %d\n", i, hmm_bestscore(&rhmm->hmm))); 00754 /* transitions out of this root channel */ 00755 /* transition to all next-level channels in the HMM tree */ 00756 newphone_score = hmm_out_score(&rhmm->hmm) + ngs->pip; 00757 if (pls != NULL || newphone_score BETTER_THAN newphone_thresh) { 00758 for (hmm = rhmm->next; hmm; hmm = hmm->alt) { 00759 int32 pl_newphone_score = newphone_score 00760 + phone_loop_search_score(pls, hmm->ciphone); 00761 if (pl_newphone_score BETTER_THAN newphone_thresh) { 00762 if ((hmm_frame(&hmm->hmm) < frame_idx) 00763 || (pl_newphone_score BETTER_THAN hmm_in_score(&hmm->hmm))) { 00764 hmm_enter(&hmm->hmm, pl_newphone_score, 00765 hmm_out_history(&rhmm->hmm), nf); 00766 *(nacl++) = hmm; 00767 } 00768 } 00769 } 00770 } 00771 00772 /* 00773 * Transition to last phone of all words for which this is the 00774 * penultimate phone (the last phones may need multiple right contexts). 00775 * Remember to remove the temporary newword_penalty. 00776 */ 00777 if (pls != NULL || newphone_score BETTER_THAN lastphn_thresh) { 00778 for (w = rhmm->penult_phn_wid; w >= 0; 00779 w = ngs->homophone_set[w]) { 00780 int32 pl_newphone_score = newphone_score 00781 + phone_loop_search_score 00782 (pls, dict_last_phone(ps_search_dict(ngs),w)); 00783 E_DEBUG(3,("word %s newphone_score %d\n", dict_wordstr(ps_search_dict(ngs), w), newphone_score)); 00784 if (pl_newphone_score BETTER_THAN lastphn_thresh) { 00785 candp = ngs->lastphn_cand + ngs->n_lastphn_cand; 00786 ngs->n_lastphn_cand++; 00787 candp->wid = w; 00788 candp->score = 00789 pl_newphone_score - ngs->nwpen; 00790 candp->bp = hmm_out_history(&rhmm->hmm); 00791 } 00792 } 00793 } 00794 } 00795 } 00796 ngs->n_active_chan[nf & 0x1] = nacl - ngs->active_chan_list[nf & 0x1]; 00797 } 00798 00799 /* 00800 * Prune currently active nonroot channels in HMM tree for next frame. Also, perform 00801 * exit transitions out of such channels and activate successors. 00802 */ 00803 static void 00804 prune_nonroot_chan(ngram_search_t *ngs, int frame_idx) 00805 { 00806 chan_t *hmm, *nexthmm; 00807 int32 nf, w, i; 00808 int32 thresh, newphone_thresh, lastphn_thresh, newphone_score; 00809 chan_t **acl, **nacl; /* active list, next active list */ 00810 lastphn_cand_t *candp; 00811 phone_loop_search_t *pls; 00812 00813 nf = frame_idx + 1; 00814 00815 thresh = ngs->best_score + ngs->dynamic_beam; 00816 newphone_thresh = ngs->best_score + ngs->pbeam; 00817 lastphn_thresh = ngs->best_score + ngs->lpbeam; 00818 pls = (phone_loop_search_t *)ps_search_lookahead(ngs); 00819 00820 acl = ngs->active_chan_list[frame_idx & 0x1]; /* currently active HMMs in tree */ 00821 nacl = ngs->active_chan_list[nf & 0x1] + ngs->n_active_chan[nf & 0x1]; 00822 00823 for (i = ngs->n_active_chan[frame_idx & 0x1], hmm = *(acl++); i > 0; 00824 --i, hmm = *(acl++)) { 00825 assert(hmm_frame(&hmm->hmm) >= frame_idx); 00826 00827 if (hmm_bestscore(&hmm->hmm) BETTER_THAN thresh) { 00828 /* retain this channel in next frame */ 00829 if (hmm_frame(&hmm->hmm) != nf) { 00830 hmm_frame(&hmm->hmm) = nf; 00831 *(nacl++) = hmm; 00832 } 00833 00834 /* transition to all next-level channel in the HMM tree */ 00835 newphone_score = hmm_out_score(&hmm->hmm) + ngs->pip; 00836 if (pls != NULL || newphone_score BETTER_THAN newphone_thresh) { 00837 for (nexthmm = hmm->next; nexthmm; nexthmm = nexthmm->alt) { 00838 int32 pl_newphone_score = newphone_score 00839 + phone_loop_search_score(pls, nexthmm->ciphone); 00840 if ((pl_newphone_score BETTER_THAN newphone_thresh) 00841 && ((hmm_frame(&nexthmm->hmm) < frame_idx) 00842 || (pl_newphone_score 00843 BETTER_THAN hmm_in_score(&nexthmm->hmm)))) { 00844 if (hmm_frame(&nexthmm->hmm) != nf) { 00845 /* Keep this HMM on the active list */ 00846 *(nacl++) = nexthmm; 00847 } 00848 hmm_enter(&nexthmm->hmm, pl_newphone_score, 00849 hmm_out_history(&hmm->hmm), nf); 00850 } 00851 } 00852 } 00853 00854 /* 00855 * Transition to last phone of all words for which this is the 00856 * penultimate phone (the last phones may need multiple right contexts). 00857 * Remember to remove the temporary newword_penalty. 00858 */ 00859 if (pls != NULL || newphone_score BETTER_THAN lastphn_thresh) { 00860 for (w = hmm->info.penult_phn_wid; w >= 0; 00861 w = ngs->homophone_set[w]) { 00862 int32 pl_newphone_score = newphone_score 00863 + phone_loop_search_score 00864 (pls, dict_last_phone(ps_search_dict(ngs),w)); 00865 if (pl_newphone_score BETTER_THAN lastphn_thresh) { 00866 candp = ngs->lastphn_cand + ngs->n_lastphn_cand; 00867 ngs->n_lastphn_cand++; 00868 candp->wid = w; 00869 candp->score = 00870 pl_newphone_score - ngs->nwpen; 00871 candp->bp = hmm_out_history(&hmm->hmm); 00872 } 00873 } 00874 } 00875 } 00876 else if (hmm_frame(&hmm->hmm) != nf) { 00877 hmm_clear(&hmm->hmm); 00878 } 00879 } 00880 ngs->n_active_chan[nf & 0x1] = nacl - ngs->active_chan_list[nf & 0x1]; 00881 } 00882 00883 /* 00884 * Execute the transition into the last phone for all candidates words emerging from 00885 * the HMM tree. Attach LM scores to such transitions. 00886 * (Executed after pruning root and non-root, but before pruning word-chan.) 00887 */ 00888 static void 00889 last_phone_transition(ngram_search_t *ngs, int frame_idx) 00890 { 00891 int32 i, j, k, nf, bp, bpend, w; 00892 lastphn_cand_t *candp; 00893 int32 *nawl; 00894 int32 thresh; 00895 int32 bestscore, dscr; 00896 chan_t *hmm; 00897 bptbl_t *bpe; 00898 int32 n_cand_sf = 0; 00899 00900 nf = frame_idx + 1; 00901 nawl = ngs->active_word_list[nf & 0x1]; 00902 ngs->st.n_lastphn_cand_utt += ngs->n_lastphn_cand; 00903 00904 /* For each candidate word (entering its last phone) */ 00905 /* If best LM score and bp for candidate known use it, else sort cands by startfrm */ 00906 for (i = 0, candp = ngs->lastphn_cand; i < ngs->n_lastphn_cand; i++, candp++) { 00907 int32 start_score; 00908 00909 /* This can happen if recognition fails. */ 00910 if (candp->bp == -1) 00911 continue; 00912 /* Backpointer entry for it. */ 00913 bpe = &(ngs->bp_table[candp->bp]); 00914 00915 /* Subtract starting score for candidate, leave it with only word score */ 00916 start_score = ngram_search_exit_score 00917 (ngs, bpe, dict_first_phone(ps_search_dict(ngs), candp->wid)); 00918 assert(start_score BETTER_THAN WORST_SCORE); 00919 candp->score -= start_score; 00920 00921 /* 00922 * If this candidate not occurred in an earlier frame, prepare for finding 00923 * best transition score into last phone; sort by start frame. 00924 */ 00925 /* i.e. if we don't have an entry in last_ltrans for this 00926 * <word,sf>, then create one */ 00927 if (ngs->last_ltrans[candp->wid].sf != bpe->frame + 1) { 00928 /* Look for an entry in cand_sf matching the backpointer 00929 * for this candidate. */ 00930 for (j = 0; j < n_cand_sf; j++) { 00931 if (ngs->cand_sf[j].bp_ef == bpe->frame) 00932 break; 00933 } 00934 /* Oh, we found one, so chain onto it. */ 00935 if (j < n_cand_sf) 00936 candp->next = ngs->cand_sf[j].cand; 00937 else { 00938 /* Nope, let's make a new one, allocating cand_sf if necessary. */ 00939 if (n_cand_sf >= ngs->cand_sf_alloc) { 00940 if (ngs->cand_sf_alloc == 0) { 00941 ngs->cand_sf = 00942 ckd_calloc(CAND_SF_ALLOCSIZE, 00943 sizeof(*ngs->cand_sf)); 00944 ngs->cand_sf_alloc = CAND_SF_ALLOCSIZE; 00945 } 00946 else { 00947 ngs->cand_sf_alloc += CAND_SF_ALLOCSIZE; 00948 ngs->cand_sf = ckd_realloc(ngs->cand_sf, 00949 ngs->cand_sf_alloc 00950 * sizeof(*ngs->cand_sf)); 00951 E_INFO("cand_sf[] increased to %d entries\n", 00952 ngs->cand_sf_alloc); 00953 } 00954 } 00955 00956 /* Use the newly created cand_sf. */ 00957 j = n_cand_sf++; 00958 candp->next = -1; /* End of the chain. */ 00959 ngs->cand_sf[j].bp_ef = bpe->frame; 00960 } 00961 /* Update it to point to this candidate. */ 00962 ngs->cand_sf[j].cand = i; 00963 00964 ngs->last_ltrans[candp->wid].dscr = WORST_SCORE; 00965 ngs->last_ltrans[candp->wid].sf = bpe->frame + 1; 00966 } 00967 } 00968 00969 /* Compute best LM score and bp for new cands entered in the sorted lists above */ 00970 for (i = 0; i < n_cand_sf; i++) { 00971 /* For the i-th unique end frame... */ 00972 bp = ngs->bp_table_idx[ngs->cand_sf[i].bp_ef]; 00973 bpend = ngs->bp_table_idx[ngs->cand_sf[i].bp_ef + 1]; 00974 for (bpe = &(ngs->bp_table[bp]); bp < bpend; bp++, bpe++) { 00975 if (!bpe->valid) 00976 continue; 00977 /* For each candidate at the start frame find bp->cand transition-score */ 00978 for (j = ngs->cand_sf[i].cand; j >= 0; j = candp->next) { 00979 int32 n_used; 00980 candp = &(ngs->lastphn_cand[j]); 00981 dscr = 00982 ngram_search_exit_score 00983 (ngs, bpe, dict_first_phone(ps_search_dict(ngs), candp->wid)); 00984 if (dscr BETTER_THAN WORST_SCORE) { 00985 assert(!dict_filler_word(ps_search_dict(ngs), candp->wid)); 00986 dscr += ngram_tg_score(ngs->lmset, 00987 dict_basewid(ps_search_dict(ngs), candp->wid), 00988 bpe->real_wid, 00989 bpe->prev_real_wid, 00990 &n_used)>>SENSCR_SHIFT; 00991 } 00992 00993 if (dscr BETTER_THAN ngs->last_ltrans[candp->wid].dscr) { 00994 ngs->last_ltrans[candp->wid].dscr = dscr; 00995 ngs->last_ltrans[candp->wid].bp = bp; 00996 } 00997 } 00998 } 00999 } 01000 01001 /* Update best transitions for all candidates; also update best lastphone score */ 01002 bestscore = ngs->last_phone_best_score; 01003 for (i = 0, candp = ngs->lastphn_cand; i < ngs->n_lastphn_cand; i++, candp++) { 01004 candp->score += ngs->last_ltrans[candp->wid].dscr; 01005 candp->bp = ngs->last_ltrans[candp->wid].bp; 01006 01007 if (candp->score BETTER_THAN bestscore) 01008 bestscore = candp->score; 01009 } 01010 ngs->last_phone_best_score = bestscore; 01011 01012 /* At this pt, we know the best entry score (with LM component) for all candidates */ 01013 thresh = bestscore + ngs->lponlybeam; 01014 for (i = ngs->n_lastphn_cand, candp = ngs->lastphn_cand; i > 0; --i, candp++) { 01015 if (candp->score BETTER_THAN thresh) { 01016 w = candp->wid; 01017 01018 ngram_search_alloc_all_rc(ngs, w); 01019 01020 k = 0; 01021 for (hmm = ngs->word_chan[w]; hmm; hmm = hmm->next) { 01022 if ((hmm_frame(&hmm->hmm) < frame_idx) 01023 || (candp->score BETTER_THAN hmm_in_score(&hmm->hmm))) { 01024 assert(hmm_frame(&hmm->hmm) != nf); 01025 hmm_enter(&hmm->hmm, 01026 candp->score, candp->bp, nf); 01027 k++; 01028 } 01029 } 01030 if (k > 0) { 01031 assert(bitvec_is_clear(ngs->word_active, w)); 01032 assert(!dict_is_single_phone(ps_search_dict(ngs), w)); 01033 *(nawl++) = w; 01034 bitvec_set(ngs->word_active, w); 01035 } 01036 } 01037 } 01038 ngs->n_active_word[nf & 0x1] = nawl - ngs->active_word_list[nf & 0x1]; 01039 } 01040 01041 /* 01042 * Prune currently active word channels for next frame. Also, perform exit 01043 * transitions out of such channels and active successors. 01044 */ 01045 static void 01046 prune_word_chan(ngram_search_t *ngs, int frame_idx) 01047 { 01048 root_chan_t *rhmm; 01049 chan_t *hmm, *thmm; 01050 chan_t **phmmp; /* previous HMM-pointer */ 01051 int32 nf, w, i, k; 01052 int32 newword_thresh, lastphn_thresh; 01053 int32 *awl, *nawl; 01054 01055 nf = frame_idx + 1; 01056 newword_thresh = ngs->last_phone_best_score + ngs->wbeam; 01057 lastphn_thresh = ngs->last_phone_best_score + ngs->lponlybeam; 01058 01059 awl = ngs->active_word_list[frame_idx & 0x1]; 01060 nawl = ngs->active_word_list[nf & 0x1] + ngs->n_active_word[nf & 0x1]; 01061 01062 /* Dynamically allocated last channels of multi-phone words */ 01063 for (i = ngs->n_active_word[frame_idx & 0x1], w = *(awl++); i > 0; 01064 --i, w = *(awl++)) { 01065 k = 0; 01066 phmmp = &(ngs->word_chan[w]); 01067 for (hmm = ngs->word_chan[w]; hmm; hmm = thmm) { 01068 assert(hmm_frame(&hmm->hmm) >= frame_idx); 01069 01070 thmm = hmm->next; 01071 if (hmm_bestscore(&hmm->hmm) BETTER_THAN lastphn_thresh) { 01072 /* retain this channel in next frame */ 01073 hmm_frame(&hmm->hmm) = nf; 01074 k++; 01075 phmmp = &(hmm->next); 01076 01077 /* Could if ((! skip_alt_frm) || (frame_idx & 0x1)) the following */ 01078 if (hmm_out_score(&hmm->hmm) BETTER_THAN newword_thresh) { 01079 /* can exit channel and recognize word */ 01080 ngram_search_save_bp(ngs, frame_idx, w, 01081 hmm_out_score(&hmm->hmm), 01082 hmm_out_history(&hmm->hmm), 01083 hmm->info.rc_id); 01084 } 01085 } 01086 else if (hmm_frame(&hmm->hmm) == nf) { 01087 phmmp = &(hmm->next); 01088 } 01089 else { 01090 hmm_deinit(&hmm->hmm); 01091 listelem_free(ngs->chan_alloc, hmm); 01092 *phmmp = thmm; 01093 } 01094 } 01095 if ((k > 0) && (bitvec_is_clear(ngs->word_active, w))) { 01096 assert(!dict_is_single_phone(ps_search_dict(ngs), w)); 01097 *(nawl++) = w; 01098 bitvec_set(ngs->word_active, w); 01099 } 01100 } 01101 ngs->n_active_word[nf & 0x1] = nawl - ngs->active_word_list[nf & 0x1]; 01102 01103 /* 01104 * Prune permanently allocated single-phone channels. 01105 * NOTES: score[] of pruned channels set to WORST_SCORE elsewhere. 01106 */ 01107 for (i = 0; i < ngs->n_1ph_words; i++) { 01108 w = ngs->single_phone_wid[i]; 01109 rhmm = (root_chan_t *) ngs->word_chan[w]; 01110 E_DEBUG(3,("Single phone word %s frame %d score %d thresh %d outscore %d nwthresh %d\n", 01111 dict_wordstr(ps_search_dict(ngs),w), 01112 hmm_frame(&rhmm->hmm), hmm_bestscore(&rhmm->hmm), 01113 lastphn_thresh, hmm_out_score(&rhmm->hmm), newword_thresh)); 01114 if (hmm_frame(&rhmm->hmm) < frame_idx) 01115 continue; 01116 if (hmm_bestscore(&rhmm->hmm) BETTER_THAN lastphn_thresh) { 01117 hmm_frame(&rhmm->hmm) = nf; 01118 01119 /* Could if ((! skip_alt_frm) || (frame_idx & 0x1)) the following */ 01120 if (hmm_out_score(&rhmm->hmm) BETTER_THAN newword_thresh) { 01121 E_DEBUG(4,("Exiting single phone word %s with %d > %d, %d\n", 01122 dict_wordstr(ps_search_dict(ngs),w), 01123 hmm_out_score(&rhmm->hmm), 01124 lastphn_thresh, newword_thresh)); 01125 ngram_search_save_bp(ngs, frame_idx, w, 01126 hmm_out_score(&rhmm->hmm), 01127 hmm_out_history(&rhmm->hmm), 0); 01128 } 01129 } 01130 } 01131 } 01132 01133 static void 01134 prune_channels(ngram_search_t *ngs, int frame_idx) 01135 { 01136 /* Clear last phone candidate list. */ 01137 ngs->n_lastphn_cand = 0; 01138 /* Set the dynamic beam based on maxhmmpf here. */ 01139 ngs->dynamic_beam = ngs->beam; 01140 if (ngs->maxhmmpf != -1 01141 && ngs->st.n_root_chan_eval + ngs->st.n_nonroot_chan_eval > ngs->maxhmmpf) { 01142 /* Build a histogram to approximately prune them. */ 01143 int32 bins[256], bw, nhmms, i; 01144 root_chan_t *rhmm; 01145 chan_t **acl, *hmm; 01146 01147 /* Bins go from zero (best score) to edge of beam. */ 01148 bw = -ngs->beam / 256; 01149 memset(bins, 0, sizeof(bins)); 01150 /* For each active root channel. */ 01151 for (i = 0, rhmm = ngs->root_chan; i < ngs->n_root_chan; i++, rhmm++) { 01152 int32 b; 01153 01154 /* Put it in a bin according to its bestscore. */ 01155 b = (ngs->best_score - hmm_bestscore(&rhmm->hmm)) / bw; 01156 if (b >= 256) 01157 b = 255; 01158 ++bins[b]; 01159 } 01160 /* For each active non-root channel. */ 01161 acl = ngs->active_chan_list[frame_idx & 0x1]; /* currently active HMMs in tree */ 01162 for (i = ngs->n_active_chan[frame_idx & 0x1], hmm = *(acl++); 01163 i > 0; --i, hmm = *(acl++)) { 01164 int32 b; 01165 01166 /* Put it in a bin according to its bestscore. */ 01167 b = (ngs->best_score - hmm_bestscore(&hmm->hmm)) / bw; 01168 if (b >= 256) 01169 b = 255; 01170 ++bins[b]; 01171 } 01172 /* Walk down the bins to find the new beam. */ 01173 for (i = nhmms = 0; i < 256; ++i) { 01174 nhmms += bins[i]; 01175 if (nhmms > ngs->maxhmmpf) 01176 break; 01177 } 01178 ngs->dynamic_beam = -(i * bw); 01179 } 01180 01181 prune_root_chan(ngs, frame_idx); 01182 prune_nonroot_chan(ngs, frame_idx); 01183 last_phone_transition(ngs, frame_idx); 01184 prune_word_chan(ngs, frame_idx); 01185 } 01186 01187 /* 01188 * Limit the number of word exits in each frame to maxwpf. And also limit the number of filler 01189 * words to 1. 01190 */ 01191 static void 01192 bptable_maxwpf(ngram_search_t *ngs, int frame_idx) 01193 { 01194 int32 bp, n; 01195 int32 bestscr, worstscr; 01196 bptbl_t *bpe, *bestbpe, *worstbpe; 01197 01198 /* Don't prune if no pruing. */ 01199 if (ngs->maxwpf == -1 || ngs->maxwpf == ps_search_n_words(ngs)) 01200 return; 01201 01202 /* Allow only one filler word exit (the best) per frame */ 01203 bestscr = (int32) 0x80000000; 01204 bestbpe = NULL; 01205 n = 0; 01206 for (bp = ngs->bp_table_idx[frame_idx]; bp < ngs->bpidx; bp++) { 01207 bpe = &(ngs->bp_table[bp]); 01208 if (dict_filler_word(ps_search_dict(ngs), bpe->wid)) { 01209 if (bpe->score BETTER_THAN bestscr) { 01210 bestscr = bpe->score; 01211 bestbpe = bpe; 01212 } 01213 bpe->valid = FALSE; 01214 n++; /* No. of filler words */ 01215 } 01216 } 01217 /* Restore bestbpe to valid state */ 01218 if (bestbpe != NULL) { 01219 bestbpe->valid = TRUE; 01220 --n; 01221 } 01222 01223 /* Allow up to maxwpf best entries to survive; mark the remaining with valid = 0 */ 01224 n = (ngs->bpidx 01225 - ngs->bp_table_idx[frame_idx]) - n; /* No. of entries after limiting fillers */ 01226 for (; n > ngs->maxwpf; --n) { 01227 /* Find worst BPTable entry */ 01228 worstscr = (int32) 0x7fffffff; 01229 worstbpe = NULL; 01230 for (bp = ngs->bp_table_idx[frame_idx]; (bp < ngs->bpidx); bp++) { 01231 bpe = &(ngs->bp_table[bp]); 01232 if (bpe->valid && (bpe->score WORSE_THAN worstscr)) { 01233 worstscr = bpe->score; 01234 worstbpe = bpe; 01235 } 01236 } 01237 /* FIXME: Don't panic! */ 01238 if (worstbpe == NULL) 01239 E_FATAL("PANIC: No worst BPtable entry remaining\n"); 01240 worstbpe->valid = FALSE; 01241 } 01242 } 01243 01244 static void 01245 word_transition(ngram_search_t *ngs, int frame_idx) 01246 { 01247 int32 i, k, bp, w, nf; 01248 int32 rc; 01249 int32 thresh, newscore; 01250 bptbl_t *bpe; 01251 root_chan_t *rhmm; 01252 struct bestbp_rc_s *bestbp_rc_ptr; 01253 phone_loop_search_t *pls; 01254 dict_t *dict = ps_search_dict(ngs); 01255 dict2pid_t *d2p = ps_search_dict2pid(ngs); 01256 01257 /* 01258 * Transition to start of new word instances (HMM tree roots); but only if words 01259 * other than </s> finished here. 01260 * But, first, find the best starting score for each possible right context phone. 01261 */ 01262 for (i = bin_mdef_n_ciphone(ps_search_acmod(ngs)->mdef) - 1; i >= 0; --i) 01263 ngs->bestbp_rc[i].score = WORST_SCORE; 01264 k = 0; 01265 pls = (phone_loop_search_t *)ps_search_lookahead(ngs); 01266 /* Ugh, this is complicated. Scan all word exits for this frame 01267 * (they have already been created by prune_word_chan()). */ 01268 for (bp = ngs->bp_table_idx[frame_idx]; bp < ngs->bpidx; bp++) { 01269 bpe = &(ngs->bp_table[bp]); 01270 ngs->word_lat_idx[bpe->wid] = NO_BP; 01271 01272 if (bpe->wid == ps_search_finish_wid(ngs)) 01273 continue; 01274 k++; 01275 01276 /* DICT2PID */ 01277 /* Array of HMM scores corresponding to all the possible right 01278 * context expansions of the final phone. It's likely that a 01279 * lot of these are going to be missing, actually. */ 01280 if (bpe->last2_phone == -1) { /* implies s_idx == -1 */ 01281 /* No right context expansion. */ 01282 for (rc = 0; rc < bin_mdef_n_ciphone(ps_search_acmod(ngs)->mdef); ++rc) { 01283 if (bpe->score BETTER_THAN ngs->bestbp_rc[rc].score) { 01284 E_DEBUG(4,("bestbp_rc[0] = %d lc %d\n", 01285 bpe->score, bpe->last_phone)); 01286 ngs->bestbp_rc[rc].score = bpe->score; 01287 ngs->bestbp_rc[rc].path = bp; 01288 ngs->bestbp_rc[rc].lc = bpe->last_phone; 01289 } 01290 } 01291 } 01292 else { 01293 xwdssid_t *rssid = dict2pid_rssid(d2p, bpe->last_phone, bpe->last2_phone); 01294 int32 *rcss = &(ngs->bscore_stack[bpe->s_idx]); 01295 for (rc = 0; rc < bin_mdef_n_ciphone(ps_search_acmod(ngs)->mdef); ++rc) { 01296 if (rcss[rssid->cimap[rc]] BETTER_THAN ngs->bestbp_rc[rc].score) { 01297 E_DEBUG(4,("bestbp_rc[%d] = %d lc %d\n", 01298 rc, rcss[rssid->cimap[rc]], bpe->last_phone)); 01299 ngs->bestbp_rc[rc].score = rcss[rssid->cimap[rc]]; 01300 ngs->bestbp_rc[rc].path = bp; 01301 ngs->bestbp_rc[rc].lc = bpe->last_phone; 01302 } 01303 } 01304 } 01305 } 01306 if (k == 0) 01307 return; 01308 01309 nf = frame_idx + 1; 01310 thresh = ngs->best_score + ngs->dynamic_beam; 01311 /* 01312 * Hypothesize successors to words finished in this frame. 01313 * Main dictionary, multi-phone words transition to HMM-trees roots. 01314 */ 01315 for (i = ngs->n_root_chan, rhmm = ngs->root_chan; i > 0; --i, rhmm++) { 01316 bestbp_rc_ptr = &(ngs->bestbp_rc[rhmm->ciphone]); 01317 01318 newscore = bestbp_rc_ptr->score + ngs->nwpen + ngs->pip 01319 + phone_loop_search_score(pls, rhmm->ciphone); 01320 if (newscore BETTER_THAN thresh) { 01321 if ((hmm_frame(&rhmm->hmm) < frame_idx) 01322 || (newscore BETTER_THAN hmm_in_score(&rhmm->hmm))) { 01323 hmm_enter(&rhmm->hmm, newscore, 01324 bestbp_rc_ptr->path, nf); 01325 /* DICT2PID: Another place where mpx ssids are entered. */ 01326 /* Look up the ssid to use when entering this mpx triphone. */ 01327 hmm_mpx_ssid(&rhmm->hmm, 0) = 01328 dict2pid_ldiph_lc(d2p, rhmm->ciphone, rhmm->ci2phone, bestbp_rc_ptr->lc); 01329 assert(hmm_mpx_ssid(&rhmm->hmm, 0) != BAD_SSID); 01330 } 01331 } 01332 } 01333 01334 /* 01335 * Single phone words; no right context for these. Cannot use bestbp_rc as 01336 * LM scores have to be included. First find best transition to these words. 01337 */ 01338 for (i = 0; i < ngs->n_1ph_LMwords; i++) { 01339 w = ngs->single_phone_wid[i]; 01340 ngs->last_ltrans[w].dscr = (int32) 0x80000000; 01341 } 01342 for (bp = ngs->bp_table_idx[frame_idx]; bp < ngs->bpidx; bp++) { 01343 bpe = &(ngs->bp_table[bp]); 01344 if (!bpe->valid) 01345 continue; 01346 01347 for (i = 0; i < ngs->n_1ph_LMwords; i++) { 01348 int32 n_used; 01349 w = ngs->single_phone_wid[i]; 01350 newscore = ngram_search_exit_score 01351 (ngs, bpe, dict_first_phone(dict, w)); 01352 E_DEBUG(4, ("initial newscore for %s: %d\n", 01353 dict_wordstr(dict, w), newscore)); 01354 if (newscore != WORST_SCORE) 01355 newscore += ngram_tg_score(ngs->lmset, 01356 dict_basewid(dict, w), 01357 bpe->real_wid, 01358 bpe->prev_real_wid, 01359 &n_used)>>SENSCR_SHIFT; 01360 01361 /* FIXME: Not sure how WORST_SCORE could be better, but it 01362 * apparently happens. */ 01363 if (newscore BETTER_THAN ngs->last_ltrans[w].dscr) { 01364 ngs->last_ltrans[w].dscr = newscore; 01365 ngs->last_ltrans[w].bp = bp; 01366 } 01367 } 01368 } 01369 01370 /* Now transition to in-LM single phone words */ 01371 for (i = 0; i < ngs->n_1ph_LMwords; i++) { 01372 w = ngs->single_phone_wid[i]; 01373 /* Never transition into the start word (for one thing, it is 01374 a non-event in the language model.) */ 01375 if (w == dict_startwid(ps_search_dict(ngs))) 01376 continue; 01377 rhmm = (root_chan_t *) ngs->word_chan[w]; 01378 newscore = ngs->last_ltrans[w].dscr + ngs->pip 01379 + phone_loop_search_score(pls, rhmm->ciphone); 01380 if (newscore BETTER_THAN thresh) { 01381 bpe = ngs->bp_table + ngs->last_ltrans[w].bp; 01382 if ((hmm_frame(&rhmm->hmm) < frame_idx) 01383 || (newscore BETTER_THAN hmm_in_score(&rhmm->hmm))) { 01384 hmm_enter(&rhmm->hmm, 01385 newscore, ngs->last_ltrans[w].bp, nf); 01386 /* DICT2PID: another place where mpx ssids are entered. */ 01387 /* Look up the ssid to use when entering this mpx triphone. */ 01388 hmm_mpx_ssid(&rhmm->hmm, 0) = 01389 dict2pid_ldiph_lc(d2p, rhmm->ciphone, rhmm->ci2phone, 01390 dict_last_phone(dict, bpe->wid)); 01391 assert(hmm_mpx_ssid(&rhmm->hmm, 0) != BAD_SSID); 01392 } 01393 } 01394 } 01395 01396 /* Remaining words: <sil>, noise words. No mpx for these! */ 01397 w = ps_search_silence_wid(ngs); 01398 rhmm = (root_chan_t *) ngs->word_chan[w]; 01399 bestbp_rc_ptr = &(ngs->bestbp_rc[ps_search_acmod(ngs)->mdef->sil]); 01400 newscore = bestbp_rc_ptr->score + ngs->silpen + ngs->pip 01401 + phone_loop_search_score(pls, rhmm->ciphone); 01402 if (newscore BETTER_THAN thresh) { 01403 if ((hmm_frame(&rhmm->hmm) < frame_idx) 01404 || (newscore BETTER_THAN hmm_in_score(&rhmm->hmm))) { 01405 hmm_enter(&rhmm->hmm, 01406 newscore, bestbp_rc_ptr->path, nf); 01407 } 01408 } 01409 for (w = dict_filler_start(dict); w <= dict_filler_end(dict); w++) { 01410 if (w == ps_search_silence_wid(ngs)) 01411 continue; 01412 /* Never transition into the start word (for one thing, it is 01413 a non-event in the language model.) */ 01414 if (w == dict_startwid(ps_search_dict(ngs))) 01415 continue; 01416 rhmm = (root_chan_t *) ngs->word_chan[w]; 01417 /* If this was not actually a single-phone word, rhmm will be NULL. */ 01418 if (rhmm == NULL) 01419 continue; 01420 newscore = bestbp_rc_ptr->score + ngs->fillpen + ngs->pip 01421 + phone_loop_search_score(pls, rhmm->ciphone); 01422 if (newscore BETTER_THAN thresh) { 01423 if ((hmm_frame(&rhmm->hmm) < frame_idx) 01424 || (newscore BETTER_THAN hmm_in_score(&rhmm->hmm))) { 01425 hmm_enter(&rhmm->hmm, 01426 newscore, bestbp_rc_ptr->path, nf); 01427 } 01428 } 01429 } 01430 } 01431 01432 static void 01433 deactivate_channels(ngram_search_t *ngs, int frame_idx) 01434 { 01435 root_chan_t *rhmm; 01436 int i; 01437 01438 /* Clear score[] of pruned root channels */ 01439 for (i = ngs->n_root_chan, rhmm = ngs->root_chan; i > 0; --i, rhmm++) { 01440 if (hmm_frame(&rhmm->hmm) == frame_idx) { 01441 hmm_clear(&rhmm->hmm); 01442 } 01443 } 01444 /* Clear score[] of pruned single-phone channels */ 01445 for (i = 0; i < ngs->n_1ph_words; i++) { 01446 int32 w = ngs->single_phone_wid[i]; 01447 rhmm = (root_chan_t *) ngs->word_chan[w]; 01448 if (hmm_frame(&rhmm->hmm) == frame_idx) { 01449 hmm_clear(&rhmm->hmm); 01450 } 01451 } 01452 } 01453 01454 int 01455 ngram_fwdtree_search(ngram_search_t *ngs, int frame_idx) 01456 { 01457 int16 const *senscr; 01458 01459 /* Activate our HMMs for the current frame if need be. */ 01460 if (!ps_search_acmod(ngs)->compallsen) 01461 compute_sen_active(ngs, frame_idx); 01462 01463 /* Compute GMM scores for the current frame. */ 01464 if ((senscr = acmod_score(ps_search_acmod(ngs), &frame_idx)) == NULL) 01465 return 0; 01466 ngs->st.n_senone_active_utt += ps_search_acmod(ngs)->n_senone_active; 01467 01468 /* Mark backpointer table for current frame. */ 01469 ngram_search_mark_bptable(ngs, frame_idx); 01470 01471 /* If the best score is equal to or worse than WORST_SCORE, 01472 * recognition has failed, don't bother to keep trying. */ 01473 if (ngs->best_score == WORST_SCORE || ngs->best_score WORSE_THAN WORST_SCORE) 01474 return 0; 01475 /* Renormalize if necessary */ 01476 if (ngs->best_score + (2 * ngs->beam) WORSE_THAN WORST_SCORE) { 01477 E_INFO("Renormalizing Scores at frame %d, best score %d\n", 01478 frame_idx, ngs->best_score); 01479 renormalize_scores(ngs, frame_idx, ngs->best_score); 01480 } 01481 01482 /* Evaluate HMMs */ 01483 evaluate_channels(ngs, senscr, frame_idx); 01484 /* Prune HMMs and do phone transitions. */ 01485 prune_channels(ngs, frame_idx); 01486 /* Do absolute pruning on word exits. */ 01487 bptable_maxwpf(ngs, frame_idx); 01488 /* Do word transitions. */ 01489 word_transition(ngs, frame_idx); 01490 /* Deactivate pruned HMMs. */ 01491 deactivate_channels(ngs, frame_idx); 01492 01493 ++ngs->n_frame; 01494 /* Return the number of frames processed. */ 01495 return 1; 01496 } 01497 01498 void 01499 ngram_fwdtree_finish(ngram_search_t *ngs) 01500 { 01501 int32 i, w, cf, *awl; 01502 root_chan_t *rhmm; 01503 chan_t *hmm, **acl; 01504 01505 /* This is the number of frames processed. */ 01506 cf = ps_search_acmod(ngs)->output_frame; 01507 /* Add a mark in the backpointer table for one past the final frame. */ 01508 ngram_search_mark_bptable(ngs, cf); 01509 01510 /* Deactivate channels lined up for the next frame */ 01511 /* First, root channels of HMM tree */ 01512 for (i = ngs->n_root_chan, rhmm = ngs->root_chan; i > 0; --i, rhmm++) { 01513 hmm_clear(&rhmm->hmm); 01514 } 01515 01516 /* nonroot channels of HMM tree */ 01517 i = ngs->n_active_chan[cf & 0x1]; 01518 acl = ngs->active_chan_list[cf & 0x1]; 01519 for (hmm = *(acl++); i > 0; --i, hmm = *(acl++)) { 01520 hmm_clear(&hmm->hmm); 01521 } 01522 01523 /* word channels */ 01524 i = ngs->n_active_word[cf & 0x1]; 01525 awl = ngs->active_word_list[cf & 0x1]; 01526 for (w = *(awl++); i > 0; --i, w = *(awl++)) { 01527 /* Don't accidentally free single-phone words! */ 01528 if (dict_is_single_phone(ps_search_dict(ngs), w)) 01529 continue; 01530 bitvec_clear(ngs->word_active, w); 01531 if (ngs->word_chan[w] == NULL) 01532 continue; 01533 ngram_search_free_all_rc(ngs, w); 01534 } 01535 01536 /* 01537 * The previous search code did a postprocessing of the 01538 * backpointer table here, but we will postpone this until it is 01539 * absolutely necessary, i.e. when generating a word graph. 01540 * Likewise we don't actually have to decide what the exit word is 01541 * until somebody requests a backtrace. 01542 */ 01543 01544 ptmr_stop(&ngs->fwdtree_perf); 01545 /* Print out some statistics. */ 01546 if (cf > 0) { 01547 double n_speech = (double)(cf + 1) 01548 / cmd_ln_int32_r(ps_search_config(ngs), "-frate"); 01549 E_INFO("%8d words recognized (%d/fr)\n", 01550 ngs->bpidx, (ngs->bpidx + (cf >> 1)) / (cf + 1)); 01551 E_INFO("%8d senones evaluated (%d/fr)\n", ngs->st.n_senone_active_utt, 01552 (ngs->st.n_senone_active_utt + (cf >> 1)) / (cf + 1)); 01553 E_INFO("%8d channels searched (%d/fr), %d 1st, %d last\n", 01554 ngs->st.n_root_chan_eval + ngs->st.n_nonroot_chan_eval, 01555 (ngs->st.n_root_chan_eval + ngs->st.n_nonroot_chan_eval) / (cf + 1), 01556 ngs->st.n_root_chan_eval, ngs->st.n_last_chan_eval); 01557 E_INFO("%8d words for which last channels evaluated (%d/fr)\n", 01558 ngs->st.n_word_lastchan_eval, 01559 ngs->st.n_word_lastchan_eval / (cf + 1)); 01560 E_INFO("%8d candidate words for entering last phone (%d/fr)\n", 01561 ngs->st.n_lastphn_cand_utt, ngs->st.n_lastphn_cand_utt / (cf + 1)); 01562 E_INFO("fwdtree %.2f CPU %.3f xRT\n", 01563 ngs->fwdtree_perf.t_cpu, 01564 ngs->fwdtree_perf.t_cpu / n_speech); 01565 E_INFO("fwdtree %.2f wall %.3f xRT\n", 01566 ngs->fwdtree_perf.t_elapsed, 01567 ngs->fwdtree_perf.t_elapsed / n_speech); 01568 } 01569 /* dump_bptable(ngs); */ 01570 }