PocketSphinx
0.6
|
00001 /* -*- c-basic-offset: 4; indent-tabs-mode: nil -*- */ 00002 /* ==================================================================== 00003 * Copyright (c) 2008 Carnegie Mellon University. All rights 00004 * reserved. 00005 * 00006 * Redistribution and use in source and binary forms, with or without 00007 * modification, are permitted provided that the following conditions 00008 * are met: 00009 * 00010 * 1. Redistributions of source code must retain the above copyright 00011 * notice, this list of conditions and the following disclaimer. 00012 * 00013 * 2. Redistributions in binary form must reproduce the above copyright 00014 * notice, this list of conditions and the following disclaimer in 00015 * the documentation and/or other materials provided with the 00016 * distribution. 00017 * 00018 * This work was supported in part by funding from the Defense Advanced 00019 * Research Projects Agency and the National Science Foundation of the 00020 * United States of America, and the CMU Sphinx Speech Consortium. 00021 * 00022 * THIS SOFTWARE IS PROVIDED BY CARNEGIE MELLON UNIVERSITY ``AS IS'' AND 00023 * ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, 00024 * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 00025 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY 00026 * NOR ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 00027 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 00028 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 00029 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 00030 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 00031 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 00032 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 00033 * 00034 * ==================================================================== 00035 * 00036 */ 00037 00042 /* System headers. */ 00043 #include <string.h> 00044 #include <assert.h> 00045 00046 /* SphinxBase headers. */ 00047 #include <sphinxbase/ckd_alloc.h> 00048 #include <sphinxbase/listelem_alloc.h> 00049 #include <sphinxbase/err.h> 00050 00051 /* Local headers. */ 00052 #include "ngram_search.h" 00053 #include "ps_lattice_internal.h" 00054 00055 /* Turn this on to dump channels for debugging */ 00056 #define __CHAN_DUMP__ 0 00057 #if __CHAN_DUMP__ 00058 #define chan_v_eval(chan) hmm_dump_vit_eval(&(chan)->hmm, stderr) 00059 #else 00060 #define chan_v_eval(chan) hmm_vit_eval(&(chan)->hmm) 00061 #endif 00062 00063 static void 00064 ngram_fwdflat_expand_all(ngram_search_t *ngs) 00065 { 00066 int n_words, i; 00067 00068 /* For all "real words" (not fillers or <s>/</s>) in the dictionary, 00069 * 00070 * 1) Add the ones which are in the LM to the fwdflat wordlist 00071 * 2) And to the expansion list (since we are expanding all) 00072 */ 00073 ngs->n_expand_words = 0; 00074 n_words = ps_search_n_words(ngs); 00075 bitvec_clear_all(ngs->expand_word_flag, ps_search_n_words(ngs)); 00076 for (i = 0; i < n_words; ++i) { 00077 if (!ngram_model_set_known_wid(ngs->lmset, 00078 dict_basewid(ps_search_dict(ngs),i))) 00079 continue; 00080 ngs->fwdflat_wordlist[ngs->n_expand_words] = i; 00081 ngs->expand_word_list[ngs->n_expand_words] = i; 00082 bitvec_set(ngs->expand_word_flag, i); 00083 ngs->n_expand_words++; 00084 } 00085 E_INFO("Utterance vocabulary contains %d words\n", ngs->n_expand_words); 00086 ngs->expand_word_list[ngs->n_expand_words] = -1; 00087 ngs->fwdflat_wordlist[ngs->n_expand_words] = -1; 00088 } 00089 00090 static void 00091 ngram_fwdflat_allocate_1ph(ngram_search_t *ngs) 00092 { 00093 dict_t *dict = ps_search_dict(ngs); 00094 int n_words = ps_search_n_words(ngs); 00095 int i, w; 00096 00097 /* Allocate single-phone words, since they won't have 00098 * been allocated for us by fwdtree initialization. */ 00099 ngs->n_1ph_words = 0; 00100 for (w = 0; w < n_words; w++) { 00101 if (dict_is_single_phone(dict, w)) 00102 ++ngs->n_1ph_words; 00103 } 00104 ngs->single_phone_wid = ckd_calloc(ngs->n_1ph_words, 00105 sizeof(*ngs->single_phone_wid)); 00106 ngs->rhmm_1ph = ckd_calloc(ngs->n_1ph_words, sizeof(*ngs->rhmm_1ph)); 00107 i = 0; 00108 for (w = 0; w < n_words; w++) { 00109 if (!dict_is_single_phone(dict, w)) 00110 continue; 00111 00112 /* DICT2PID location */ 00113 ngs->rhmm_1ph[i].ciphone = dict_first_phone(dict, w); 00114 ngs->rhmm_1ph[i].ci2phone = bin_mdef_silphone(ps_search_acmod(ngs)->mdef); 00115 hmm_init(ngs->hmmctx, &ngs->rhmm_1ph[i].hmm, TRUE, 00116 /* ssid */ bin_mdef_pid2ssid(ps_search_acmod(ngs)->mdef, 00117 ngs->rhmm_1ph[i].ciphone), 00118 /* tmatid */ bin_mdef_pid2tmatid(ps_search_acmod(ngs)->mdef, 00119 ngs->rhmm_1ph[i].ciphone)); 00120 ngs->rhmm_1ph[i].next = NULL; 00121 ngs->word_chan[w] = (chan_t *) &(ngs->rhmm_1ph[i]); 00122 ngs->single_phone_wid[i] = w; 00123 i++; 00124 } 00125 } 00126 00127 static void 00128 ngram_fwdflat_free_1ph(ngram_search_t *ngs) 00129 { 00130 int i, w; 00131 int n_words = ps_search_n_words(ngs); 00132 00133 for (i = w = 0; w < n_words; ++w) { 00134 if (!dict_is_single_phone(ps_search_dict(ngs), w)) 00135 continue; 00136 hmm_deinit(&ngs->rhmm_1ph[i].hmm); 00137 ++i; 00138 } 00139 ckd_free(ngs->rhmm_1ph); 00140 ngs->rhmm_1ph = NULL; 00141 ckd_free(ngs->single_phone_wid); 00142 } 00143 00144 void 00145 ngram_fwdflat_init(ngram_search_t *ngs) 00146 { 00147 int n_words; 00148 00149 n_words = ps_search_n_words(ngs); 00150 ngs->fwdflat_wordlist = ckd_calloc(n_words + 1, sizeof(*ngs->fwdflat_wordlist)); 00151 ngs->expand_word_flag = bitvec_alloc(n_words); 00152 ngs->expand_word_list = ckd_calloc(n_words + 1, sizeof(*ngs->expand_word_list)); 00153 ngs->frm_wordlist = ckd_calloc(ngs->n_frame_alloc, sizeof(*ngs->frm_wordlist)); 00154 ngs->min_ef_width = cmd_ln_int32_r(ps_search_config(ngs), "-fwdflatefwid"); 00155 ngs->max_sf_win = cmd_ln_int32_r(ps_search_config(ngs), "-fwdflatsfwin"); 00156 E_INFO("fwdflat: min_ef_width = %d, max_sf_win = %d\n", 00157 ngs->min_ef_width, ngs->max_sf_win); 00158 00159 /* No tree-search; pre-build the expansion list, including all LM words. */ 00160 if (!ngs->fwdtree) { 00161 /* Build full expansion list from LM words. */ 00162 ngram_fwdflat_expand_all(ngs); 00163 /* Allocate single phone words. */ 00164 ngram_fwdflat_allocate_1ph(ngs); 00165 } 00166 } 00167 00168 void 00169 ngram_fwdflat_deinit(ngram_search_t *ngs) 00170 { 00171 double n_speech = (double)ngs->n_tot_frame 00172 / cmd_ln_int32_r(ps_search_config(ngs), "-frate"); 00173 00174 E_INFO("TOTAL fwdflat %.2f CPU %.3f xRT\n", 00175 ngs->fwdflat_perf.t_tot_cpu, 00176 ngs->fwdflat_perf.t_tot_cpu / n_speech); 00177 E_INFO("TOTAL fwdflat %.2f wall %.3f xRT\n", 00178 ngs->fwdflat_perf.t_tot_elapsed, 00179 ngs->fwdflat_perf.t_tot_elapsed / n_speech); 00180 00181 /* Free single-phone words if we allocated them. */ 00182 if (!ngs->fwdtree) { 00183 ngram_fwdflat_free_1ph(ngs); 00184 } 00185 ckd_free(ngs->fwdflat_wordlist); 00186 bitvec_free(ngs->expand_word_flag); 00187 ckd_free(ngs->expand_word_list); 00188 ckd_free(ngs->frm_wordlist); 00189 } 00190 00191 int 00192 ngram_fwdflat_reinit(ngram_search_t *ngs) 00193 { 00194 /* Reallocate things that depend on the number of words. */ 00195 int n_words; 00196 00197 ckd_free(ngs->fwdflat_wordlist); 00198 ckd_free(ngs->expand_word_list); 00199 bitvec_free(ngs->expand_word_flag); 00200 n_words = ps_search_n_words(ngs); 00201 ngs->fwdflat_wordlist = ckd_calloc(n_words + 1, sizeof(*ngs->fwdflat_wordlist)); 00202 ngs->expand_word_flag = bitvec_alloc(n_words); 00203 ngs->expand_word_list = ckd_calloc(n_words + 1, sizeof(*ngs->expand_word_list)); 00204 00205 /* No tree-search; take care of the expansion list and single phone words. */ 00206 if (!ngs->fwdtree) { 00207 /* Free single-phone words. */ 00208 ngram_fwdflat_free_1ph(ngs); 00209 /* Reallocate word_chan. */ 00210 ckd_free(ngs->word_chan); 00211 ngs->word_chan = ckd_calloc(dict_size(ps_search_dict(ngs)), 00212 sizeof(*ngs->word_chan)); 00213 /* Rebuild full expansion list from LM words. */ 00214 ngram_fwdflat_expand_all(ngs); 00215 /* Allocate single phone words. */ 00216 ngram_fwdflat_allocate_1ph(ngs); 00217 } 00218 /* Otherwise there is nothing to do since the wordlist is 00219 * generated anew every utterance. */ 00220 return 0; 00221 } 00222 00226 static void 00227 build_fwdflat_wordlist(ngram_search_t *ngs) 00228 { 00229 int32 i, f, sf, ef, wid, nwd; 00230 dict_t *dict; 00231 bptbl_t *bp; 00232 ps_latnode_t *node, *prevnode, *nextnode; 00233 00234 /* No tree-search, use statically allocated wordlist. */ 00235 if (!ngs->fwdtree) 00236 return; 00237 00238 dict = ps_search_dict(ngs); 00239 00240 memset(ngs->frm_wordlist, 0, ngs->n_frame_alloc * sizeof(*ngs->frm_wordlist)); 00241 00242 /* Scan the backpointer table for all active words and record 00243 * their exit frames. */ 00244 for (i = 0, bp = ngs->bp_table; i < ngs->bpidx; i++, bp++) { 00245 sf = (bp->bp < 0) ? 0 : ngs->bp_table[bp->bp].frame + 1; 00246 ef = bp->frame; 00247 wid = bp->wid; 00248 00249 /* Anything that can be transitioned to in the LM can go in 00250 * the word list. */ 00251 if (!ngram_model_set_known_wid(ngs->lmset, 00252 dict_basewid(ps_search_dict(ngs), wid))) 00253 continue; 00254 00255 /* Look for it in the wordlist. */ 00256 for (node = ngs->frm_wordlist[sf]; node && (node->wid != wid); 00257 node = node->next); 00258 00259 /* Update last end frame. */ 00260 if (node) 00261 node->lef = ef; 00262 else { 00263 /* New node; link to head of list */ 00264 node = listelem_malloc(ngs->latnode_alloc); 00265 node->wid = wid; 00266 node->fef = node->lef = ef; 00267 00268 node->next = ngs->frm_wordlist[sf]; 00269 ngs->frm_wordlist[sf] = node; 00270 } 00271 } 00272 00273 /* Eliminate "unlikely" words, for which there are too few end points */ 00274 for (f = 0; f < ngs->n_frame; f++) { 00275 prevnode = NULL; 00276 for (node = ngs->frm_wordlist[f]; node; node = nextnode) { 00277 nextnode = node->next; 00278 /* Word has too few endpoints */ 00279 if ((node->lef - node->fef < ngs->min_ef_width) || 00280 /* Word is </s> and doesn't actually end in last frame */ 00281 ((node->wid == ps_search_finish_wid(ngs)) && (node->lef < ngs->n_frame - 1))) { 00282 if (!prevnode) 00283 ngs->frm_wordlist[f] = nextnode; 00284 else 00285 prevnode->next = nextnode; 00286 listelem_free(ngs->latnode_alloc, node); 00287 } 00288 else 00289 prevnode = node; 00290 } 00291 } 00292 00293 /* Form overall wordlist for 2nd pass */ 00294 nwd = 0; 00295 bitvec_clear_all(ngs->word_active, ps_search_n_words(ngs)); 00296 for (f = 0; f < ngs->n_frame; f++) { 00297 for (node = ngs->frm_wordlist[f]; node; node = node->next) { 00298 if (!bitvec_is_set(ngs->word_active, node->wid)) { 00299 bitvec_set(ngs->word_active, node->wid); 00300 ngs->fwdflat_wordlist[nwd++] = node->wid; 00301 } 00302 } 00303 } 00304 ngs->fwdflat_wordlist[nwd] = -1; 00305 E_INFO("Utterance vocabulary contains %d words\n", nwd); 00306 } 00307 00311 static void 00312 build_fwdflat_chan(ngram_search_t *ngs) 00313 { 00314 int32 i, wid, p; 00315 root_chan_t *rhmm; 00316 chan_t *hmm, *prevhmm; 00317 dict_t *dict; 00318 dict2pid_t *d2p; 00319 00320 dict = ps_search_dict(ngs); 00321 d2p = ps_search_dict2pid(ngs); 00322 00323 /* Build word HMMs for each word in the lattice. */ 00324 for (i = 0; ngs->fwdflat_wordlist[i] >= 0; i++) { 00325 wid = ngs->fwdflat_wordlist[i]; 00326 00327 /* Single-phone words are permanently allocated */ 00328 if (dict_is_single_phone(dict, wid)) 00329 continue; 00330 00331 assert(ngs->word_chan[wid] == NULL); 00332 00333 /* Multiplex root HMM for first phone (one root per word, flat 00334 * lexicon). diphone is irrelevant here, for the time being, 00335 * at least. */ 00336 rhmm = listelem_malloc(ngs->root_chan_alloc); 00337 rhmm->ci2phone = dict_second_phone(dict, wid); 00338 rhmm->ciphone = dict_first_phone(dict, wid); 00339 rhmm->next = NULL; 00340 hmm_init(ngs->hmmctx, &rhmm->hmm, TRUE, 00341 bin_mdef_pid2ssid(ps_search_acmod(ngs)->mdef, rhmm->ciphone), 00342 bin_mdef_pid2tmatid(ps_search_acmod(ngs)->mdef, rhmm->ciphone)); 00343 00344 /* HMMs for word-internal phones */ 00345 prevhmm = NULL; 00346 for (p = 1; p < dict_pronlen(dict, wid) - 1; p++) { 00347 hmm = listelem_malloc(ngs->chan_alloc); 00348 hmm->ciphone = dict_pron(dict, wid, p); 00349 hmm->info.rc_id = (p == dict_pronlen(dict, wid) - 1) ? 0 : -1; 00350 hmm->next = NULL; 00351 hmm_init(ngs->hmmctx, &hmm->hmm, FALSE, 00352 dict2pid_internal(d2p,wid,p), 00353 bin_mdef_pid2tmatid(ps_search_acmod(ngs)->mdef, hmm->ciphone)); 00354 00355 if (prevhmm) 00356 prevhmm->next = hmm; 00357 else 00358 rhmm->next = hmm; 00359 00360 prevhmm = hmm; 00361 } 00362 00363 /* Right-context phones */ 00364 ngram_search_alloc_all_rc(ngs, wid); 00365 00366 /* Link in just allocated right-context phones */ 00367 if (prevhmm) 00368 prevhmm->next = ngs->word_chan[wid]; 00369 else 00370 rhmm->next = ngs->word_chan[wid]; 00371 ngs->word_chan[wid] = (chan_t *) rhmm; 00372 } 00373 00374 } 00375 00376 void 00377 ngram_fwdflat_start(ngram_search_t *ngs) 00378 { 00379 root_chan_t *rhmm; 00380 int i; 00381 00382 ptmr_reset(&ngs->fwdflat_perf); 00383 ptmr_start(&ngs->fwdflat_perf); 00384 build_fwdflat_wordlist(ngs); 00385 build_fwdflat_chan(ngs); 00386 00387 ngs->bpidx = 0; 00388 ngs->bss_head = 0; 00389 00390 for (i = 0; i < ps_search_n_words(ngs); i++) 00391 ngs->word_lat_idx[i] = NO_BP; 00392 00393 /* Reset the permanently allocated single-phone words, since they 00394 * may have junk left over in them from previous searches. */ 00395 for (i = 0; i < ngs->n_1ph_words; i++) { 00396 int32 w = ngs->single_phone_wid[i]; 00397 rhmm = (root_chan_t *) ngs->word_chan[w]; 00398 hmm_clear(&rhmm->hmm); 00399 } 00400 00401 /* Start search with <s>; word_chan[<s>] is permanently allocated */ 00402 rhmm = (root_chan_t *) ngs->word_chan[ps_search_start_wid(ngs)]; 00403 hmm_enter(&rhmm->hmm, 0, NO_BP, 0); 00404 ngs->active_word_list[0][0] = ps_search_start_wid(ngs); 00405 ngs->n_active_word[0] = 1; 00406 00407 ngs->best_score = 0; 00408 ngs->renormalized = FALSE; 00409 00410 for (i = 0; i < ps_search_n_words(ngs); i++) 00411 ngs->last_ltrans[i].sf = -1; 00412 00413 if (!ngs->fwdtree) 00414 ngs->n_frame = 0; 00415 00416 ngs->st.n_fwdflat_chan = 0; 00417 ngs->st.n_fwdflat_words = 0; 00418 ngs->st.n_fwdflat_word_transition = 0; 00419 ngs->st.n_senone_active_utt = 0; 00420 } 00421 00422 static void 00423 compute_fwdflat_sen_active(ngram_search_t *ngs, int frame_idx) 00424 { 00425 int32 i, w; 00426 int32 *awl; 00427 root_chan_t *rhmm; 00428 chan_t *hmm; 00429 00430 acmod_clear_active(ps_search_acmod(ngs)); 00431 00432 i = ngs->n_active_word[frame_idx & 0x1]; 00433 awl = ngs->active_word_list[frame_idx & 0x1]; 00434 00435 for (w = *(awl++); i > 0; --i, w = *(awl++)) { 00436 rhmm = (root_chan_t *)ngs->word_chan[w]; 00437 if (hmm_frame(&rhmm->hmm) == frame_idx) { 00438 acmod_activate_hmm(ps_search_acmod(ngs), &rhmm->hmm); 00439 } 00440 00441 for (hmm = rhmm->next; hmm; hmm = hmm->next) { 00442 if (hmm_frame(&hmm->hmm) == frame_idx) { 00443 acmod_activate_hmm(ps_search_acmod(ngs), &hmm->hmm); 00444 } 00445 } 00446 } 00447 } 00448 00449 static void 00450 fwdflat_eval_chan(ngram_search_t *ngs, int frame_idx) 00451 { 00452 int32 i, w, bestscore; 00453 int32 *awl; 00454 root_chan_t *rhmm; 00455 chan_t *hmm; 00456 00457 i = ngs->n_active_word[frame_idx & 0x1]; 00458 awl = ngs->active_word_list[frame_idx & 0x1]; 00459 bestscore = WORST_SCORE; 00460 00461 ngs->st.n_fwdflat_words += i; 00462 00463 /* Scan all active words. */ 00464 for (w = *(awl++); i > 0; --i, w = *(awl++)) { 00465 rhmm = (root_chan_t *) ngs->word_chan[w]; 00466 if (hmm_frame(&rhmm->hmm) == frame_idx) { 00467 int32 score = chan_v_eval(rhmm); 00468 if ((score BETTER_THAN bestscore) && (w != ps_search_finish_wid(ngs))) 00469 bestscore = score; 00470 ngs->st.n_fwdflat_chan++; 00471 } 00472 00473 for (hmm = rhmm->next; hmm; hmm = hmm->next) { 00474 if (hmm_frame(&hmm->hmm) == frame_idx) { 00475 int32 score = chan_v_eval(hmm); 00476 if (score BETTER_THAN bestscore) 00477 bestscore = score; 00478 ngs->st.n_fwdflat_chan++; 00479 } 00480 } 00481 } 00482 00483 ngs->best_score = bestscore; 00484 } 00485 00486 static void 00487 fwdflat_prune_chan(ngram_search_t *ngs, int frame_idx) 00488 { 00489 int32 i, cf, nf, w, pip, newscore, thresh, wordthresh; 00490 int32 *awl; 00491 root_chan_t *rhmm; 00492 chan_t *hmm, *nexthmm; 00493 00494 cf = frame_idx; 00495 nf = cf + 1; 00496 i = ngs->n_active_word[cf & 0x1]; 00497 awl = ngs->active_word_list[cf & 0x1]; 00498 bitvec_clear_all(ngs->word_active, ps_search_n_words(ngs)); 00499 00500 thresh = ngs->best_score + ngs->fwdflatbeam; 00501 wordthresh = ngs->best_score + ngs->fwdflatwbeam; 00502 pip = ngs->pip; 00503 E_DEBUG(3,("frame %d thresh %d wordthresh %d\n", frame_idx, thresh, wordthresh)); 00504 00505 /* Scan all active words. */ 00506 for (w = *(awl++); i > 0; --i, w = *(awl++)) { 00507 rhmm = (root_chan_t *) ngs->word_chan[w]; 00508 /* Propagate active root channels */ 00509 if (hmm_frame(&rhmm->hmm) == cf 00510 && hmm_bestscore(&rhmm->hmm) BETTER_THAN thresh) { 00511 hmm_frame(&rhmm->hmm) = nf; 00512 bitvec_set(ngs->word_active, w); 00513 00514 /* Transitions out of root channel */ 00515 newscore = hmm_out_score(&rhmm->hmm); 00516 if (rhmm->next) { 00517 assert(!dict_is_single_phone(ps_search_dict(ngs), w)); 00518 00519 newscore += pip; 00520 if (newscore BETTER_THAN thresh) { 00521 hmm = rhmm->next; 00522 /* Enter all right context phones */ 00523 if (hmm->info.rc_id >= 0) { 00524 for (; hmm; hmm = hmm->next) { 00525 if ((hmm_frame(&hmm->hmm) < cf) 00526 || (newscore BETTER_THAN hmm_in_score(&hmm->hmm))) { 00527 hmm_enter(&hmm->hmm, newscore, 00528 hmm_out_history(&rhmm->hmm), nf); 00529 } 00530 } 00531 } 00532 /* Just a normal word internal phone */ 00533 else { 00534 if ((hmm_frame(&hmm->hmm) < cf) 00535 || (newscore BETTER_THAN hmm_in_score(&hmm->hmm))) { 00536 hmm_enter(&hmm->hmm, newscore, 00537 hmm_out_history(&rhmm->hmm), nf); 00538 } 00539 } 00540 } 00541 } 00542 else { 00543 assert(dict_is_single_phone(ps_search_dict(ngs), w)); 00544 00545 /* Word exit for single-phone words (where did their 00546 * whmms come from?) (either from 00547 * ngram_search_fwdtree, or from 00548 * ngram_fwdflat_allocate_1ph(), that's where) */ 00549 if (newscore BETTER_THAN wordthresh) { 00550 ngram_search_save_bp(ngs, cf, w, newscore, 00551 hmm_out_history(&rhmm->hmm), 0); 00552 } 00553 } 00554 } 00555 00556 /* Transitions out of non-root channels. */ 00557 for (hmm = rhmm->next; hmm; hmm = hmm->next) { 00558 if (hmm_frame(&hmm->hmm) >= cf) { 00559 /* Propagate forward HMMs inside the beam. */ 00560 if (hmm_bestscore(&hmm->hmm) BETTER_THAN thresh) { 00561 hmm_frame(&hmm->hmm) = nf; 00562 bitvec_set(ngs->word_active, w); 00563 00564 newscore = hmm_out_score(&hmm->hmm); 00565 /* Word-internal phones */ 00566 if (hmm->info.rc_id < 0) { 00567 newscore += pip; 00568 if (newscore BETTER_THAN thresh) { 00569 nexthmm = hmm->next; 00570 /* Enter all right-context phones. */ 00571 if (nexthmm->info.rc_id >= 0) { 00572 for (; nexthmm; nexthmm = nexthmm->next) { 00573 if ((hmm_frame(&nexthmm->hmm) < cf) 00574 || (newscore BETTER_THAN 00575 hmm_in_score(&nexthmm->hmm))) { 00576 hmm_enter(&nexthmm->hmm, 00577 newscore, 00578 hmm_out_history(&hmm->hmm), 00579 nf); 00580 } 00581 } 00582 } 00583 /* Enter single word-internal phone. */ 00584 else { 00585 if ((hmm_frame(&nexthmm->hmm) < cf) 00586 || (newscore BETTER_THAN 00587 hmm_in_score(&nexthmm->hmm))) { 00588 hmm_enter(&nexthmm->hmm, newscore, 00589 hmm_out_history(&hmm->hmm), nf); 00590 } 00591 } 00592 } 00593 } 00594 /* Right-context phones - apply word beam and exit. */ 00595 else { 00596 if (newscore BETTER_THAN wordthresh) { 00597 ngram_search_save_bp(ngs, cf, w, newscore, 00598 hmm_out_history(&hmm->hmm), 00599 hmm->info.rc_id); 00600 } 00601 } 00602 } 00603 /* Zero out inactive HMMs. */ 00604 else if (hmm_frame(&hmm->hmm) != nf) { 00605 hmm_clear_scores(&hmm->hmm); 00606 } 00607 } 00608 } 00609 } 00610 } 00611 00612 static void 00613 get_expand_wordlist(ngram_search_t *ngs, int32 frm, int32 win) 00614 { 00615 int32 f, sf, ef; 00616 ps_latnode_t *node; 00617 00618 if (!ngs->fwdtree) { 00619 ngs->st.n_fwdflat_word_transition += ngs->n_expand_words; 00620 return; 00621 } 00622 00623 sf = frm - win; 00624 if (sf < 0) 00625 sf = 0; 00626 ef = frm + win; 00627 if (ef > ngs->n_frame) 00628 ef = ngs->n_frame; 00629 00630 bitvec_clear_all(ngs->expand_word_flag, ps_search_n_words(ngs)); 00631 ngs->n_expand_words = 0; 00632 00633 for (f = sf; f < ef; f++) { 00634 for (node = ngs->frm_wordlist[f]; node; node = node->next) { 00635 if (!bitvec_is_set(ngs->expand_word_flag, node->wid)) { 00636 ngs->expand_word_list[ngs->n_expand_words++] = node->wid; 00637 bitvec_set(ngs->expand_word_flag, node->wid); 00638 } 00639 } 00640 } 00641 ngs->expand_word_list[ngs->n_expand_words] = -1; 00642 ngs->st.n_fwdflat_word_transition += ngs->n_expand_words; 00643 } 00644 00645 static void 00646 fwdflat_word_transition(ngram_search_t *ngs, int frame_idx) 00647 { 00648 int32 cf, nf, b, thresh, pip, i, w, newscore; 00649 int32 best_silrc_score = 0, best_silrc_bp = 0; /* FIXME: good defaults? */ 00650 bptbl_t *bp; 00651 int32 *rcss; 00652 root_chan_t *rhmm; 00653 int32 *awl; 00654 float32 lwf; 00655 dict_t *dict = ps_search_dict(ngs); 00656 dict2pid_t *d2p = ps_search_dict2pid(ngs); 00657 00658 cf = frame_idx; 00659 nf = cf + 1; 00660 thresh = ngs->best_score + ngs->fwdflatbeam; 00661 pip = ngs->pip; 00662 best_silrc_score = WORST_SCORE; 00663 lwf = ngs->fwdflat_fwdtree_lw_ratio; 00664 00665 /* Search for all words starting within a window of this frame. 00666 * These are the successors for words exiting now. */ 00667 get_expand_wordlist(ngs, cf, ngs->max_sf_win); 00668 00669 /* Scan words exited in current frame */ 00670 for (b = ngs->bp_table_idx[cf]; b < ngs->bpidx; b++) { 00671 xwdssid_t *rssid; 00672 int32 silscore; 00673 00674 bp = ngs->bp_table + b; 00675 ngs->word_lat_idx[bp->wid] = NO_BP; 00676 00677 if (bp->wid == ps_search_finish_wid(ngs)) 00678 continue; 00679 00680 /* DICT2PID location */ 00681 /* Get the mapping from right context phone ID to index in the 00682 * right context table and the bscore_stack. */ 00683 rcss = ngs->bscore_stack + bp->s_idx; 00684 if (bp->last2_phone == -1) 00685 rssid = NULL; 00686 else 00687 rssid = dict2pid_rssid(d2p, bp->last_phone, bp->last2_phone); 00688 00689 /* Transition to all successor words. */ 00690 for (i = 0; ngs->expand_word_list[i] >= 0; i++) { 00691 int32 n_used; 00692 00693 w = ngs->expand_word_list[i]; 00694 00695 /* Get the exit score we recorded in save_bwd_ptr(), or 00696 * something approximating it. */ 00697 if (rssid) 00698 newscore = rcss[rssid->cimap[dict_first_phone(dict, w)]]; 00699 else 00700 newscore = bp->score; 00701 if (newscore == WORST_SCORE) 00702 continue; 00703 /* FIXME: Floating point... */ 00704 newscore += lwf 00705 * (ngram_tg_score(ngs->lmset, 00706 dict_basewid(dict, w), 00707 bp->real_wid, 00708 bp->prev_real_wid, 00709 &n_used) >> SENSCR_SHIFT); 00710 newscore += pip; 00711 00712 /* Enter the next word */ 00713 if (newscore BETTER_THAN thresh) { 00714 rhmm = (root_chan_t *) ngs->word_chan[w]; 00715 if ((hmm_frame(&rhmm->hmm) < cf) 00716 || (newscore BETTER_THAN hmm_in_score(&rhmm->hmm))) { 00717 hmm_enter(&rhmm->hmm, newscore, b, nf); 00718 /* DICT2PID: This is where mpx ssids get introduced. */ 00719 /* Look up the ssid to use when entering this mpx triphone. */ 00720 hmm_mpx_ssid(&rhmm->hmm, 0) = 00721 dict2pid_ldiph_lc(d2p, rhmm->ciphone, rhmm->ci2phone, 00722 dict_last_phone(dict, bp->wid)); 00723 assert(IS_S3SSID(hmm_mpx_ssid(&rhmm->hmm, 0))); 00724 E_DEBUG(6,("ssid %d(%d,%d) = %d\n", 00725 rhmm->ciphone, dict_last_phone(dict, bp->wid), rhmm->ci2phone, 00726 hmm_mpx_ssid(&rhmm->hmm, 0))); 00727 bitvec_set(ngs->word_active, w); 00728 } 00729 } 00730 } 00731 00732 /* Get the best exit into silence. */ 00733 if (rssid) 00734 silscore = rcss[rssid->cimap[ps_search_acmod(ngs)->mdef->sil]]; 00735 else 00736 silscore = bp->score; 00737 if (silscore BETTER_THAN best_silrc_score) { 00738 best_silrc_score = silscore; 00739 best_silrc_bp = b; 00740 } 00741 } 00742 00743 /* Transition to <sil> */ 00744 newscore = best_silrc_score + ngs->silpen + pip; 00745 if ((newscore BETTER_THAN thresh) && (newscore BETTER_THAN WORST_SCORE)) { 00746 w = ps_search_silence_wid(ngs); 00747 rhmm = (root_chan_t *) ngs->word_chan[w]; 00748 if ((hmm_frame(&rhmm->hmm) < cf) 00749 || (newscore BETTER_THAN hmm_in_score(&rhmm->hmm))) { 00750 hmm_enter(&rhmm->hmm, newscore, 00751 best_silrc_bp, nf); 00752 bitvec_set(ngs->word_active, w); 00753 } 00754 } 00755 /* Transition to noise words */ 00756 newscore = best_silrc_score + ngs->fillpen + pip; 00757 if ((newscore BETTER_THAN thresh) && (newscore BETTER_THAN WORST_SCORE)) { 00758 for (w = ps_search_silence_wid(ngs) + 1; w < ps_search_n_words(ngs); w++) { 00759 rhmm = (root_chan_t *) ngs->word_chan[w]; 00760 /* Noise words that aren't a single phone will have NULL here. */ 00761 if (rhmm == NULL) 00762 continue; 00763 if ((hmm_frame(&rhmm->hmm) < cf) 00764 || (newscore BETTER_THAN hmm_in_score(&rhmm->hmm))) { 00765 hmm_enter(&rhmm->hmm, newscore, 00766 best_silrc_bp, nf); 00767 bitvec_set(ngs->word_active, w); 00768 } 00769 } 00770 } 00771 00772 /* Reset initial channels of words that have become inactive even after word trans. */ 00773 i = ngs->n_active_word[cf & 0x1]; 00774 awl = ngs->active_word_list[cf & 0x1]; 00775 for (w = *(awl++); i > 0; --i, w = *(awl++)) { 00776 rhmm = (root_chan_t *) ngs->word_chan[w]; 00777 if (hmm_frame(&rhmm->hmm) == cf) { 00778 hmm_clear_scores(&rhmm->hmm); 00779 } 00780 } 00781 } 00782 00783 static void 00784 fwdflat_renormalize_scores(ngram_search_t *ngs, int frame_idx, int32 norm) 00785 { 00786 root_chan_t *rhmm; 00787 chan_t *hmm; 00788 int32 i, cf, w, *awl; 00789 00790 cf = frame_idx; 00791 00792 /* Renormalize individual word channels */ 00793 i = ngs->n_active_word[cf & 0x1]; 00794 awl = ngs->active_word_list[cf & 0x1]; 00795 for (w = *(awl++); i > 0; --i, w = *(awl++)) { 00796 rhmm = (root_chan_t *) ngs->word_chan[w]; 00797 if (hmm_frame(&rhmm->hmm) == cf) { 00798 hmm_normalize(&rhmm->hmm, norm); 00799 } 00800 for (hmm = rhmm->next; hmm; hmm = hmm->next) { 00801 if (hmm_frame(&hmm->hmm) == cf) { 00802 hmm_normalize(&hmm->hmm, norm); 00803 } 00804 } 00805 } 00806 00807 ngs->renormalized = TRUE; 00808 } 00809 00810 int 00811 ngram_fwdflat_search(ngram_search_t *ngs, int frame_idx) 00812 { 00813 int16 const *senscr; 00814 int32 nf, i, j; 00815 int32 *nawl; 00816 00817 /* Activate our HMMs for the current frame if need be. */ 00818 if (!ps_search_acmod(ngs)->compallsen) 00819 compute_fwdflat_sen_active(ngs, frame_idx); 00820 00821 /* Compute GMM scores for the current frame. */ 00822 senscr = acmod_score(ps_search_acmod(ngs), &frame_idx); 00823 ngs->st.n_senone_active_utt += ps_search_acmod(ngs)->n_senone_active; 00824 00825 /* Mark backpointer table for current frame. */ 00826 ngram_search_mark_bptable(ngs, frame_idx); 00827 00828 /* If the best score is equal to or worse than WORST_SCORE, 00829 * recognition has failed, don't bother to keep trying. */ 00830 if (ngs->best_score == WORST_SCORE || ngs->best_score WORSE_THAN WORST_SCORE) 00831 return 0; 00832 /* Renormalize if necessary */ 00833 if (ngs->best_score + (2 * ngs->beam) WORSE_THAN WORST_SCORE) { 00834 E_INFO("Renormalizing Scores at frame %d, best score %d\n", 00835 frame_idx, ngs->best_score); 00836 fwdflat_renormalize_scores(ngs, frame_idx, ngs->best_score); 00837 } 00838 00839 ngs->best_score = WORST_SCORE; 00840 hmm_context_set_senscore(ngs->hmmctx, senscr); 00841 00842 /* Evaluate HMMs */ 00843 fwdflat_eval_chan(ngs, frame_idx); 00844 /* Prune HMMs and do phone transitions. */ 00845 fwdflat_prune_chan(ngs, frame_idx); 00846 /* Do word transitions. */ 00847 fwdflat_word_transition(ngs, frame_idx); 00848 00849 /* Create next active word list */ 00850 nf = frame_idx + 1; 00851 nawl = ngs->active_word_list[nf & 0x1]; 00852 for (i = 0, j = 0; ngs->fwdflat_wordlist[i] >= 0; i++) { 00853 if (bitvec_is_set(ngs->word_active, ngs->fwdflat_wordlist[i])) { 00854 *(nawl++) = ngs->fwdflat_wordlist[i]; 00855 j++; 00856 } 00857 } 00858 for (i = ps_search_start_wid(ngs); i < ps_search_n_words(ngs); i++) { 00859 if (bitvec_is_set(ngs->word_active, i)) { 00860 *(nawl++) = i; 00861 j++; 00862 } 00863 } 00864 if (!ngs->fwdtree) 00865 ++ngs->n_frame; 00866 ngs->n_active_word[nf & 0x1] = j; 00867 00868 /* Return the number of frames processed. */ 00869 return 1; 00870 } 00871 00875 static void 00876 destroy_fwdflat_wordlist(ngram_search_t *ngs) 00877 { 00878 ps_latnode_t *node, *tnode; 00879 int32 f; 00880 00881 if (!ngs->fwdtree) 00882 return; 00883 00884 for (f = 0; f < ngs->n_frame; f++) { 00885 for (node = ngs->frm_wordlist[f]; node; node = tnode) { 00886 tnode = node->next; 00887 listelem_free(ngs->latnode_alloc, node); 00888 } 00889 } 00890 } 00891 00895 static void 00896 destroy_fwdflat_chan(ngram_search_t *ngs) 00897 { 00898 int32 i, wid; 00899 00900 for (i = 0; ngs->fwdflat_wordlist[i] >= 0; i++) { 00901 root_chan_t *rhmm; 00902 chan_t *thmm; 00903 wid = ngs->fwdflat_wordlist[i]; 00904 if (dict_is_single_phone(ps_search_dict(ngs),wid)) 00905 continue; 00906 assert(ngs->word_chan[wid] != NULL); 00907 00908 /* The first HMM in ngs->word_chan[wid] was allocated with 00909 * ngs->root_chan_alloc, but this will attempt to free it 00910 * using ngs->chan_alloc, which will not work. Therefore we 00911 * free it manually and move the list forward before handing 00912 * it off. */ 00913 rhmm = (root_chan_t *)ngs->word_chan[wid]; 00914 thmm = rhmm->next; 00915 listelem_free(ngs->root_chan_alloc, rhmm); 00916 ngs->word_chan[wid] = thmm; 00917 ngram_search_free_all_rc(ngs, wid); 00918 } 00919 } 00920 00921 void 00922 ngram_fwdflat_finish(ngram_search_t *ngs) 00923 { 00924 int32 cf; 00925 00926 destroy_fwdflat_chan(ngs); 00927 destroy_fwdflat_wordlist(ngs); 00928 bitvec_clear_all(ngs->word_active, ps_search_n_words(ngs)); 00929 00930 /* This is the number of frames processed. */ 00931 cf = ps_search_acmod(ngs)->output_frame; 00932 /* Add a mark in the backpointer table for one past the final frame. */ 00933 ngram_search_mark_bptable(ngs, cf); 00934 00935 ptmr_stop(&ngs->fwdflat_perf); 00936 /* Print out some statistics. */ 00937 if (cf > 0) { 00938 double n_speech = (double)(cf + 1) 00939 / cmd_ln_int32_r(ps_search_config(ngs), "-frate"); 00940 E_INFO("%8d words recognized (%d/fr)\n", 00941 ngs->bpidx, (ngs->bpidx + (cf >> 1)) / (cf + 1)); 00942 E_INFO("%8d senones evaluated (%d/fr)\n", ngs->st.n_senone_active_utt, 00943 (ngs->st.n_senone_active_utt + (cf >> 1)) / (cf + 1)); 00944 E_INFO("%8d channels searched (%d/fr)\n", 00945 ngs->st.n_fwdflat_chan, ngs->st.n_fwdflat_chan / (cf + 1)); 00946 E_INFO("%8d words searched (%d/fr)\n", 00947 ngs->st.n_fwdflat_words, ngs->st.n_fwdflat_words / (cf + 1)); 00948 E_INFO("%8d word transitions (%d/fr)\n", 00949 ngs->st.n_fwdflat_word_transition, 00950 ngs->st.n_fwdflat_word_transition / (cf + 1)); 00951 E_INFO("fwdflat %.2f CPU %.3f xRT\n", 00952 ngs->fwdflat_perf.t_cpu, 00953 ngs->fwdflat_perf.t_cpu / n_speech); 00954 E_INFO("fwdflat %.2f wall %.3f xRT\n", 00955 ngs->fwdflat_perf.t_elapsed, 00956 ngs->fwdflat_perf.t_elapsed / n_speech); 00957 } 00958 }