PocketSphinx
0.6
|
00001 /* -*- c-basic-offset: 4; indent-tabs-mode: nil -*- */ 00002 /* ==================================================================== 00003 * Copyright (c) 2008 Carnegie Mellon University. All rights 00004 * reserved. 00005 * 00006 * Redistribution and use in source and binary forms, with or without 00007 * modification, are permitted provided that the following conditions 00008 * are met: 00009 * 00010 * 1. Redistributions of source code must retain the above copyright 00011 * notice, this list of conditions and the following disclaimer. 00012 * 00013 * 2. Redistributions in binary form must reproduce the above copyright 00014 * notice, this list of conditions and the following disclaimer in 00015 * the documentation and/or other materials provided with the 00016 * distribution. 00017 * 00018 * This work was supported in part by funding from the Defense Advanced 00019 * Research Projects Agency and the National Science Foundation of the 00020 * United States of America, and the CMU Sphinx Speech Consortium. 00021 * 00022 * THIS SOFTWARE IS PROVIDED BY CARNEGIE MELLON UNIVERSITY ``AS IS'' AND 00023 * ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, 00024 * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 00025 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY 00026 * NOR ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 00027 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 00028 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 00029 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 00030 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 00031 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 00032 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 00033 * 00034 * ==================================================================== 00035 * 00036 */ 00037 00042 #ifndef __NGRAM_SEARCH_H__ 00043 #define __NGRAM_SEARCH_H__ 00044 00045 /* SphinxBase headers. */ 00046 #include <sphinxbase/cmd_ln.h> 00047 #include <sphinxbase/logmath.h> 00048 #include <sphinxbase/ngram_model.h> 00049 #include <sphinxbase/listelem_alloc.h> 00050 #include <sphinxbase/err.h> 00051 00052 /* Local headers. */ 00053 #include "pocketsphinx_internal.h" 00054 #include "hmm.h" 00055 00064 typedef struct chan_s { 00065 hmm_t hmm; 00068 struct chan_s *next; 00071 struct chan_s *alt; 00073 int32 ciphone; 00074 union { 00075 int32 penult_phn_wid; 00079 int32 rc_id; 00080 } info; 00081 } chan_t; 00082 00090 typedef struct root_chan_s { 00091 hmm_t hmm; 00094 chan_t *next; 00096 int32 penult_phn_wid; 00097 int32 this_phn_wid; 00100 int16 ciphone; 00102 int16 ci2phone; 00104 } root_chan_t; 00105 00109 typedef struct bptbl_s { 00110 int16 frame; 00111 uint8 valid; 00112 uint8 refcnt; 00113 int32 wid; 00114 int32 bp; 00115 int32 score; 00116 int32 s_idx; 00117 int32 real_wid; 00118 int32 prev_real_wid; 00119 int16 last_phone; 00120 int16 last2_phone; 00121 } bptbl_t; 00122 00126 typedef struct bptbl_seg_s { 00127 ps_seg_t base; 00128 int32 *bpidx; 00129 int16 n_bpidx; 00130 int16 cur; 00131 } bptbl_seg_t; 00132 00133 /* 00134 * Candidates words for entering their last phones. Cleared and rebuilt in each 00135 * frame. 00136 * NOTE: candidates can only be multi-phone, real dictionary words. 00137 */ 00138 typedef struct lastphn_cand_s { 00139 int32 wid; 00140 int32 score; 00141 int32 bp; 00142 int32 next; /* next candidate starting at the same frame */ 00143 } lastphn_cand_t; 00144 00145 /* 00146 * Since the same instance of a word (i.e., <word,start-frame>) reaches its last 00147 * phone several times, we can compute its best BP and LM transition score info 00148 * just the first time and cache it for future occurrences. Structure for such 00149 * a cache. 00150 */ 00151 typedef struct { 00152 int32 sf; /* Start frame */ 00153 int32 dscr; /* Delta-score upon entering last phone */ 00154 int32 bp; /* Best BP */ 00155 } last_ltrans_t; 00156 00157 #define CAND_SF_ALLOCSIZE 32 00158 typedef struct { 00159 int32 bp_ef; 00160 int32 cand; 00161 } cand_sf_t; 00162 00163 /* 00164 * Structure for reorganizing the BP table entries in the current frame according 00165 * to distinct right context ci-phones. Each entry contains the best BP entry for 00166 * a given right context. Each successor word will pick up the correct entry based 00167 * on its first ci-phone. 00168 */ 00169 typedef struct bestbp_rc_s { 00170 int32 score; 00171 int32 path; /* BP table index corresponding to this entry */ 00172 int32 lc; /* right most ci-phone of above BP entry word */ 00173 } bestbp_rc_t; 00174 00175 #define NO_BP -1 00176 00180 typedef struct ngram_search_stats_s { 00181 int32 n_phone_eval; 00182 int32 n_root_chan_eval; 00183 int32 n_nonroot_chan_eval; 00184 int32 n_last_chan_eval; 00185 int32 n_word_lastchan_eval; 00186 int32 n_lastphn_cand_utt; 00187 int32 n_fwdflat_chan; 00188 int32 n_fwdflat_words; 00189 int32 n_fwdflat_word_transition; 00190 int32 n_senone_active_utt; 00191 } ngram_search_stats_t; 00192 00193 00197 struct ngram_search_s { 00198 ps_search_t base; 00199 ngram_model_t *lmset; 00200 hmm_context_t *hmmctx; 00202 /* Flags to quickly indicate which passes are enabled. */ 00203 uint8 fwdtree; 00204 uint8 fwdflat; 00205 uint8 bestpath; 00206 00207 /* State of procesing. */ 00208 uint8 done; 00209 00210 /* Allocators */ 00211 listelem_alloc_t *chan_alloc; 00212 listelem_alloc_t *root_chan_alloc; 00213 listelem_alloc_t *latnode_alloc; 00231 root_chan_t *root_chan; 00232 int32 n_root_chan_alloc; 00233 int32 n_root_chan; 00234 int32 n_nonroot_chan; 00235 int32 max_nonroot_chan; 00236 root_chan_t *rhmm_1ph; 00246 chan_t **word_chan; 00247 bitvec_t *word_active; 00263 int32 *homophone_set; 00264 int32 *single_phone_wid; 00265 int32 n_1ph_words; 00266 int32 n_1ph_LMwords; 00275 chan_t ***active_chan_list; 00276 int32 n_active_chan[2]; 00287 int32 **active_word_list; 00288 int32 n_active_word[2]; 00290 /* 00291 * FIXME: Document all of these bits. 00292 */ 00293 lastphn_cand_t *lastphn_cand; 00294 int32 n_lastphn_cand; 00295 last_ltrans_t *last_ltrans; /* one per word */ 00296 int32 cand_sf_alloc; 00297 cand_sf_t *cand_sf; 00298 bestbp_rc_t *bestbp_rc; 00299 00300 bptbl_t *bp_table; /* Forward pass lattice */ 00301 int32 bpidx; /* First free BPTable entry */ 00302 int32 bp_table_size; 00303 int32 *bscore_stack; /* Score stack for all possible right contexts */ 00304 int32 bss_head; /* First free BScoreStack entry */ 00305 int32 bscore_stack_size; 00306 00307 int32 n_frame_alloc; 00308 int32 n_frame; 00309 int32 *bp_table_idx; /* First BPTable entry for each frame */ 00310 int32 *word_lat_idx; /* BPTable index for any word in current frame; 00311 cleared before each frame */ 00312 00313 /* 00314 * Flat lexicon (2nd pass) search stuff. 00315 */ 00316 ps_latnode_t **frm_wordlist; 00317 int32 *fwdflat_wordlist; 00318 bitvec_t *expand_word_flag; 00319 int32 *expand_word_list; 00320 int32 n_expand_words; 00321 int32 min_ef_width; 00322 int32 max_sf_win; 00323 float32 fwdflat_fwdtree_lw_ratio; 00324 00325 int32 best_score; 00326 int32 last_phone_best_score; 00327 int32 renormalized; 00328 00329 /* 00330 * DAG (3rd pass) search stuff. 00331 */ 00332 float32 bestpath_fwdtree_lw_ratio; 00333 float32 ascale; 00335 ngram_search_stats_t st; 00336 ptmr_t fwdtree_perf; 00337 ptmr_t fwdflat_perf; 00338 ptmr_t bestpath_perf; 00339 int32 n_tot_frame; 00340 00341 /* A collection of beam widths. */ 00342 int32 beam; 00343 int32 dynamic_beam; 00344 int32 pbeam; 00345 int32 wbeam; 00346 int32 lpbeam; 00347 int32 lponlybeam; 00348 int32 fwdflatbeam; 00349 int32 fwdflatwbeam; 00350 int32 fillpen; 00351 int32 silpen; 00352 int32 wip; 00353 int32 nwpen; 00354 int32 pip; 00355 int32 maxwpf; 00356 int32 maxhmmpf; 00357 }; 00358 typedef struct ngram_search_s ngram_search_t; 00359 00363 ps_search_t *ngram_search_init(cmd_ln_t *config, 00364 acmod_t *acmod, 00365 dict_t *dict, 00366 dict2pid_t *d2p); 00367 00371 void ngram_search_free(ps_search_t *ngs); 00372 00378 int ngram_search_mark_bptable(ngram_search_t *ngs, int frame_idx); 00379 00383 void ngram_search_save_bp(ngram_search_t *ngs, int frame_idx, int32 w, 00384 int32 score, int32 path, int32 rc); 00385 00389 void ngram_search_alloc_all_rc(ngram_search_t *ngs, int32 w); 00390 00394 void ngram_search_free_all_rc(ngram_search_t *ngs, int32 w); 00395 00401 int ngram_search_find_exit(ngram_search_t *ngs, int frame_idx, int32 *out_best_score); 00402 00408 char const *ngram_search_bp_hyp(ngram_search_t *ngs, int bpidx); 00409 00413 void ngram_compute_seg_scores(ngram_search_t *ngs, float32 lwf); 00414 00418 ps_lattice_t *ngram_search_lattice(ps_search_t *search); 00419 00423 int32 ngram_search_exit_score(ngram_search_t *ngs, bptbl_t *pbe, int rcphone); 00424 00425 #endif /* __NGRAM_SEARCH_H__ */