PocketSphinx  0.6
src/libpocketsphinx/ngram_search.h
Go to the documentation of this file.
00001 /* -*- c-basic-offset: 4; indent-tabs-mode: nil -*- */
00002 /* ====================================================================
00003  * Copyright (c) 2008 Carnegie Mellon University.  All rights
00004  * reserved.
00005  *
00006  * Redistribution and use in source and binary forms, with or without
00007  * modification, are permitted provided that the following conditions
00008  * are met:
00009  *
00010  * 1. Redistributions of source code must retain the above copyright
00011  *    notice, this list of conditions and the following disclaimer. 
00012  *
00013  * 2. Redistributions in binary form must reproduce the above copyright
00014  *    notice, this list of conditions and the following disclaimer in
00015  *    the documentation and/or other materials provided with the
00016  *    distribution.
00017  *
00018  * This work was supported in part by funding from the Defense Advanced 
00019  * Research Projects Agency and the National Science Foundation of the 
00020  * United States of America, and the CMU Sphinx Speech Consortium.
00021  *
00022  * THIS SOFTWARE IS PROVIDED BY CARNEGIE MELLON UNIVERSITY ``AS IS'' AND 
00023  * ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, 
00024  * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
00025  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY
00026  * NOR ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
00027  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 
00028  * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 
00029  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 
00030  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 
00031  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 
00032  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
00033  *
00034  * ====================================================================
00035  *
00036  */
00037 
00042 #ifndef __NGRAM_SEARCH_H__
00043 #define __NGRAM_SEARCH_H__
00044 
00045 /* SphinxBase headers. */
00046 #include <sphinxbase/cmd_ln.h>
00047 #include <sphinxbase/logmath.h>
00048 #include <sphinxbase/ngram_model.h>
00049 #include <sphinxbase/listelem_alloc.h>
00050 #include <sphinxbase/err.h>
00051 
00052 /* Local headers. */
00053 #include "pocketsphinx_internal.h"
00054 #include "hmm.h"
00055 
00064 typedef struct chan_s {
00065     hmm_t hmm;                  
00068     struct chan_s *next;        
00071     struct chan_s *alt;         
00073     int32    ciphone;           
00074     union {
00075         int32 penult_phn_wid;   
00079         int32 rc_id;            
00080     } info;
00081 } chan_t;
00082 
00090 typedef struct root_chan_s {
00091     hmm_t hmm;                  
00094     chan_t *next;               
00096     int32  penult_phn_wid;
00097     int32  this_phn_wid;        
00100     int16    ciphone;           
00102     int16    ci2phone;          
00104 } root_chan_t;
00105 
00109 typedef struct bptbl_s {
00110     int16    frame;             
00111     uint8    valid;             
00112     uint8    refcnt;            
00113     int32    wid;               
00114     int32    bp;                
00115     int32    score;             
00116     int32    s_idx;             
00117     int32    real_wid;          
00118     int32    prev_real_wid;     
00119     int16    last_phone;        
00120     int16    last2_phone;       
00121 } bptbl_t;
00122 
00126 typedef struct bptbl_seg_s {
00127     ps_seg_t base;  
00128     int32 *bpidx;   
00129     int16 n_bpidx;  
00130     int16 cur;      
00131 } bptbl_seg_t;
00132 
00133 /*
00134  * Candidates words for entering their last phones.  Cleared and rebuilt in each
00135  * frame.
00136  * NOTE: candidates can only be multi-phone, real dictionary words.
00137  */
00138 typedef struct lastphn_cand_s {
00139     int32 wid;
00140     int32 score;
00141     int32 bp;
00142     int32 next;                 /* next candidate starting at the same frame */
00143 } lastphn_cand_t;
00144 
00145 /*
00146  * Since the same instance of a word (i.e., <word,start-frame>) reaches its last
00147  * phone several times, we can compute its best BP and LM transition score info
00148  * just the first time and cache it for future occurrences.  Structure for such
00149  * a cache.
00150  */
00151 typedef struct {
00152     int32 sf;                   /* Start frame */
00153     int32 dscr;                 /* Delta-score upon entering last phone */
00154     int32 bp;                   /* Best BP */
00155 } last_ltrans_t;
00156 
00157 #define CAND_SF_ALLOCSIZE       32
00158 typedef struct {
00159     int32 bp_ef;
00160     int32 cand;
00161 } cand_sf_t;
00162 
00163 /*
00164  * Structure for reorganizing the BP table entries in the current frame according
00165  * to distinct right context ci-phones.  Each entry contains the best BP entry for
00166  * a given right context.  Each successor word will pick up the correct entry based
00167  * on its first ci-phone.
00168  */
00169 typedef struct bestbp_rc_s {
00170     int32 score;
00171     int32 path;                 /* BP table index corresponding to this entry */
00172     int32 lc;                   /* right most ci-phone of above BP entry word */
00173 } bestbp_rc_t;
00174 
00175 #define NO_BP           -1
00176 
00180 typedef struct ngram_search_stats_s {
00181     int32 n_phone_eval;
00182     int32 n_root_chan_eval;
00183     int32 n_nonroot_chan_eval;
00184     int32 n_last_chan_eval;
00185     int32 n_word_lastchan_eval;
00186     int32 n_lastphn_cand_utt;
00187     int32 n_fwdflat_chan;
00188     int32 n_fwdflat_words;
00189     int32 n_fwdflat_word_transition;
00190     int32 n_senone_active_utt;
00191 } ngram_search_stats_t;
00192 
00193 
00197 struct ngram_search_s {
00198     ps_search_t base;
00199     ngram_model_t *lmset;  
00200     hmm_context_t *hmmctx; 
00202     /* Flags to quickly indicate which passes are enabled. */
00203     uint8 fwdtree;
00204     uint8 fwdflat;
00205     uint8 bestpath;
00206 
00207     /* State of procesing. */
00208     uint8 done;
00209 
00210     /* Allocators */
00211     listelem_alloc_t *chan_alloc; 
00212     listelem_alloc_t *root_chan_alloc; 
00213     listelem_alloc_t *latnode_alloc; 
00231     root_chan_t *root_chan;  
00232     int32 n_root_chan_alloc; 
00233     int32 n_root_chan;       
00234     int32 n_nonroot_chan;    
00235     int32 max_nonroot_chan;  
00236     root_chan_t *rhmm_1ph;   
00246     chan_t **word_chan;
00247     bitvec_t *word_active;      
00263     int32 *homophone_set;
00264     int32 *single_phone_wid; 
00265     int32 n_1ph_words;       
00266     int32 n_1ph_LMwords;     
00275     chan_t ***active_chan_list;
00276     int32 n_active_chan[2];  
00287     int32 **active_word_list;
00288     int32 n_active_word[2];  
00290     /*
00291      * FIXME: Document all of these bits.
00292      */
00293     lastphn_cand_t *lastphn_cand;
00294     int32 n_lastphn_cand;
00295     last_ltrans_t *last_ltrans;      /* one per word */
00296     int32 cand_sf_alloc;
00297     cand_sf_t *cand_sf;
00298     bestbp_rc_t *bestbp_rc;
00299 
00300     bptbl_t *bp_table;       /* Forward pass lattice */
00301     int32 bpidx;             /* First free BPTable entry */
00302     int32 bp_table_size;
00303     int32 *bscore_stack;     /* Score stack for all possible right contexts */
00304     int32 bss_head;          /* First free BScoreStack entry */
00305     int32 bscore_stack_size;
00306 
00307     int32 n_frame_alloc; 
00308     int32 n_frame;       
00309     int32 *bp_table_idx; /* First BPTable entry for each frame */
00310     int32 *word_lat_idx; /* BPTable index for any word in current frame;
00311                             cleared before each frame */
00312 
00313     /*
00314      * Flat lexicon (2nd pass) search stuff.
00315      */
00316     ps_latnode_t **frm_wordlist;   
00317     int32 *fwdflat_wordlist;    
00318     bitvec_t *expand_word_flag;
00319     int32 *expand_word_list;
00320     int32 n_expand_words;
00321     int32 min_ef_width;
00322     int32 max_sf_win;
00323     float32 fwdflat_fwdtree_lw_ratio;
00324 
00325     int32 best_score; 
00326     int32 last_phone_best_score; 
00327     int32 renormalized;
00328 
00329     /*
00330      * DAG (3rd pass) search stuff.
00331      */
00332     float32 bestpath_fwdtree_lw_ratio;
00333     float32 ascale; 
00335     ngram_search_stats_t st; 
00336     ptmr_t fwdtree_perf;
00337     ptmr_t fwdflat_perf;
00338     ptmr_t bestpath_perf;
00339     int32 n_tot_frame;
00340 
00341     /* A collection of beam widths. */
00342     int32 beam;
00343     int32 dynamic_beam;
00344     int32 pbeam;
00345     int32 wbeam;
00346     int32 lpbeam;
00347     int32 lponlybeam;
00348     int32 fwdflatbeam;
00349     int32 fwdflatwbeam;
00350     int32 fillpen;
00351     int32 silpen;
00352     int32 wip;
00353     int32 nwpen;
00354     int32 pip;
00355     int32 maxwpf;
00356     int32 maxhmmpf;
00357 };
00358 typedef struct ngram_search_s ngram_search_t;
00359 
00363 ps_search_t *ngram_search_init(cmd_ln_t *config,
00364                                acmod_t *acmod,
00365                                dict_t *dict,
00366                                dict2pid_t *d2p);
00367 
00371 void ngram_search_free(ps_search_t *ngs);
00372 
00378 int ngram_search_mark_bptable(ngram_search_t *ngs, int frame_idx);
00379 
00383 void ngram_search_save_bp(ngram_search_t *ngs, int frame_idx, int32 w,
00384                           int32 score, int32 path, int32 rc);
00385 
00389 void ngram_search_alloc_all_rc(ngram_search_t *ngs, int32 w);
00390 
00394 void ngram_search_free_all_rc(ngram_search_t *ngs, int32 w);
00395 
00401 int ngram_search_find_exit(ngram_search_t *ngs, int frame_idx, int32 *out_best_score);
00402 
00408 char const *ngram_search_bp_hyp(ngram_search_t *ngs, int bpidx);
00409 
00413 void ngram_compute_seg_scores(ngram_search_t *ngs, float32 lwf);
00414 
00418 ps_lattice_t *ngram_search_lattice(ps_search_t *search);
00419 
00423 int32 ngram_search_exit_score(ngram_search_t *ngs, bptbl_t *pbe, int rcphone);
00424 
00425 #endif /* __NGRAM_SEARCH_H__ */