PocketSphinx
0.6
|
00001 /* -*- c-basic-offset: 4; indent-tabs-mode: nil -*- */ 00002 /* ==================================================================== 00003 * Copyright (c) 1999-2004 Carnegie Mellon University. All rights 00004 * reserved. 00005 * 00006 * Redistribution and use in source and binary forms, with or without 00007 * modification, are permitted provided that the following conditions 00008 * are met: 00009 * 00010 * 1. Redistributions of source code must retain the above copyright 00011 * notice, this list of conditions and the following disclaimer. 00012 * 00013 * 2. Redistributions in binary form must reproduce the above copyright 00014 * notice, this list of conditions and the following disclaimer in 00015 * the documentation and/or other materials provided with the 00016 * distribution. 00017 * 00018 * 00019 * THIS SOFTWARE IS PROVIDED BY CARNEGIE MELLON UNIVERSITY ``AS IS'' AND 00020 * ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, 00021 * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 00022 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY 00023 * NOR ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 00024 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 00025 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 00026 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 00027 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 00028 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 00029 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 00030 * 00031 * ==================================================================== 00032 * 00033 */ 00034 /* 00035 * fsg_lextree.h -- The collection of all the lextrees for the entire FSM. 00036 * 00037 * ********************************************** 00038 * CMU ARPA Speech Project 00039 * 00040 * Copyright (c) 2004 Carnegie Mellon University. 00041 * ALL RIGHTS RESERVED. 00042 * ********************************************** 00043 * 00044 * HISTORY 00045 * 00046 * $Log: fsg_lextree.h,v $ 00047 * Revision 1.1.1.1 2006/05/23 18:45:02 dhuggins 00048 * re-importation 00049 * 00050 * Revision 1.1 2004/07/16 00:57:12 egouvea 00051 * Added Ravi's implementation of FSG support. 00052 * 00053 * Revision 1.3 2004/06/23 20:32:16 rkm 00054 * *** empty log message *** 00055 * 00056 * Revision 1.2 2004/05/27 14:22:57 rkm 00057 * FSG cross-word triphones completed (but for single-phone words) 00058 * 00059 * Revision 1.1.1.1 2004/03/01 14:30:31 rkm 00060 * 00061 * 00062 * Revision 1.1 2004/02/23 15:53:45 rkm 00063 * Renamed from fst to fsg 00064 * 00065 * Revision 1.2 2004/02/19 21:16:54 rkm 00066 * Added fsg_search.{c,h} 00067 * 00068 * Revision 1.1 2004/02/18 15:02:34 rkm 00069 * Added fsg_lextree.{c,h} 00070 * 00071 * 00072 * 18-Feb-2004 M K Ravishankar (rkm@cs.cmu.edu) at Carnegie Mellon 00073 * Started. 00074 */ 00075 00076 00077 #ifndef __S2_FSG_LEXTREE_H__ 00078 #define __S2_FSG_LEXTREE_H__ 00079 00080 /* SphinxBase headers. */ 00081 #include <sphinxbase/cmd_ln.h> 00082 #include <sphinxbase/fsg_model.h> 00083 00084 /* Local headers. */ 00085 #include "hmm.h" 00086 #include "dict.h" 00087 #include "dict2pid.h" 00088 00089 /* 00090 * **HACK-ALERT**!! Compile-time constant determining the size of the 00091 * bitvector fsg_pnode_t.fsg_pnode_ctxt_t.bv. (See below.) 00092 * But it makes memory allocation simpler and more efficient. 00093 */ 00094 #define FSG_PNODE_CTXT_BVSZ 2 00095 00096 typedef struct { 00097 uint32 bv[FSG_PNODE_CTXT_BVSZ]; 00098 } fsg_pnode_ctxt_t; 00099 00100 00101 /* 00102 * All transitions (words) out of any given FSG state represented are by a 00103 * phonetic prefix lextree (except for epsilon or null transitions; they 00104 * are not part of the lextree). Lextree leaf nodes represent individual 00105 * FSG transitions, so no sharing is allowed at the leaf nodes. The FSG 00106 * transition probs are distributed along the lextree: the prob at a node 00107 * is the max of the probs of all leaf nodes (and, hence, FSG transitions) 00108 * reachable from that node. 00109 * 00110 * To conserve memory, the underlying HMMs with state-level information are 00111 * allocated only as needed. Root and leaf nodes must also account for all 00112 * the possible phonetic contexts, with an independent HMM for each distinct 00113 * context. 00114 */ 00115 typedef struct fsg_pnode_s { 00116 /* 00117 * If this is not a leaf node, the first successor (child) node. Otherwise 00118 * the parent FSG transition for which this is the leaf node (for figuring 00119 * the FSG destination state, and word emitted by the transition). A node 00120 * may have several children. The succ ptr gives just the first; the rest 00121 * are linked via the sibling ptr below. 00122 */ 00123 union { 00124 struct fsg_pnode_s *succ; 00125 fsg_link_t *fsglink; 00126 } next; 00127 00128 /* 00129 * For simplicity of memory management (i.e., freeing the pnodes), all 00130 * pnodes allocated for all transitions out of a state are maintained in a 00131 * linear linked list through the alloc_next pointer. 00132 */ 00133 struct fsg_pnode_s *alloc_next; 00134 00135 /* 00136 * The next node that is also a child of the parent of this node; NULL if 00137 * none. 00138 */ 00139 struct fsg_pnode_s *sibling; 00140 00141 /* 00142 * The transition (log) probability to be incurred upon transitioning to 00143 * this node. (Transition probabilities are really associated with the 00144 * transitions. But a lextree node has exactly one incoming transition. 00145 * Hence, the prob can be associated with the node.) 00146 * This is a logs2(prob) value, and includes the language weight. 00147 */ 00148 int32 logs2prob; 00149 00150 /* 00151 * The root and leaf positions associated with any transition have to deal 00152 * with multiple phonetic contexts. However, different contexts may result 00153 * in the same SSID (senone-seq ID), and can share a single pnode with that 00154 * SSID. But the pnode should track the set of context CI phones that share 00155 * it. Hence the fsg_pnode_ctxt_t bit-vector set-representation. (For 00156 * simplicity of implementation, its size is a compile-time constant for 00157 * now.) Single phone words would need a 2-D array of context, but that's 00158 * too expensive. For now, they simply use SIL as right context, so only 00159 * the left context is properly modelled. 00160 * (For word-internal phones, this field is unused, of course.) 00161 */ 00162 fsg_pnode_ctxt_t ctxt; 00163 00164 uint16 ci_ext; /* This node's CIphone as viewed externally (context) */ 00165 uint8 ppos; /* Phoneme position in pronunciation */ 00166 uint8 leaf; /* Whether this is a leaf node */ 00167 00168 /* HMM-state-level stuff here */ 00169 hmm_context_t *ctx; 00170 hmm_t hmm; 00171 } fsg_pnode_t; 00172 00173 /* Access macros */ 00174 #define fsg_pnode_leaf(p) ((p)->leaf) 00175 #define fsg_pnode_logs2prob(p) ((p)->logs2prob) 00176 #define fsg_pnode_succ(p) ((p)->next.succ) 00177 #define fsg_pnode_fsglink(p) ((p)->next.fsglink) 00178 #define fsg_pnode_sibling(p) ((p)->sibling) 00179 #define fsg_pnode_hmmptr(p) (&((p)->hmm)) 00180 #define fsg_pnode_ci_ext(p) ((p)->ci_ext) 00181 #define fsg_pnode_ppos(p) ((p)->ppos) 00182 #define fsg_pnode_leaf(p) ((p)->leaf) 00183 #define fsg_pnode_ctxt(p) ((p)->ctxt) 00184 00185 #define fsg_pnode_add_ctxt(p,c) ((p)->ctxt.bv[(c)>>5] |= (1 << ((c)&0x001f))) 00186 00187 /* 00188 * The following is macroized because its called very frequently 00189 * ::: uint32 fsg_pnode_ctxt_sub (fsg_pnode_ctxt_t *src, fsg_pnode_ctxt_t *sub); 00190 */ 00191 /* 00192 * Subtract bitvector sub from bitvector src (src updated with the result). 00193 * Return 0 if result is all 0, non-zero otherwise. 00194 */ 00195 00196 #if (FSG_PNODE_CTXT_BVSZ == 1) 00197 #define FSG_PNODE_CTXT_SUB(src,sub) \ 00198 ((src)->bv[0] = (~((sub)->bv[0]) & (src)->bv[0])) 00199 #elif (FSG_PNODE_CTXT_BVSZ == 2) 00200 #define FSG_PNODE_CTXT_SUB(src,sub) \ 00201 (((src)->bv[0] = (~((sub)->bv[0]) & (src)->bv[0])) | \ 00202 ((src)->bv[1] = (~((sub)->bv[1]) & (src)->bv[1]))) 00203 #elif (FSG_PNODE_CTXT_BVSZ == 4) 00204 #define FSG_PNODE_CTXT_SUB(src,sub) \ 00205 (((src)->bv[0] = (~((sub)->bv[0]) & (src)->bv[0])) | \ 00206 ((src)->bv[1] = (~((sub)->bv[1]) & (src)->bv[1])) | \ 00207 ((src)->bv[2] = (~((sub)->bv[2]) & (src)->bv[2])) | \ 00208 ((src)->bv[3] = (~((sub)->bv[3]) & (src)->bv[3]))) 00209 #endif 00210 00214 typedef struct fsg_lextree_s { 00215 fsg_model_t *fsg; 00216 hmm_context_t *ctx; 00217 dict_t *dict; 00218 dict2pid_t *d2p; 00219 bin_mdef_t *mdef; 00221 /* 00222 * Left and right CIphone sets for each state. 00223 * Left context CIphones for a state S: If word W transitions into S, W's 00224 * final CIphone is in S's {lc}. Words transitioning out of S must consider 00225 * these left context CIphones. 00226 * Similarly, right contexts for state S: If word W transitions out of S, 00227 * W's first CIphone is in S's {rc}. Words transitioning into S must consider 00228 * these right contexts. 00229 * 00230 * NOTE: Words may transition into and out of S INDIRECTLY, with intermediate 00231 * null transitions. 00232 * NOTE: Single-phone words are difficult; only SILENCE right context is 00233 * modelled for them. 00234 * NOTE: Non-silence filler phones aren't included in these sets. Filler 00235 * words don't use context, and present the SILENCE phone as context to 00236 * adjacent words. 00237 */ 00238 int16 **lc; 00239 int16 **rc; 00241 fsg_pnode_t **root; /* root[s] = lextree representing all transitions 00242 out of state s. Note that the "tree" for each 00243 state is actually a collection of trees, linked 00244 via fsg_pnode_t.sibling (root[s]->sibling) */ 00245 fsg_pnode_t **alloc_head; /* alloc_head[s] = head of linear list of all 00246 pnodes allocated for state s */ 00247 int32 n_pnode; /* #HMM nodes in search structure */ 00248 int32 wip; 00249 int32 pip; 00250 } fsg_lextree_t; 00251 00252 /* Access macros */ 00253 #define fsg_lextree_root(lt,s) ((lt)->root[s]) 00254 #define fsg_lextree_n_pnode(lt) ((lt)->n_pnode) 00255 00259 fsg_lextree_t *fsg_lextree_init(fsg_model_t *fsg, dict_t *dict, 00260 dict2pid_t *d2p, 00261 bin_mdef_t *mdef, hmm_context_t *ctx, 00262 int32 wip, int32 pip); 00263 00267 void fsg_lextree_free(fsg_lextree_t *fsg); 00268 00272 void fsg_lextree_dump(fsg_lextree_t *fsg, FILE *fh); 00273 00277 void fsg_psubtree_pnode_deactivate(fsg_pnode_t *pnode); 00278 00282 void fsg_pnode_add_all_ctxt(fsg_pnode_ctxt_t *ctxt); 00283 00284 00285 00286 #endif