SphinxBase
0.6
|
00001 /* -*- c-basic-offset: 4; indent-tabs-mode: nil -*- */ 00002 /* 00003 * Copyright (c) 2008 Beyond Access, Inc. All rights reserved. 00004 * 00005 * Redistribution and use in source and binary forms, with or without 00006 * modification, are permitted provided that the following conditions 00007 * are met: 00008 * 00009 * 1. Redistributions of source code must retain the above copyright 00010 * notice, this list of conditions and the following disclaimer. 00011 * 00012 * 2. Redistributions in binary form must reproduce the above copyright 00013 * notice, this list of conditions and the following disclaimer in 00014 * the documentation and/or other materials provided with the 00015 * distribution. 00016 * 00017 * THIS SOFTWARE IS PROVIDED BY BEYOND ACCESS, INC. ``AS IS'' AND ANY 00018 * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 00019 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 00020 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL BEYOND ACCESS, INC. NOR 00021 * ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 00022 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 00023 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 00024 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF 00025 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 00026 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 00027 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 00028 */ 00029 00035 /* This implements part of the YIN algorithm: 00036 * 00037 * "YIN, a fundamental frequency estimator for speech and music". 00038 * Alain de Cheveigné and Hideki Kawahara. Journal of the Acoustical 00039 * Society of America, 111 (4), April 2002. 00040 */ 00041 00042 #include "sphinxbase/prim_type.h" 00043 #include "sphinxbase/ckd_alloc.h" 00044 #include "sphinxbase/fixpoint.h" 00045 00046 #include "sphinxbase/yin.h" 00047 00048 #include <stdio.h> 00049 00050 struct yin_s { 00051 uint16 frame_size; 00052 uint16 search_threshold; 00053 uint16 search_range; 00054 uint16 nfr; 00056 unsigned char wsize; 00057 unsigned char wstart; 00058 unsigned char wcur; 00059 unsigned char endut; 00061 fixed32 **diff_window; 00062 uint16 *period_window; 00063 }; 00064 00068 static void 00069 cmn_diff(int16 const *signal, int32 *out_diff, int ndiff) 00070 { 00071 uint32 cum, cshift; 00072 int32 t, tscale; 00073 00074 out_diff[0] = 32768; 00075 cum = 0; 00076 cshift = 0; 00077 00078 /* Determine how many bits we can scale t up by below. */ 00079 for (tscale = 0; tscale < 32; ++tscale) 00080 if (ndiff & (1<<(31-tscale))) 00081 break; 00082 --tscale; /* Avoid teh overflowz. */ 00083 /* printf("tscale is %d (ndiff - 1) << tscale is %d\n", 00084 tscale, (ndiff-1) << tscale); */ 00085 00086 /* Somewhat elaborate block floating point implementation. 00087 * The fp implementation of this is really a lot simpler. */ 00088 for (t = 1; t < ndiff; ++t) { 00089 uint32 dd, dshift, norm; 00090 int j; 00091 00092 dd = 0; 00093 dshift = 0; 00094 for (j = 0; j < ndiff; ++j) { 00095 int diff = signal[j] - signal[t + j]; 00096 /* Guard against overflows. */ 00097 if (dd > (1UL<<tscale)) { 00098 dd >>= 1; 00099 ++dshift; 00100 } 00101 dd += (diff * diff) >> dshift; 00102 } 00103 /* Make sure the diffs and cum are shifted to the same 00104 * scaling factor (usually dshift will be zero) */ 00105 if (dshift > cshift) { 00106 cum += dd << (dshift-cshift); 00107 } 00108 else { 00109 cum += dd >> (cshift-dshift); 00110 } 00111 00112 /* Guard against overflows and also ensure that (t<<tscale) > cum. */ 00113 while (cum > (1UL<<tscale)) { 00114 cum >>= 1; 00115 ++cshift; 00116 } 00117 /* Avoid divide-by-zero! */ 00118 if (cum == 0) cum = 1; 00119 /* Calculate the normalizer in high precision. */ 00120 norm = (t << tscale) / cum; 00121 /* Do a long multiply and shift down to Q15. */ 00122 out_diff[t] = (int32)(((long long)dd * norm) 00123 >> (tscale - 15 + cshift - dshift)); 00124 /* printf("dd %d cshift %d dshift %d scaledt %d cum %d norm %d cmn %d\n", 00125 dd, cshift, dshift, (t<<tscale), cum, norm, out_diff[t]); */ 00126 } 00127 } 00128 00129 yin_t * 00130 yin_init(int frame_size, float search_threshold, 00131 float search_range, int smooth_window) 00132 { 00133 yin_t *pe; 00134 00135 pe = ckd_calloc(1, sizeof(*pe)); 00136 pe->frame_size = frame_size; 00137 pe->search_threshold = (uint16)(search_threshold * 32768); 00138 pe->search_range = (uint16)(search_range * 32768); 00139 pe->wsize = smooth_window * 2 + 1; 00140 pe->diff_window = ckd_calloc_2d(pe->wsize, 00141 pe->frame_size / 2, 00142 sizeof(**pe->diff_window)); 00143 pe->period_window = ckd_calloc(pe->wsize, 00144 sizeof(*pe->period_window)); 00145 return pe; 00146 } 00147 00148 void 00149 yin_free(yin_t *pe) 00150 { 00151 ckd_free_2d(pe->diff_window); 00152 ckd_free(pe->period_window); 00153 ckd_free(pe); 00154 } 00155 00156 void 00157 yin_start(yin_t *pe) 00158 { 00159 /* Reset the circular window pointers. */ 00160 pe->wstart = pe->endut = 0; 00161 pe->nfr = 0; 00162 } 00163 00164 void 00165 yin_end(yin_t *pe) 00166 { 00167 pe->endut = 1; 00168 } 00169 00170 int 00171 thresholded_search(int32 *diff_window, fixed32 threshold, int start, int end) 00172 { 00173 int i, min, argmin; 00174 00175 min = INT_MAX; 00176 argmin = 0; 00177 for (i = start; i < end; ++i) { 00178 int diff = diff_window[i]; 00179 00180 if (diff < threshold) { 00181 min = diff; 00182 argmin = i; 00183 break; 00184 } 00185 if (diff < min) { 00186 min = diff; 00187 argmin = i; 00188 } 00189 } 00190 return argmin; 00191 } 00192 00193 void 00194 yin_write(yin_t *pe, int16 const *frame) 00195 { 00196 int outptr, difflen; 00197 00198 /* Rotate the window one frame forward. */ 00199 ++pe->wstart; 00200 /* Fill in the frame before wstart. */ 00201 outptr = pe->wstart - 1; 00202 /* Wrap around the window pointer. */ 00203 if (pe->wstart == pe->wsize) 00204 pe->wstart = 0; 00205 00206 /* Now calculate normalized difference function. */ 00207 difflen = pe->frame_size / 2; 00208 cmn_diff(frame, pe->diff_window[outptr], difflen); 00209 00210 /* Find the first point under threshold. If not found, then 00211 * use the absolute minimum. */ 00212 pe->period_window[outptr] 00213 = thresholded_search(pe->diff_window[outptr], 00214 pe->search_threshold, 0, difflen); 00215 00216 /* Increment total number of frames. */ 00217 ++pe->nfr; 00218 } 00219 00220 int 00221 yin_read(yin_t *pe, uint16 *out_period, uint16 *out_bestdiff) 00222 { 00223 int wstart, wlen, half_wsize, i; 00224 int best, best_diff, search_width, low_period, high_period; 00225 00226 half_wsize = (pe->wsize-1)/2; 00227 /* Without any smoothing, just return the current value (don't 00228 * need to do anything to the current poitner either). */ 00229 if (half_wsize == 0) { 00230 if (pe->endut) 00231 return 0; 00232 *out_period = pe->period_window[0]; 00233 *out_bestdiff = pe->diff_window[0][pe->period_window[0]]; 00234 return 1; 00235 } 00236 00237 /* We can't do anything unless we have at least (wsize-1)/2 + 1 00238 * frames, unless we're at the end of the utterance. */ 00239 if (pe->endut == 0 && pe->nfr < half_wsize + 1) { 00240 /* Don't increment the current pointer either. */ 00241 return 0; 00242 } 00243 00244 /* Establish the smoothing window. */ 00245 /* End of utterance. */ 00246 if (pe->endut) { 00247 /* We are done (no more data) when pe->wcur = pe->wstart. */ 00248 if (pe->wcur == pe->wstart) 00249 return 0; 00250 /* I.e. pe->wcur (circular minus) half_wsize. */ 00251 wstart = (pe->wcur + pe->wsize - half_wsize) % pe->wsize; 00252 /* Number of frames from wstart up to pe->wstart. */ 00253 wlen = pe->wstart - wstart; 00254 if (wlen < 0) wlen += pe->wsize; 00255 /*printf("ENDUT! ");*/ 00256 } 00257 /* Beginning of utterance. */ 00258 else if (pe->nfr < pe->wsize) { 00259 wstart = 0; 00260 wlen = pe->nfr; 00261 } 00262 /* Normal case, it is what it is. */ 00263 else { 00264 wstart = pe->wstart; 00265 wlen = pe->wsize; 00266 } 00267 00268 /* Now (finally) look for the best local estimate. */ 00269 /* printf("Searching for local estimate in %d frames around %d\n", 00270 wlen, pe->nfr + 1 - wlen); */ 00271 best = pe->period_window[pe->wcur]; 00272 best_diff = pe->diff_window[pe->wcur][best]; 00273 for (i = 0; i < wlen; ++i) { 00274 int j = wstart + i; 00275 int diff; 00276 00277 j %= pe->wsize; 00278 diff = pe->diff_window[j][pe->period_window[j]]; 00279 /* printf("%.2f,%.2f ", 1.0 - (double)diff/32768, 00280 pe->period_window[j] ? 8000.0/pe->period_window[j] : 8000.0); */ 00281 if (diff < best_diff) { 00282 best_diff = diff; 00283 best = pe->period_window[j]; 00284 } 00285 } 00286 /* printf("best: %.2f, %.2f\n", 1.0 - (double)best_diff/32768, 00287 best ? 8000.0/best : 8000.0); */ 00288 /* If it's the same as the current one then return it. */ 00289 if (best == pe->period_window[pe->wcur]) { 00290 /* Increment the current pointer. */ 00291 if (++pe->wcur == pe->wsize) 00292 pe->wcur = 0; 00293 *out_period = best; 00294 *out_bestdiff = best_diff; 00295 return 1; 00296 } 00297 /* Otherwise, redo the search inside a narrower range. */ 00298 search_width = best * pe->search_range / 32768; 00299 /* printf("Search width = %d * %.2f = %d\n", 00300 best, (double)pe->search_range/32768, search_width); */ 00301 if (search_width == 0) search_width = 1; 00302 low_period = best - search_width; 00303 high_period = best + search_width; 00304 if (low_period < 0) low_period = 0; 00305 if (high_period > pe->frame_size / 2) high_period = pe->frame_size / 2; 00306 /* printf("Searching from %d to %d\n", low_period, high_period); */ 00307 best = thresholded_search(pe->diff_window[pe->wcur], 00308 pe->search_threshold, 00309 low_period, high_period); 00310 best_diff = pe->diff_window[pe->wcur][best]; 00311 00312 if (out_period) 00313 *out_period = (best > 65535) ? 65535 : best; 00314 if (out_bestdiff) 00315 *out_bestdiff = (best_diff > 65535) ? 65535 : best_diff; 00316 00317 /* Increment the current pointer. */ 00318 if (++pe->wcur == pe->wsize) 00319 pe->wcur = 0; 00320 return 1; 00321 }