00001 #include <math.h> 00002 #include "c_lpc10.h" 00003 00004 /**********************************************************/ 00005 /* Voicing coefficient and Linear Discriminant Analysis variables: 00006 Max number of VDC's and VDC levels */ 00007 #define MAXVDC 10 00008 #define NVDCL 5 00009 00010 /* Voicing Decision Parameter vector ((*) denotes zero coefficient): 00011 . maxmin (*) 00012 . lbe/lbve 00013 . zc 00014 . rc1 00015 . qs 00016 . ivrc2 00017 . ar_b 00018 . ar_f 00019 . log(lbe/lbve) (*) 00020 Define 2-D voicing decision coefficient vector according to the voicing 00021 parameter order above. Each column (VDC vector) is optimized for a specific 00022 SNR. The last element of the vector is the constant. */ 00023 static FLOAT G_vdc[MAXVDC][NVDCL] = { 00024 { 0, 0, 0, 0, 0 }, 00025 { 1714, 874, 510, 500, 500 }, 00026 { -110, -97, -70, -10, 0 }, 00027 { 334, 300, 250, 200, 0 }, 00028 { -4096, -4096, -4096, -4096, -4096 }, 00029 { -654, -1021, -1270, -1300, -1300 }, 00030 { 3752, 2451, 2194, 2000, 2000 }, 00031 { 3769, 2527, 2491, 2000, 2000 }, 00032 { 0, 0, 0, 0, 0 }, 00033 { 1181, -500, -1500, -2000, -2500 } 00034 }; 00035 00036 /* SNR levels */ 00037 static FLOAT G_vdcl[NVDCL] = { 600, 450, 300, 200, 0 }; 00038 00039 /**********************************************************/ 00040 /* Voicing Detection: makes voicing decisions for each half 00041 frame of input speech. Tentative voicing decisions are made two frames 00042 in the future (2F) for each half frame. These decisions are carried 00043 through one frame in the future (1F) to the present (P) frame where 00044 they are examined and smoothed, resulting in the final voicing 00045 decisions for each half frame. 00046 The voicing parameter (signal measurement) column vector ({value}) 00047 is based on a rectangular window of speech samples determined by the 00048 window placement algorithm. The voicing parameter vector contains the 00049 AMDF windowed maximum-to-minimum ratio, the zero crossing rate, energy 00050 measures, reflection coefficients, and prediction gains. The voicing 00051 window is placed to avoid contamination of the voicing parameter vector 00052 with speech onsets. 00053 The input signal is then classified as unvoiced (including 00054 silence) or voiced. This decision is made by a linear discriminant 00055 function consisting of a dot product of the voicing decision 00056 coefficient (vdc) row vector with the measurement column vector 00057 ({value}). The {vdc} vector is 2-dimensional, each row vector is optimized 00058 for a particular signal-to-noise ratio (SNR). So, before the dot 00059 product is performed, the SNR is estimated to select the appropriate 00060 {vdc} vector. 00061 The smoothing algorithm is a modified median smoother. The 00062 voicing discriminant function is used by the smoother to determine how 00063 strongly voiced or unvoiced a signal is. The smoothing is further 00064 modified if a speech onset and a voicing decision transition occur 00065 within one half frame. In this case, the voicing decision transition 00066 is extended to the speech onset. For transmission purposes, there are 00067 constraints on the duration and transition of voicing decisions. The 00068 smoother takes these constraints into account. 00069 Finally, the energy estimates are updated along with the dither 00070 threshold used to calculate the zero crossing rate (ZC). 00071 00072 Inputs: 00073 . vwin[AF][2] - Voicing window limits (vwin[AF-1][0] to vwin[AF-1][1]) 00074 . inbuf[] - Raw input speech (with DC bias removed each frame) (vwin limited) 00075 . lpbuf[] - Low-pass filtered speech buffer (vwin limited) 00076 . half - Present analysis half frame number 00077 . minamd - Minimum value of the AMDF 00078 . maxamd - Maximum value of the AMDF 00079 . mintau - Pointer to the lag of the minimum AMDF value 00080 . ivrc[2] - Inverse filter's RC's 00081 . obound[AF] - Onset boundary descriptions 00082 Output: 00083 . voibuf[AF+1][2] - Buffer of voicing decisions 00084 Internal global: 00085 . maxmin - AMDF's 1 octave windowed maximum-to-minimum ratio 00086 . lbve - Low band voiced energy 00087 . lbue - Low band unvoiced energy 00088 . fbve - Full band voiced energy 00089 . fbue - Full band unvoiced energy 00090 . slbue - Scaled low band unvoiced energy 00091 . sfbue - Scaled full band unvoiced energy 00092 . voice[AF][2] - History of LDA results 00093 . snr - Estimate of signal-to-noise ratio 00094 . dither - Zero crossing threshold level 00095 . ofbue - Previous full-band unvoiced energy 00096 . olbue - Previous low-band unvoiced energy 00097 00098 Internal: 00099 . qs - Ratio of preemphasized to full-band energies 00100 . rc1 - First reflection coefficient 00101 . ar_b - Product of the causal forward and reverse pitch prediction gains 00102 . ar_f - Product of the noncausal forward and rev. pitch prediction gains 00103 . zc - Zero crossing rate 00104 . vdcl[NVDCL] - SNR values corresponding to the set of vdc's 00105 . vdc[MAXVDC][NVDCL] - 2-D voicing decision coefficient vector 00106 . value[9] - Voicing Parameters 00107 . lbe - Ratio of low-band instantaneous to average energies 00108 . fbe - Ratio of full-band instantaneous to average energies 00109 . snr2 - Estimate of low-band signal-to-noise ratio 00110 . snrl - SNR level number 00111 . ot - Onset transition present 00112 . vstate - Decimal interpretation of binary voicing classifications 00113 Constants: 00114 . AF - The analysis frame number 00115 . REF - Reference energy for initialization and {dither} threshold 00116 . MAXVDC - Max number of VDC's 00117 . NVDCL - Number of VDC levels */ 00118 00119 VOID voicin( INDEX vwin[AF][2], FLOAT inbuf[], FLOAT lpbuf[], INDEX half, 00120 FLOAT minamd, FLOAT maxamd, INDEX mintau, FLOAT ivrc[2], 00121 INDEX* obound, BOOL voibuf[AF+1][2] ) 00122 { 00123 INDEX zc, lbe, fbe, i, snrl, vstate; 00124 FLOAT snr2, qs, rc1, ar_b, ar_f; 00125 FLOAT value[9]; 00126 BOOL ot; 00127 00128 /* The {voice} array contains the result of the linear discriminant function 00129 (analog values). The {voibuf} array contains the hard-limited binary 00130 voicing decisions. The {voice} and {voibuf} arrays, according to C 00131 memory allocation, are addressed as: [future-frame num][half-frame num] 00132 | Past | Present | Future1 | Future2 | 00133 | 0,0 | 0,1 | 1,0 | 1,1 | 2,0 | 2,1 | 3,0 | 3,1 | ---> time */ 00134 00135 /* Update linear discriminant function history each frame: */ 00136 if (!half) { 00137 g_voice[0][0] = g_voice[1][0]; 00138 g_voice[0][1] = g_voice[1][1]; 00139 g_voice[1][0] = g_voice[2][0]; 00140 g_voice[1][1] = g_voice[2][1]; 00141 if (minamd>1) 00142 g_maxmin = maxamd / minamd; 00143 else 00144 g_maxmin = maxamd; 00145 } 00146 00147 /* Calculate voicing parameters, twice per frame: */ 00148 vparms(vwin, inbuf, lpbuf, half, &g_dither, mintau, &zc, 00149 &lbe, &fbe, &qs, &rc1, &ar_b, &ar_f); 00150 00151 /* Estimate signal-to-noise ratio to select the appropriate {vdc} vector. 00152 The SNR is estimated as the running average of the ratio of the 00153 running average full-band voiced energy to the running average 00154 full-band unvoiced energy. SNR filter has gain of 63. */ 00155 g_snr = (FLOAT)((63.*(g_snr + ((g_fbue>1)?g_fbve/(FLOAT)g_fbue:g_fbve))/64.)+.5); 00156 snr2 = (g_lbue>1) ? (g_snr*g_fbue)/g_lbue : (g_snr*g_fbue); 00157 00158 /* Quantize SNR to {snrl} according to {vdcl} thresholds. 00159 Note: {snrl} can reach up to NVDCL-1 here */ 00160 for (snrl = 0; snrl < (NVDCL-1); snrl++) 00161 if (snr2 > G_vdcl[snrl]) 00162 break; 00163 00164 /* Linear discriminant voicing parameters: */ 00165 value[0] = g_maxmin; 00166 value[1] = (g_lbve>1) ? (FLOAT)lbe/(FLOAT)g_lbve : (FLOAT)lbe; 00167 value[2] = (FLOAT)zc; 00168 value[3] = rc1; 00169 value[4] = qs; 00170 value[5] = ivrc[1]; 00171 value[6] = ar_b; 00172 value[7] = ar_f; 00173 value[8] = 0.; 00174 00175 /* Evaluation of linear discriminant function: */ 00176 g_voice[AF-1][half] = G_vdc[9][snrl]; 00177 for (i = 0; i < 9; i++) 00178 g_voice[AF-1][half] += G_vdc[i][snrl] * value[i]; 00179 00180 /* Classify as voiced if discriminant > 0, otherwise unvoiced 00181 Voicing decision for current half-frame: TRUE = voiced; FALSE = Unvoiced */ 00182 voibuf[AF][half] = (g_voice[AF-1][half] > 0.); 00183 00184 if (half) { /* Skip voicing decision smoothing in first half-frame: */ 00185 /* Voicing decision smoothing rules (override of linear combination): 00186 - Unvoiced half-frames: At least two in a row. 00187 - voiced half-frames: At least two in a row in one frame. Otherwise 00188 at least three in a row. (Due to the way transition frames are encoded) 00189 00190 In many cases, the discriminant function determines how to smooth. 00191 In the following chart, the decisions marked with a * may be overridden. 00192 00193 Voicing override of transitions at onsets: If a V/UV or UV/V voicing 00194 decision transition occurs within one-half frame of an onset bounding 00195 a voicing window, then the transition is moved to occur at the onset. 00196 00197 P 1F 00198 ----- ----- 00199 0 0 0 0 00200 0 0 0* 1 (If there is an onset there) 00201 0 0 1* 0* (Based on 2F and discriminant distance) 00202 0 0 1 1 00203 0 1* 0 0 (Always) 00204 0 1* 0* 1 (Based on discriminant distance) 00205 0* 1 1 0* (Based on past, 2F, and discriminant distance) 00206 0 1* 1 1 (If there is an onset there) 00207 1 0* 0 0 (If there is an onset there) 00208 1 0 0 1 00209 1 0* 1* 0 (Based on discriminant distance) 00210 1 0* 1 1 (Always) 00211 1 1 0 0 00212 1 1 0* 1* (Based on 2F and discriminant distance) 00213 1 1 1* 0 (If there is an onset there) 00214 1 1 1 1 */ 00215 00216 /*Determine if there is an onset transition between P and 1F. 00217 {ot} (Onset Transition) is true if there is an onset between 00218 P and 1F but not after 1F. */ 00219 ot = ((obound[0] & 2) || (obound[1]==1)) && !(obound[2] & 1); 00220 00221 /* Multi-way dispatch on voicing decision history: */ 00222 vstate = (INDEX)(voibuf[1][0])*8 + (INDEX)(voibuf[1][1])*4 00223 + (INDEX)(voibuf[2][0])*2 + (INDEX)(voibuf[2][1]); 00224 switch (vstate) { 00225 case 0: 00226 break; 00227 case 1: 00228 if (ot && voibuf[3][0]) 00229 voibuf[2][0] = TRUE; 00230 break; 00231 case 2: 00232 if ((!voibuf[3][0]) || (g_voice[1][0] < -g_voice[1][1])) 00233 voibuf[2][0] = FALSE; 00234 else 00235 voibuf[2][1] = TRUE; 00236 break; 00237 case 3: 00238 break; 00239 case 4: 00240 voibuf[1][1] = FALSE; 00241 break; 00242 case 5: 00243 if (g_voice[0][1] < -g_voice[1][0]) 00244 voibuf[1][1] = FALSE; 00245 else 00246 voibuf[2][0] = TRUE; 00247 break; 00248 case 6: 00249 if (voibuf[0][0] || voibuf[3][0] || (g_voice[1][1] > g_voice[0][0])) 00250 voibuf[2][1] = TRUE; 00251 else 00252 voibuf[1][0] = TRUE; 00253 break; 00254 case 7: 00255 if (ot) 00256 voibuf[1][1] = FALSE; 00257 break; 00258 case 8: 00259 if (ot) 00260 voibuf[1][1] = TRUE; 00261 break; 00262 case 9: 00263 break; 00264 case 10: 00265 if (g_voice[1][0] < -g_voice[0][1]) 00266 voibuf[2][0] = FALSE; 00267 else 00268 voibuf[1][1] = TRUE; 00269 break; 00270 case 11: 00271 voibuf[1][1] = TRUE; 00272 break; 00273 case 12: 00274 break; 00275 case 13: 00276 if ((!voibuf[3][0]) && (g_voice[1][1] < -g_voice[1][0])) 00277 voibuf[2][1] = FALSE; 00278 else 00279 voibuf[2][0] = TRUE; 00280 break; 00281 case 14: 00282 if (ot && (!voibuf[3][0])) 00283 voibuf[2][0] = FALSE; 00284 break; 00285 default: 00286 break; 00287 } 00288 } 00289 00290 /* Now update parameters: 00291 During unvoiced half-frames, update the low band and full band unvoiced 00292 energy estimates ({lbue} and {fbue}) and also the zero crossing 00293 threshold ({dither}). (The input to the unvoiced energy filters is 00294 restricted to be less than 10dB (*3.) above the previous inputs of the 00295 filters.) 00296 During voiced half-frames, update the low-pass ({lbve}) and all-pass 00297 ({fbve}) voiced energy estimates. */ 00298 if (!voibuf[3][half]) { 00299 g_ofbue *= 3; /* +10dB */ 00300 g_sfbue = (LINDEX)((63. * g_sfbue + 8. * MMIN(fbe, g_ofbue)) / 64. + .5); 00301 g_fbue = (INDEX)(g_sfbue / 8); 00302 g_ofbue = fbe; 00303 g_olbue *= 3; /* +10dB */ 00304 g_slbue = (LINDEX)((63. * g_slbue + 8. * MMIN(lbe, g_olbue)) / 64. + .5); 00305 g_lbue = (INDEX)(g_slbue / 8); 00306 g_olbue = lbe; 00307 } 00308 else { 00309 g_lbve = (INDEX)((63. * g_lbve + lbe) / 64. + .5); 00310 g_fbve = (INDEX)((63. * g_fbve + fbe) / 64. + .5); 00311 } 00312 00313 /* Set dither threshold to yield proper zero crossing rates in the 00314 presence of low frequency noise and low level signal input. 00315 NOTE: The divisor is a function of REF, the expected energies. */ 00316 g_dither = (FLOAT)(64. * sqrt((FLOAT)g_lbue * (FLOAT)g_lbve) / REF); 00317 if (g_dither<1.) 00318 g_dither = 1.; 00319 else if (g_dither>20.) 00320 g_dither = 20.; 00321 00322 /* Voicing decisions are returned in {voibuf}. */ 00323 } 00324 00325 /**********************************************************/ 00326