AhoLib: Fichero Fuente c

00001 #include <math.h>
00002 #include "c_lpc10.h"
00003 
00004 /**********************************************************/
00005 /* Voicing coefficient and Linear Discriminant Analysis variables:
00006 Max number of VDC's and VDC levels */
00007 #define MAXVDC 10
00008 #define NVDCL 5
00009 
00010 /* Voicing Decision Parameter vector ((*) denotes zero coefficient):
00011 .    maxmin (*)
00012 .    lbe/lbve
00013 .    zc
00014 .    rc1
00015 .    qs
00016 .    ivrc2
00017 .    ar_b
00018 .    ar_f
00019 .    log(lbe/lbve) (*)
00020 Define 2-D voicing decision coefficient vector according to the voicing
00021 parameter order above.  Each column (VDC vector) is optimized for a specific
00022 SNR. The last element of the vector is the constant. */
00023 static FLOAT G_vdc[MAXVDC][NVDCL] = {
00024     {     0,     0,     0,     0,     0 },
00025     {  1714,   874,   510,   500,   500 },
00026     {  -110,   -97,   -70,   -10,     0 },
00027     {   334,   300,   250,   200,     0 },
00028     { -4096, -4096, -4096, -4096, -4096 },
00029     {  -654, -1021, -1270, -1300, -1300 },
00030     {  3752,  2451,  2194,  2000,  2000 },
00031     {  3769,  2527,  2491,  2000,  2000 },
00032     {     0,     0,     0,     0,     0 },
00033     {  1181,  -500, -1500, -2000, -2500 }
00034 };
00035 
00036 /* SNR levels */
00037 static FLOAT G_vdcl[NVDCL] = { 600, 450, 300, 200, 0 };
00038 
00039 /**********************************************************/
00040 /* Voicing Detection: makes voicing decisions for each half
00041 frame of input speech.  Tentative voicing decisions are made two frames
00042 in the future (2F) for each half frame.  These decisions are carried
00043 through one frame in the future (1F) to the present (P) frame where
00044 they are examined and smoothed, resulting in the final voicing
00045 decisions for each half frame.
00046 The voicing parameter (signal measurement) column vector ({value})
00047 is based on a rectangular window of speech samples determined by the
00048 window placement algorithm.  The voicing parameter vector contains the
00049 AMDF windowed maximum-to-minimum ratio, the zero crossing rate, energy
00050 measures, reflection coefficients, and prediction gains.  The voicing
00051 window is placed to avoid contamination of the voicing parameter vector
00052 with speech onsets.
00053 The input signal is then classified as unvoiced (including
00054 silence) or voiced.  This decision is made by a linear discriminant
00055 function consisting of a dot product of the voicing decision
00056 coefficient (vdc) row vector with the measurement column vector
00057 ({value}).  The {vdc} vector is 2-dimensional, each row vector is optimized
00058 for a particular signal-to-noise ratio (SNR).  So, before the dot
00059 product is performed, the SNR is estimated to select the appropriate
00060 {vdc} vector.
00061 The smoothing algorithm is a modified median smoother.  The
00062 voicing discriminant function is used by the smoother to determine how
00063 strongly voiced or unvoiced a signal is.  The smoothing is further
00064 modified if a speech onset and a voicing decision transition occur
00065 within one half frame.  In this case, the voicing decision transition
00066 is extended to the speech onset.  For transmission purposes, there are
00067 constraints on the duration and transition of voicing decisions.  The
00068 smoother takes these constraints into account.
00069 Finally, the energy estimates are updated along with the dither
00070 threshold used to calculate the zero crossing rate (ZC).
00071 
00072 Inputs:
00073 .  vwin[AF][2] - Voicing window limits (vwin[AF-1][0] to vwin[AF-1][1])
00074 .  inbuf[] - Raw input speech (with DC bias removed each frame) (vwin limited)
00075 .  lpbuf[] - Low-pass filtered speech buffer (vwin limited)
00076 .  half - Present analysis half frame number
00077 .  minamd - Minimum value of the AMDF
00078 .  maxamd - Maximum value of the AMDF
00079 .  mintau - Pointer to the lag of the minimum AMDF value
00080 .  ivrc[2] - Inverse filter's RC's
00081 .  obound[AF] - Onset boundary descriptions
00082 Output:
00083 .  voibuf[AF+1][2] - Buffer of voicing decisions
00084 Internal global:
00085 .  maxmin - AMDF's 1 octave windowed maximum-to-minimum ratio
00086 .  lbve - Low band voiced energy
00087 .  lbue - Low band unvoiced energy
00088 .  fbve - Full band voiced energy
00089 .  fbue - Full band unvoiced energy
00090 .  slbue - Scaled low band unvoiced energy
00091 .  sfbue - Scaled full band unvoiced energy
00092 .  voice[AF][2] - History of LDA results
00093 .  snr - Estimate of signal-to-noise ratio
00094 .  dither - Zero crossing threshold level
00095 .  ofbue - Previous full-band unvoiced energy
00096 .  olbue - Previous low-band unvoiced energy
00097 
00098 Internal:
00099 .  qs - Ratio of preemphasized to full-band energies
00100 .  rc1 - First reflection coefficient
00101 .  ar_b - Product of the causal forward and reverse pitch prediction gains
00102 .  ar_f - Product of the noncausal forward and rev. pitch prediction gains
00103 .  zc - Zero crossing rate
00104 .  vdcl[NVDCL] - SNR values corresponding to the set of vdc's
00105 .  vdc[MAXVDC][NVDCL] - 2-D voicing decision coefficient vector
00106 .  value[9] - Voicing Parameters
00107 .  lbe - Ratio of low-band instantaneous to average energies
00108 .  fbe - Ratio of full-band instantaneous to average energies
00109 .  snr2 - Estimate of low-band signal-to-noise ratio
00110 .  snrl - SNR level number
00111 .  ot - Onset transition present
00112 .  vstate - Decimal interpretation of binary voicing classifications
00113 Constants:
00114 .  AF - The analysis frame number
00115 .  REF - Reference energy for initialization and {dither} threshold
00116 .  MAXVDC - Max number of VDC's
00117 .  NVDCL - Number of VDC levels */
00118 
00119 VOID voicin( INDEX vwin[AF][2], FLOAT inbuf[], FLOAT lpbuf[], INDEX half,
00120         FLOAT minamd, FLOAT maxamd, INDEX mintau, FLOAT ivrc[2],
00121         INDEX* obound, BOOL voibuf[AF+1][2] )
00122 {
00123     INDEX zc, lbe, fbe, i, snrl, vstate;
00124     FLOAT snr2, qs, rc1, ar_b, ar_f;
00125     FLOAT value[9];
00126     BOOL ot;
00127 
00128 /* The {voice} array contains the result of the linear discriminant function
00129 (analog values).  The {voibuf} array contains the hard-limited binary
00130 voicing decisions.  The {voice} and {voibuf} arrays, according to C
00131 memory allocation, are addressed as: [future-frame num][half-frame num]
00132 |   Past    |  Present  |  Future1  |  Future2  |
00133 | 0,0 | 0,1 | 1,0 | 1,1 | 2,0 | 2,1 | 3,0 | 3,1 |  --->  time */
00134 
00135 /* Update linear discriminant function history each frame: */
00136     if (!half) {
00137         g_voice[0][0] = g_voice[1][0];
00138         g_voice[0][1] = g_voice[1][1];
00139         g_voice[1][0] = g_voice[2][0];
00140         g_voice[1][1] = g_voice[2][1];
00141         if (minamd>1)
00142             g_maxmin = maxamd / minamd;
00143         else
00144             g_maxmin = maxamd;
00145     }
00146 
00147 /* Calculate voicing parameters, twice per frame: */
00148     vparms(vwin, inbuf, lpbuf, half, &g_dither, mintau, &zc,
00149             &lbe, &fbe, &qs, &rc1, &ar_b, &ar_f);
00150 
00151 /* Estimate signal-to-noise ratio to select the appropriate {vdc} vector.
00152 The SNR is estimated as the running average of the ratio of the
00153 running average full-band voiced energy to the running average
00154 full-band unvoiced energy. SNR filter has gain of 63. */
00155     g_snr = (FLOAT)((63.*(g_snr + ((g_fbue>1)?g_fbve/(FLOAT)g_fbue:g_fbve))/64.)+.5);
00156     snr2 = (g_lbue>1) ? (g_snr*g_fbue)/g_lbue : (g_snr*g_fbue);
00157 
00158 /* Quantize SNR to {snrl} according to {vdcl} thresholds.
00159 Note: {snrl} can reach up to NVDCL-1 here */
00160     for (snrl = 0; snrl < (NVDCL-1); snrl++)
00161         if (snr2 > G_vdcl[snrl])
00162             break;
00163 
00164 /* Linear discriminant voicing parameters: */
00165     value[0] = g_maxmin;
00166     value[1] = (g_lbve>1) ? (FLOAT)lbe/(FLOAT)g_lbve : (FLOAT)lbe;
00167     value[2] = (FLOAT)zc;
00168     value[3] = rc1;
00169     value[4] = qs;
00170     value[5] = ivrc[1];
00171     value[6] = ar_b;
00172     value[7] = ar_f;
00173     value[8] = 0.;
00174 
00175 /* Evaluation of linear discriminant function: */
00176     g_voice[AF-1][half] = G_vdc[9][snrl];
00177     for (i = 0; i < 9; i++)
00178         g_voice[AF-1][half] += G_vdc[i][snrl] * value[i];
00179 
00180 /* Classify as voiced if discriminant > 0, otherwise unvoiced
00181 Voicing decision for current half-frame: TRUE = voiced; FALSE = Unvoiced */
00182     voibuf[AF][half] = (g_voice[AF-1][half] > 0.);
00183 
00184     if (half) {  /* Skip voicing decision smoothing in first half-frame: */
00185 /* Voicing decision smoothing rules (override of linear combination):
00186 - Unvoiced half-frames: At least two in a row.
00187 - voiced half-frames: At least two in a row in one frame. Otherwise
00188 at least three in a row. (Due to the way transition frames are encoded)
00189 
00190 In many cases, the discriminant function determines how to smooth.
00191 In the following chart, the decisions marked with a * may be overridden.
00192 
00193 Voicing override of transitions at onsets: If a V/UV or UV/V voicing
00194 decision transition occurs within one-half frame of an onset bounding
00195 a voicing window, then the transition is moved to occur at the onset.
00196 
00197 P       1F
00198 -----   -----
00199 0   0   0   0
00200 0   0   0*  1   (If there is an onset there)
00201 0   0   1*  0*  (Based on 2F and discriminant distance)
00202 0   0   1   1
00203 0   1*  0   0   (Always)
00204 0   1*  0*  1   (Based on discriminant distance)
00205 0*  1   1   0*  (Based on past, 2F, and discriminant distance)
00206 0   1*  1   1   (If there is an onset there)
00207 1   0*  0   0   (If there is an onset there)
00208 1   0   0   1
00209 1   0*  1*  0   (Based on discriminant distance)
00210 1   0*  1   1   (Always)
00211 1   1   0   0
00212 1   1   0*  1*  (Based on 2F and discriminant distance)
00213 1   1   1*  0   (If there is an onset there)
00214 1   1   1   1  */
00215 
00216 /*Determine if there is an onset transition between P and 1F.
00217 {ot} (Onset Transition) is true if there is an onset between
00218 P and 1F but not after 1F. */
00219         ot = ((obound[0] & 2) || (obound[1]==1)) && !(obound[2] & 1);
00220 
00221 /* Multi-way dispatch on voicing decision history: */
00222         vstate = (INDEX)(voibuf[1][0])*8 + (INDEX)(voibuf[1][1])*4
00223                 + (INDEX)(voibuf[2][0])*2 + (INDEX)(voibuf[2][1]);
00224         switch (vstate) {
00225         case 0:
00226             break;
00227         case 1:
00228             if (ot && voibuf[3][0])
00229                 voibuf[2][0] = TRUE;
00230             break;
00231         case 2:
00232             if ((!voibuf[3][0]) || (g_voice[1][0] < -g_voice[1][1]))
00233                 voibuf[2][0] = FALSE;
00234             else
00235                 voibuf[2][1] = TRUE;
00236             break;
00237         case 3:
00238             break;
00239         case 4:
00240             voibuf[1][1] = FALSE;
00241             break;
00242         case 5:
00243             if (g_voice[0][1] < -g_voice[1][0])
00244                 voibuf[1][1] = FALSE;
00245             else
00246                 voibuf[2][0] = TRUE;
00247             break;
00248         case 6:
00249             if (voibuf[0][0] || voibuf[3][0] || (g_voice[1][1] > g_voice[0][0]))
00250                 voibuf[2][1] = TRUE;
00251             else
00252                 voibuf[1][0] = TRUE;
00253             break;
00254         case 7:
00255             if (ot)
00256                 voibuf[1][1] = FALSE;
00257             break;
00258         case 8:
00259             if (ot)
00260                 voibuf[1][1] = TRUE;
00261             break;
00262         case 9:
00263             break;
00264         case 10:
00265             if (g_voice[1][0] < -g_voice[0][1])
00266                 voibuf[2][0] = FALSE;
00267             else
00268                 voibuf[1][1] = TRUE;
00269             break;
00270         case 11:
00271             voibuf[1][1] = TRUE;
00272             break;
00273         case 12:
00274             break;
00275         case 13:
00276             if ((!voibuf[3][0]) && (g_voice[1][1] < -g_voice[1][0]))
00277                 voibuf[2][1] = FALSE;
00278             else
00279                 voibuf[2][0] = TRUE;
00280             break;
00281         case 14:
00282             if (ot && (!voibuf[3][0]))
00283                 voibuf[2][0] = FALSE;
00284             break;
00285         default:
00286             break;
00287         }
00288     }
00289 
00290 /* Now update parameters:
00291 During unvoiced half-frames, update the low band and full band unvoiced
00292 energy estimates ({lbue} and {fbue}) and also the zero crossing
00293 threshold ({dither}).  (The input to the unvoiced energy filters is
00294 restricted to be less than 10dB (*3.) above the previous inputs of the
00295 filters.)
00296 During voiced half-frames, update the low-pass ({lbve}) and all-pass
00297 ({fbve}) voiced energy estimates. */
00298     if (!voibuf[3][half]) {
00299         g_ofbue *= 3;   /* +10dB */
00300         g_sfbue = (LINDEX)((63. * g_sfbue + 8. * MMIN(fbe, g_ofbue)) / 64. + .5);
00301         g_fbue = (INDEX)(g_sfbue / 8);
00302         g_ofbue = fbe;
00303         g_olbue *= 3;   /* +10dB */
00304         g_slbue = (LINDEX)((63. * g_slbue + 8. * MMIN(lbe, g_olbue)) / 64. + .5);
00305         g_lbue = (INDEX)(g_slbue / 8);
00306         g_olbue = lbe;
00307     }
00308     else {
00309         g_lbve = (INDEX)((63. * g_lbve + lbe) / 64. + .5);
00310         g_fbve = (INDEX)((63. * g_fbve + fbe) / 64. + .5);
00311     }
00312 
00313 /* Set dither threshold to yield proper zero crossing rates in the
00314 presence of low frequency noise and low level signal input.
00315 NOTE: The divisor is a function of REF, the expected energies. */
00316     g_dither = (FLOAT)(64. * sqrt((FLOAT)g_lbue * (FLOAT)g_lbve) / REF);
00317     if (g_dither<1.)
00318         g_dither = 1.;
00319     else if (g_dither>20.)
00320         g_dither = 20.;
00321 
00322 /* Voicing decisions are returned in {voibuf}. */
00323 }
00324 
00325 /**********************************************************/
00326
c_voice.c