diff options
Diffstat (limited to 'nlp.c')
-rw-r--r-- | nlp.c | 702 |
1 files changed, 702 insertions, 0 deletions
@@ -0,0 +1,702 @@ | |||
1 | /*---------------------------------------------------------------------------*\ | ||
2 | |||
3 | FILE........: nlp.c | ||
4 | AUTHOR......: David Rowe | ||
5 | DATE CREATED: 23/3/93 | ||
6 | |||
7 | Non Linear Pitch (NLP) estimation functions. | ||
8 | |||
9 | \*---------------------------------------------------------------------------*/ | ||
10 | |||
11 | /* | ||
12 | Copyright (C) 2009 David Rowe | ||
13 | |||
14 | All rights reserved. | ||
15 | |||
16 | This program is free software; you can redistribute it and/or modify | ||
17 | it under the terms of the GNU Lesser General Public License version 2.1, as | ||
18 | published by the Free Software Foundation. This program is | ||
19 | distributed in the hope that it will be useful, but WITHOUT ANY | ||
20 | WARRANTY; without even the implied warranty of MERCHANTABILITY or | ||
21 | FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public | ||
22 | License for more details. | ||
23 | |||
24 | You should have received a copy of the GNU Lesser General Public License | ||
25 | along with this program; if not, see <http://www.gnu.org/licenses/>. | ||
26 | */ | ||
27 | |||
28 | #include "defines.h" | ||
29 | #include "nlp.h" | ||
30 | #include "dump.h" | ||
31 | #include "codec2_fft.h" | ||
32 | #undef PROFILE | ||
33 | #include "machdep.h" | ||
34 | #include "os.h" | ||
35 | |||
36 | #include <assert.h> | ||
37 | #include <math.h> | ||
38 | #include <stdlib.h> | ||
39 | |||
40 | /*---------------------------------------------------------------------------*\ | ||
41 | |||
42 | DEFINES | ||
43 | |||
44 | \*---------------------------------------------------------------------------*/ | ||
45 | |||
46 | #define PMAX_M 320 /* maximum NLP analysis window size */ | ||
47 | #define COEFF 0.95 /* notch filter parameter */ | ||
48 | #define PE_FFT_SIZE 512 /* DFT size for pitch estimation */ | ||
49 | #define DEC 5 /* decimation factor */ | ||
50 | #define SAMPLE_RATE 8000 | ||
51 | #define PI 3.141592654 /* mathematical constant */ | ||
52 | #define T 0.1 /* threshold for local minima candidate */ | ||
53 | #define F0_MAX 500 | ||
54 | #define CNLP 0.3 /* post processor constant */ | ||
55 | #define NLP_NTAP 48 /* Decimation LPF order */ | ||
56 | #undef POST_PROCESS_MBE /* choose post processor */ | ||
57 | |||
58 | /* 8 to 16 kHz sample rate conversion */ | ||
59 | |||
60 | #define FDMDV_OS 2 /* oversampling rate */ | ||
61 | #define FDMDV_OS_TAPS_16K 48 /* number of OS filter taps at 16kHz */ | ||
62 | #define FDMDV_OS_TAPS_8K (FDMDV_OS_TAPS_16K/FDMDV_OS) /* number of OS filter taps at 8kHz */ | ||
63 | |||
64 | /*---------------------------------------------------------------------------*\ | ||
65 | |||
66 | GLOBALS | ||
67 | |||
68 | \*---------------------------------------------------------------------------*/ | ||
69 | |||
70 | /* 48 tap 600Hz low pass FIR filter coefficients */ | ||
71 | |||
72 | const float nlp_fir[] = { | ||
73 | -1.0818124e-03, | ||
74 | -1.1008344e-03, | ||
75 | -9.2768838e-04, | ||
76 | -4.2289438e-04, | ||
77 | 5.5034190e-04, | ||
78 | 2.0029849e-03, | ||
79 | 3.7058509e-03, | ||
80 | 5.1449415e-03, | ||
81 | 5.5924666e-03, | ||
82 | 4.3036754e-03, | ||
83 | 8.0284511e-04, | ||
84 | -4.8204610e-03, | ||
85 | -1.1705810e-02, | ||
86 | -1.8199275e-02, | ||
87 | -2.2065282e-02, | ||
88 | -2.0920610e-02, | ||
89 | -1.2808831e-02, | ||
90 | 3.2204775e-03, | ||
91 | 2.6683811e-02, | ||
92 | 5.5520624e-02, | ||
93 | 8.6305944e-02, | ||
94 | 1.1480192e-01, | ||
95 | 1.3674206e-01, | ||
96 | 1.4867556e-01, | ||
97 | 1.4867556e-01, | ||
98 | 1.3674206e-01, | ||
99 | 1.1480192e-01, | ||
100 | 8.6305944e-02, | ||
101 | 5.5520624e-02, | ||
102 | 2.6683811e-02, | ||
103 | 3.2204775e-03, | ||
104 | -1.2808831e-02, | ||
105 | -2.0920610e-02, | ||
106 | -2.2065282e-02, | ||
107 | -1.8199275e-02, | ||
108 | -1.1705810e-02, | ||
109 | -4.8204610e-03, | ||
110 | 8.0284511e-04, | ||
111 | 4.3036754e-03, | ||
112 | 5.5924666e-03, | ||
113 | 5.1449415e-03, | ||
114 | 3.7058509e-03, | ||
115 | 2.0029849e-03, | ||
116 | 5.5034190e-04, | ||
117 | -4.2289438e-04, | ||
118 | -9.2768838e-04, | ||
119 | -1.1008344e-03, | ||
120 | -1.0818124e-03 | ||
121 | }; | ||
122 | |||
123 | typedef struct { | ||
124 | int Fs; /* sample rate in Hz */ | ||
125 | int m; | ||
126 | float w[PMAX_M/DEC]; /* DFT window */ | ||
127 | float sq[PMAX_M]; /* squared speech samples */ | ||
128 | float mem_x,mem_y; /* memory for notch filter */ | ||
129 | float mem_fir[NLP_NTAP]; /* decimation FIR filter memory */ | ||
130 | codec2_fft_cfg fft_cfg; /* kiss FFT config */ | ||
131 | float *Sn16k; /* Fs=16kHz input speech vector */ | ||
132 | FILE *f; | ||
133 | } NLP; | ||
134 | |||
135 | #ifdef POST_PROCESS_MBE | ||
136 | float test_candidate_mbe(COMP Sw[], COMP W[], float f0); | ||
137 | float post_process_mbe(COMP Fw[], int pmin, int pmax, float gmax, COMP Sw[], COMP W[], float *prev_Wo); | ||
138 | #endif | ||
139 | float post_process_sub_multiples(COMP Fw[], | ||
140 | int pmin, int pmax, float gmax, int gmax_bin, | ||
141 | float *prev_f0); | ||
142 | static void fdmdv_16_to_8(float out8k[], float in16k[], int n); | ||
143 | |||
144 | /*---------------------------------------------------------------------------*\ | ||
145 | |||
146 | nlp_create() | ||
147 | |||
148 | Initialisation function for NLP pitch estimator. | ||
149 | |||
150 | \*---------------------------------------------------------------------------*/ | ||
151 | |||
152 | void *nlp_create(C2CONST *c2const) | ||
153 | { | ||
154 | NLP *nlp; | ||
155 | int i; | ||
156 | int m = c2const->m_pitch; | ||
157 | int Fs = c2const->Fs; | ||
158 | |||
159 | nlp = (NLP*)malloc(sizeof(NLP)); | ||
160 | if (nlp == NULL) | ||
161 | return NULL; | ||
162 | |||
163 | assert((Fs == 8000) || (Fs == 16000)); | ||
164 | nlp->Fs = Fs; | ||
165 | |||
166 | nlp->m = m; | ||
167 | |||
168 | /* if running at 16kHz allocate storage for decimating filter memory */ | ||
169 | |||
170 | if (Fs == 16000) { | ||
171 | nlp->Sn16k = (float*)malloc(sizeof(float)*(FDMDV_OS_TAPS_16K + c2const->n_samp)); | ||
172 | for(i=0; i<FDMDV_OS_TAPS_16K; i++) { | ||
173 | nlp->Sn16k[i] = 0.0; | ||
174 | } | ||
175 | if (nlp->Sn16k == NULL) { | ||
176 | free(nlp); | ||
177 | return NULL; | ||
178 | } | ||
179 | |||
180 | /* most processing occurs at 8 kHz sample rate so halve m */ | ||
181 | |||
182 | m /= 2; | ||
183 | } | ||
184 | |||
185 | assert(m <= PMAX_M); | ||
186 | |||
187 | for(i=0; i<m/DEC; i++) { | ||
188 | nlp->w[i] = 0.5 - 0.5*cosf(2*PI*i/(m/DEC-1)); | ||
189 | } | ||
190 | |||
191 | for(i=0; i<PMAX_M; i++) | ||
192 | nlp->sq[i] = 0.0; | ||
193 | nlp->mem_x = 0.0; | ||
194 | nlp->mem_y = 0.0; | ||
195 | for(i=0; i<NLP_NTAP; i++) | ||
196 | nlp->mem_fir[i] = 0.0; | ||
197 | |||
198 | nlp->fft_cfg = codec2_fft_alloc (PE_FFT_SIZE, 0, NULL, NULL); | ||
199 | assert(nlp->fft_cfg != NULL); | ||
200 | |||
201 | return (void*)nlp; | ||
202 | } | ||
203 | |||
204 | /*---------------------------------------------------------------------------*\ | ||
205 | |||
206 | nlp_destroy() | ||
207 | |||
208 | Shut down function for NLP pitch estimator. | ||
209 | |||
210 | \*---------------------------------------------------------------------------*/ | ||
211 | |||
212 | void nlp_destroy(void *nlp_state) | ||
213 | { | ||
214 | NLP *nlp; | ||
215 | assert(nlp_state != NULL); | ||
216 | nlp = (NLP*)nlp_state; | ||
217 | |||
218 | codec2_fft_free(nlp->fft_cfg); | ||
219 | if (nlp->Fs == 16000) { | ||
220 | free(nlp->Sn16k); | ||
221 | } | ||
222 | free(nlp_state); | ||
223 | } | ||
224 | |||
225 | /*---------------------------------------------------------------------------*\ | ||
226 | |||
227 | nlp() | ||
228 | |||
229 | Determines the pitch in samples using the Non Linear Pitch (NLP) | ||
230 | algorithm [1]. Returns the fundamental in Hz. Note that the actual | ||
231 | pitch estimate is for the centre of the M sample Sn[] vector, not | ||
232 | the current N sample input vector. This is (I think) a delay of 2.5 | ||
233 | frames with N=80 samples. You should align further analysis using | ||
234 | this pitch estimate to be centred on the middle of Sn[]. | ||
235 | |||
236 | Two post processors have been tried, the MBE version (as discussed | ||
237 | in [1]), and a post processor that checks sub-multiples. Both | ||
238 | suffer occasional gross pitch errors (i.e. neither are perfect). In | ||
239 | the presence of background noise the sub-multiple algorithm tends | ||
240 | towards low F0 which leads to better sounding background noise than | ||
241 | the MBE post processor. | ||
242 | |||
243 | A good way to test and develop the NLP pitch estimator is using the | ||
244 | tnlp (codec2/unittest) and the codec2/octave/plnlp.m Octave script. | ||
245 | |||
246 | A pitch tracker searching a few frames forward and backward in time | ||
247 | would be a useful addition. | ||
248 | |||
249 | References: | ||
250 | |||
251 | [1] http://rowetel.com/downloads/1997_rowe_phd_thesis.pdf Chapter 4 | ||
252 | |||
253 | \*---------------------------------------------------------------------------*/ | ||
254 | |||
255 | float nlp( | ||
256 | void *nlp_state, | ||
257 | float Sn[], /* input speech vector */ | ||
258 | int n, /* frames shift (no. new samples in Sn[]) */ | ||
259 | float *pitch, /* estimated pitch period in samples at current Fs */ | ||
260 | COMP Sw[], /* Freq domain version of Sn[] */ | ||
261 | COMP W[], /* Freq domain window */ | ||
262 | float *prev_f0 /* previous pitch f0 in Hz, memory for pitch tracking */ | ||
263 | ) | ||
264 | { | ||
265 | NLP *nlp; | ||
266 | float notch; /* current notch filter output */ | ||
267 | COMP Fw[PE_FFT_SIZE]; /* DFT of squared signal (input/output) */ | ||
268 | float gmax; | ||
269 | int gmax_bin; | ||
270 | int m, i, j; | ||
271 | float best_f0; | ||
272 | PROFILE_VAR(start, tnotch, filter, peakpick, window, fft, magsq, shiftmem); | ||
273 | |||
274 | assert(nlp_state != NULL); | ||
275 | nlp = (NLP*)nlp_state; | ||
276 | m = nlp->m; | ||
277 | |||
278 | /* Square, notch filter at DC, and LP filter vector */ | ||
279 | |||
280 | /* If running at 16 kHz decimate to 8 kHz, as NLP ws designed for | ||
281 | Fs = 8kHz. The decimating filter introduces about 3ms of delay, | ||
282 | that shouldn't be a problem as pitch changes slowly. */ | ||
283 | |||
284 | if (nlp->Fs == 8000) { | ||
285 | /* Square latest input samples */ | ||
286 | |||
287 | for(i=m-n; i<m; i++) { | ||
288 | nlp->sq[i] = Sn[i]*Sn[i]; | ||
289 | } | ||
290 | } | ||
291 | else { | ||
292 | assert(nlp->Fs == 16000); | ||
293 | |||
294 | /* re-sample at 8 KHz */ | ||
295 | |||
296 | for(i=0; i<n; i++) { | ||
297 | nlp->Sn16k[FDMDV_OS_TAPS_16K+i] = Sn[m-n+i]; | ||
298 | } | ||
299 | |||
300 | m /= 2; n /= 2; | ||
301 | |||
302 | float Sn8k[n]; | ||
303 | fdmdv_16_to_8(Sn8k, &nlp->Sn16k[FDMDV_OS_TAPS_16K], n); | ||
304 | |||
305 | /* Square latest input samples */ | ||
306 | |||
307 | for(i=m-n, j=0; i<m; i++, j++) { | ||
308 | nlp->sq[i] = Sn8k[j]*Sn8k[j]; | ||
309 | } | ||
310 | assert(j <= n); | ||
311 | } | ||
312 | //fprintf(stderr, "n: %d m: %d\n", n, m); | ||
313 | |||
314 | PROFILE_SAMPLE(start); | ||
315 | |||
316 | for(i=m-n; i<m; i++) { /* notch filter at DC */ | ||
317 | notch = nlp->sq[i] - nlp->mem_x; | ||
318 | notch += COEFF*nlp->mem_y; | ||
319 | nlp->mem_x = nlp->sq[i]; | ||
320 | nlp->mem_y = notch; | ||
321 | nlp->sq[i] = notch + 1.0; /* With 0 input vectors to codec, | ||
322 | kiss_fft() would take a long | ||
323 | time to execute when running in | ||
324 | real time. Problem was traced | ||
325 | to kiss_fft function call in | ||
326 | this function. Adding this small | ||
327 | constant fixed problem. Not | ||
328 | exactly sure why. */ | ||
329 | } | ||
330 | |||
331 | PROFILE_SAMPLE_AND_LOG(tnotch, start, " square and notch"); | ||
332 | |||
333 | for(i=m-n; i<m; i++) { /* FIR filter vector */ | ||
334 | |||
335 | for(j=0; j<NLP_NTAP-1; j++) | ||
336 | nlp->mem_fir[j] = nlp->mem_fir[j+1]; | ||
337 | nlp->mem_fir[NLP_NTAP-1] = nlp->sq[i]; | ||
338 | |||
339 | nlp->sq[i] = 0.0; | ||
340 | for(j=0; j<NLP_NTAP; j++) | ||
341 | nlp->sq[i] += nlp->mem_fir[j]*nlp_fir[j]; | ||
342 | } | ||
343 | |||
344 | PROFILE_SAMPLE_AND_LOG(filter, tnotch, " filter"); | ||
345 | |||
346 | /* Decimate and DFT */ | ||
347 | |||
348 | for(i=0; i<PE_FFT_SIZE; i++) { | ||
349 | Fw[i].real = 0.0; | ||
350 | Fw[i].imag = 0.0; | ||
351 | } | ||
352 | for(i=0; i<m/DEC; i++) { | ||
353 | Fw[i].real = nlp->sq[i*DEC]*nlp->w[i]; | ||
354 | } | ||
355 | PROFILE_SAMPLE_AND_LOG(window, filter, " window"); | ||
356 | #ifdef DUMP | ||
357 | dump_dec(Fw); | ||
358 | #endif | ||
359 | |||
360 | // FIXME: check if this can be converted to a real fft | ||
361 | // since all imag inputs are 0 | ||
362 | codec2_fft_inplace(nlp->fft_cfg, Fw); | ||
363 | PROFILE_SAMPLE_AND_LOG(fft, window, " fft"); | ||
364 | |||
365 | for(i=0; i<PE_FFT_SIZE; i++) | ||
366 | Fw[i].real = Fw[i].real*Fw[i].real + Fw[i].imag*Fw[i].imag; | ||
367 | |||
368 | PROFILE_SAMPLE_AND_LOG(magsq, fft, " mag sq"); | ||
369 | #ifdef DUMP | ||
370 | dump_sq(m, nlp->sq); | ||
371 | dump_Fw(Fw); | ||
372 | #endif | ||
373 | |||
374 | /* todo: express everything in f0, as pitch in samples is dep on Fs */ | ||
375 | |||
376 | int pmin = floor(SAMPLE_RATE*P_MIN_S); | ||
377 | int pmax = floor(SAMPLE_RATE*P_MAX_S); | ||
378 | |||
379 | /* find global peak */ | ||
380 | |||
381 | gmax = 0.0; | ||
382 | gmax_bin = PE_FFT_SIZE*DEC/pmax; | ||
383 | for(i=PE_FFT_SIZE*DEC/pmax; i<=PE_FFT_SIZE*DEC/pmin; i++) { | ||
384 | if (Fw[i].real > gmax) { | ||
385 | gmax = Fw[i].real; | ||
386 | gmax_bin = i; | ||
387 | } | ||
388 | } | ||
389 | |||
390 | PROFILE_SAMPLE_AND_LOG(peakpick, magsq, " peak pick"); | ||
391 | |||
392 | #ifdef POST_PROCESS_MBE | ||
393 | best_f0 = post_process_mbe(Fw, pmin, pmax, gmax, Sw, W, prev_f0); | ||
394 | #else | ||
395 | best_f0 = post_process_sub_multiples(Fw, pmin, pmax, gmax, gmax_bin, prev_f0); | ||
396 | #endif | ||
397 | |||
398 | PROFILE_SAMPLE_AND_LOG(shiftmem, peakpick, " post process"); | ||
399 | |||
400 | /* Shift samples in buffer to make room for new samples */ | ||
401 | |||
402 | for(i=0; i<m-n; i++) | ||
403 | nlp->sq[i] = nlp->sq[i+n]; | ||
404 | |||
405 | /* return pitch period in samples and F0 estimate */ | ||
406 | |||
407 | *pitch = (float)nlp->Fs/best_f0; | ||
408 | |||
409 | PROFILE_SAMPLE_AND_LOG2(shiftmem, " shift mem"); | ||
410 | |||
411 | PROFILE_SAMPLE_AND_LOG2(start, " nlp int"); | ||
412 | |||
413 | *prev_f0 = best_f0; | ||
414 | |||
415 | return(best_f0); | ||
416 | } | ||
417 | |||
418 | /*---------------------------------------------------------------------------*\ | ||
419 | |||
420 | post_process_sub_multiples() | ||
421 | |||
422 | Given the global maximma of Fw[] we search integer submultiples for | ||
423 | local maxima. If local maxima exist and they are above an | ||
424 | experimentally derived threshold (OK a magic number I pulled out of | ||
425 | the air) we choose the submultiple as the F0 estimate. | ||
426 | |||
427 | The rational for this is that the lowest frequency peak of Fw[] | ||
428 | should be F0, as Fw[] can be considered the autocorrelation function | ||
429 | of Sw[] (the speech spectrum). However sometimes due to phase | ||
430 | effects the lowest frequency maxima may not be the global maxima. | ||
431 | |||
432 | This works OK in practice and favours low F0 values in the presence | ||
433 | of background noise which means the sinusoidal codec does an OK job | ||
434 | of synthesising the background noise. High F0 in background noise | ||
435 | tends to sound more periodic introducing annoying artifacts. | ||
436 | |||
437 | \*---------------------------------------------------------------------------*/ | ||
438 | |||
439 | float post_process_sub_multiples(COMP Fw[], | ||
440 | int pmin, int pmax, float gmax, int gmax_bin, | ||
441 | float *prev_f0) | ||
442 | { | ||
443 | int min_bin, cmax_bin; | ||
444 | int mult; | ||
445 | float thresh, best_f0; | ||
446 | int b, bmin, bmax, lmax_bin; | ||
447 | float lmax; | ||
448 | int prev_f0_bin; | ||
449 | |||
450 | /* post process estimate by searching submultiples */ | ||
451 | |||
452 | mult = 2; | ||
453 | min_bin = PE_FFT_SIZE*DEC/pmax; | ||
454 | cmax_bin = gmax_bin; | ||
455 | prev_f0_bin = *prev_f0*(PE_FFT_SIZE*DEC)/SAMPLE_RATE; | ||
456 | |||
457 | while(gmax_bin/mult >= min_bin) { | ||
458 | |||
459 | b = gmax_bin/mult; /* determine search interval */ | ||
460 | bmin = 0.8*b; | ||
461 | bmax = 1.2*b; | ||
462 | if (bmin < min_bin) | ||
463 | bmin = min_bin; | ||
464 | |||
465 | /* lower threshold to favour previous frames pitch estimate, | ||
466 | this is a form of pitch tracking */ | ||
467 | |||
468 | if ((prev_f0_bin > bmin) && (prev_f0_bin < bmax)) | ||
469 | thresh = CNLP*0.5*gmax; | ||
470 | else | ||
471 | thresh = CNLP*gmax; | ||
472 | |||
473 | lmax = 0; | ||
474 | lmax_bin = bmin; | ||
475 | for (b=bmin; b<=bmax; b++) /* look for maximum in interval */ | ||
476 | if (Fw[b].real > lmax) { | ||
477 | lmax = Fw[b].real; | ||
478 | lmax_bin = b; | ||
479 | } | ||
480 | |||
481 | if (lmax > thresh) | ||
482 | if ((lmax > Fw[lmax_bin-1].real) && (lmax > Fw[lmax_bin+1].real)) { | ||
483 | cmax_bin = lmax_bin; | ||
484 | } | ||
485 | |||
486 | mult++; | ||
487 | } | ||
488 | |||
489 | best_f0 = (float)cmax_bin*SAMPLE_RATE/(PE_FFT_SIZE*DEC); | ||
490 | |||
491 | return best_f0; | ||
492 | } | ||
493 | |||
494 | #ifdef POST_PROCESS_MBE | ||
495 | |||
496 | /*---------------------------------------------------------------------------*\ | ||
497 | |||
498 | post_process_mbe() | ||
499 | |||
500 | Use the MBE pitch estimation algorithm to evaluate pitch candidates. This | ||
501 | works OK but the accuracy at low F0 is affected by NW, the analysis window | ||
502 | size used for the DFT of the input speech Sw[]. Also favours high F0 in | ||
503 | the presence of background noise which causes periodic artifacts in the | ||
504 | synthesised speech. | ||
505 | |||
506 | \*---------------------------------------------------------------------------*/ | ||
507 | |||
508 | float post_process_mbe(COMP Fw[], int pmin, int pmax, float gmax, COMP Sw[], COMP W[], float *prev_Wo) | ||
509 | { | ||
510 | float candidate_f0; | ||
511 | float f0,best_f0; /* fundamental frequency */ | ||
512 | float e,e_min; /* MBE cost function */ | ||
513 | int i; | ||
514 | #ifdef DUMP | ||
515 | float e_hz[F0_MAX]; | ||
516 | #endif | ||
517 | #if !defined(NDEBUG) || defined(DUMP) | ||
518 | int bin; | ||
519 | #endif | ||
520 | float f0_min, f0_max; | ||
521 | float f0_start, f0_end; | ||
522 | |||
523 | f0_min = (float)SAMPLE_RATE/pmax; | ||
524 | f0_max = (float)SAMPLE_RATE/pmin; | ||
525 | |||
526 | /* Now look for local maxima. Each local maxima is a candidate | ||
527 | that we test using the MBE pitch estimation algotithm */ | ||
528 | |||
529 | #ifdef DUMP | ||
530 | for(i=0; i<F0_MAX; i++) | ||
531 | e_hz[i] = -1; | ||
532 | #endif | ||
533 | e_min = 1E32; | ||
534 | best_f0 = 50; | ||
535 | for(i=PE_FFT_SIZE*DEC/pmax; i<=PE_FFT_SIZE*DEC/pmin; i++) { | ||
536 | if ((Fw[i].real > Fw[i-1].real) && (Fw[i].real > Fw[i+1].real)) { | ||
537 | |||
538 | /* local maxima found, lets test if it's big enough */ | ||
539 | |||
540 | if (Fw[i].real > T*gmax) { | ||
541 | |||
542 | /* OK, sample MBE cost function over +/- 10Hz range in 2.5Hz steps */ | ||
543 | |||
544 | candidate_f0 = (float)i*SAMPLE_RATE/(PE_FFT_SIZE*DEC); | ||
545 | f0_start = candidate_f0-20; | ||
546 | f0_end = candidate_f0+20; | ||
547 | if (f0_start < f0_min) f0_start = f0_min; | ||
548 | if (f0_end > f0_max) f0_end = f0_max; | ||
549 | |||
550 | for(f0=f0_start; f0<=f0_end; f0+= 2.5) { | ||
551 | e = test_candidate_mbe(Sw, W, f0); | ||
552 | #if !defined(NDEBUG) || defined(DUMP) | ||
553 | bin = floorf(f0); assert((bin > 0) && (bin < F0_MAX)); | ||
554 | #endif | ||
555 | #ifdef DUMP | ||
556 | e_hz[bin] = e; | ||
557 | #endif | ||
558 | if (e < e_min) { | ||
559 | e_min = e; | ||
560 | best_f0 = f0; | ||
561 | } | ||
562 | } | ||
563 | |||
564 | } | ||
565 | } | ||
566 | } | ||
567 | |||
568 | /* finally sample MBE cost function around previous pitch estimate | ||
569 | (form of pitch tracking) */ | ||
570 | |||
571 | candidate_f0 = *prev_Wo * SAMPLE_RATE/TWO_PI; | ||
572 | f0_start = candidate_f0-20; | ||
573 | f0_end = candidate_f0+20; | ||
574 | if (f0_start < f0_min) f0_start = f0_min; | ||
575 | if (f0_end > f0_max) f0_end = f0_max; | ||
576 | |||
577 | for(f0=f0_start; f0<=f0_end; f0+= 2.5) { | ||
578 | e = test_candidate_mbe(Sw, W, f0); | ||
579 | #if !defined(NDEBUG) || defined(DUMP) | ||
580 | bin = floorf(f0); assert((bin > 0) && (bin < F0_MAX)); | ||
581 | #endif | ||
582 | #ifdef DUMP | ||
583 | e_hz[bin] = e; | ||
584 | #endif | ||
585 | if (e < e_min) { | ||
586 | e_min = e; | ||
587 | best_f0 = f0; | ||
588 | } | ||
589 | } | ||
590 | |||
591 | #ifdef DUMP | ||
592 | dump_e(e_hz); | ||
593 | #endif | ||
594 | |||
595 | return best_f0; | ||
596 | } | ||
597 | |||
598 | /*---------------------------------------------------------------------------*\ | ||
599 | |||
600 | test_candidate_mbe() | ||
601 | |||
602 | Returns the error of the MBE cost function for the input f0. | ||
603 | |||
604 | Note: I think a lot of the operations below can be simplified as | ||
605 | W[].imag = 0 and has been normalised such that den always equals 1. | ||
606 | |||
607 | \*---------------------------------------------------------------------------*/ | ||
608 | |||
609 | float test_candidate_mbe( | ||
610 | COMP Sw[], | ||
611 | COMP W[], | ||
612 | float f0 | ||
613 | ) | ||
614 | { | ||
615 | COMP Sw_[FFT_ENC]; /* DFT of all voiced synthesised signal */ | ||
616 | int l,al,bl,m; /* loop variables */ | ||
617 | COMP Am; /* amplitude sample for this band */ | ||
618 | int offset; /* centers Hw[] about current harmonic */ | ||
619 | float den; /* denominator of Am expression */ | ||
620 | float error; /* accumulated error between originl and synthesised */ | ||
621 | float Wo; /* current "test" fundamental freq. */ | ||
622 | int L; | ||
623 | |||
624 | L = floorf((SAMPLE_RATE/2.0)/f0); | ||
625 | Wo = f0*(2*PI/SAMPLE_RATE); | ||
626 | |||
627 | error = 0.0; | ||
628 | |||
629 | /* Just test across the harmonics in the first 1000 Hz (L/4) */ | ||
630 | |||
631 | for(l=1; l<L/4; l++) { | ||
632 | Am.real = 0.0; | ||
633 | Am.imag = 0.0; | ||
634 | den = 0.0; | ||
635 | al = ceilf((l - 0.5)*Wo*FFT_ENC/TWO_PI); | ||
636 | bl = ceilf((l + 0.5)*Wo*FFT_ENC/TWO_PI); | ||
637 | |||
638 | /* Estimate amplitude of harmonic assuming harmonic is totally voiced */ | ||
639 | |||
640 | for(m=al; m<bl; m++) { | ||
641 | offset = FFT_ENC/2 + m - l*Wo*FFT_ENC/TWO_PI + 0.5; | ||
642 | Am.real += Sw[m].real*W[offset].real + Sw[m].imag*W[offset].imag; | ||
643 | Am.imag += Sw[m].imag*W[offset].real - Sw[m].real*W[offset].imag; | ||
644 | den += W[offset].real*W[offset].real + W[offset].imag*W[offset].imag; | ||
645 | } | ||
646 | |||
647 | Am.real = Am.real/den; | ||
648 | Am.imag = Am.imag/den; | ||
649 | |||
650 | /* Determine error between estimated harmonic and original */ | ||
651 | |||
652 | for(m=al; m<bl; m++) { | ||
653 | offset = FFT_ENC/2 + m - l*Wo*FFT_ENC/TWO_PI + 0.5; | ||
654 | Sw_[m].real = Am.real*W[offset].real - Am.imag*W[offset].imag; | ||
655 | Sw_[m].imag = Am.real*W[offset].imag + Am.imag*W[offset].real; | ||
656 | error += (Sw[m].real - Sw_[m].real)*(Sw[m].real - Sw_[m].real); | ||
657 | error += (Sw[m].imag - Sw_[m].imag)*(Sw[m].imag - Sw_[m].imag); | ||
658 | } | ||
659 | } | ||
660 | |||
661 | return error; | ||
662 | } | ||
663 | |||
664 | #endif | ||
665 | |||
666 | /*---------------------------------------------------------------------------*\ | ||
667 | |||
668 | FUNCTION....: fdmdv_16_to_8() | ||
669 | AUTHOR......: David Rowe | ||
670 | DATE CREATED: 9 May 2012 | ||
671 | |||
672 | Changes the sample rate of a signal from 16 to 8 kHz. | ||
673 | |||
674 | n is the number of samples at the 8 kHz rate, there are FDMDV_OS*n | ||
675 | samples at the 48 kHz rate. As above however a memory of | ||
676 | FDMDV_OS_TAPS samples is reqd for in16k[] (see t16_8.c unit test as example). | ||
677 | |||
678 | Low pass filter the 16 kHz signal at 4 kHz using the same filter as | ||
679 | the upsampler, then just output every FDMDV_OS-th filtered sample. | ||
680 | |||
681 | Note: this function copied from fdmdv.c, included in nlp.c as a convenience | ||
682 | to avoid linking with another source file. | ||
683 | |||
684 | \*---------------------------------------------------------------------------*/ | ||
685 | |||
686 | static void fdmdv_16_to_8(float out8k[], float in16k[], int n) | ||
687 | { | ||
688 | float acc; | ||
689 | int i,j,k; | ||
690 | |||
691 | for(i=0, k=0; k<n; i+=FDMDV_OS, k++) { | ||
692 | acc = 0.0; | ||
693 | for(j=0; j<FDMDV_OS_TAPS_16K; j++) | ||
694 | acc += fdmdv_os_filter[j]*in16k[i-j]; | ||
695 | out8k[k] = acc; | ||
696 | } | ||
697 | |||
698 | /* update filter memory */ | ||
699 | |||
700 | for(i=-FDMDV_OS_TAPS_16K; i<0; i++) | ||
701 | in16k[i] = in16k[i + n*FDMDV_OS]; | ||
702 | } | ||