diff options
| author | erdgeist@erdgeist.org <erdgeist@bauklotz.fritz.box> | 2019-07-04 23:26:09 +0200 |
|---|---|---|
| committer | erdgeist@erdgeist.org <erdgeist@bauklotz.fritz.box> | 2019-07-04 23:26:09 +0200 |
| commit | f02dfce6e6c34b3d8a7b8a0e784b506178e331fa (patch) | |
| tree | 45556e6104242d4702689760433d7321ae74ec17 /phase.c | |
stripdown of version 0.9
Diffstat (limited to 'phase.c')
| -rw-r--r-- | phase.c | 289 |
1 files changed, 289 insertions, 0 deletions
| @@ -0,0 +1,289 @@ | |||
| 1 | /*---------------------------------------------------------------------------*\ | ||
| 2 | |||
| 3 | FILE........: phase.c | ||
| 4 | AUTHOR......: David Rowe | ||
| 5 | DATE CREATED: 1/2/09 | ||
| 6 | |||
| 7 | Functions for modelling and synthesising phase. | ||
| 8 | |||
| 9 | \*---------------------------------------------------------------------------*/ | ||
| 10 | |||
| 11 | /* | ||
| 12 | Copyright (C) 2009 David Rowe | ||
| 13 | |||
| 14 | All rights reserved. | ||
| 15 | |||
| 16 | This program is free software; you can redistribute it and/or modify | ||
| 17 | it under the terms of the GNU Lesser General Public License version 2.1, as | ||
| 18 | published by the Free Software Foundation. This program is | ||
| 19 | distributed in the hope that it will be useful, but WITHOUT ANY | ||
| 20 | WARRANTY; without even the implied warranty of MERCHANTABILITY or | ||
| 21 | FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public | ||
| 22 | License for more details. | ||
| 23 | |||
| 24 | You should have received a copy of the GNU Lesser General Public License | ||
| 25 | along with this program; if not,see <http://www.gnu.org/licenses/>. | ||
| 26 | */ | ||
| 27 | |||
| 28 | #include "defines.h" | ||
| 29 | #include "phase.h" | ||
| 30 | #include "kiss_fft.h" | ||
| 31 | #include "comp.h" | ||
| 32 | #include "comp_prim.h" | ||
| 33 | #include "sine.h" | ||
| 34 | |||
| 35 | #include <assert.h> | ||
| 36 | #include <ctype.h> | ||
| 37 | #include <math.h> | ||
| 38 | #include <string.h> | ||
| 39 | #include <stdlib.h> | ||
| 40 | |||
| 41 | /*---------------------------------------------------------------------------*\ | ||
| 42 | |||
| 43 | sample_phase() | ||
| 44 | |||
| 45 | Samples phase at centre of each harmonic from and array of FFT_ENC | ||
| 46 | DFT samples. | ||
| 47 | |||
| 48 | \*---------------------------------------------------------------------------*/ | ||
| 49 | |||
| 50 | void sample_phase(MODEL *model, | ||
| 51 | COMP H[], | ||
| 52 | COMP A[] /* LPC analysis filter in freq domain */ | ||
| 53 | ) | ||
| 54 | { | ||
| 55 | int m, b; | ||
| 56 | float r; | ||
| 57 | |||
| 58 | r = TWO_PI/(FFT_ENC); | ||
| 59 | |||
| 60 | /* Sample phase at harmonics */ | ||
| 61 | |||
| 62 | for(m=1; m<=model->L; m++) { | ||
| 63 | b = (int)(m*model->Wo/r + 0.5); | ||
| 64 | H[m] = cconj(A[b]); /* synth filter 1/A is opposite phase to analysis filter */ | ||
| 65 | } | ||
| 66 | } | ||
| 67 | |||
| 68 | |||
| 69 | /*---------------------------------------------------------------------------*\ | ||
| 70 | |||
| 71 | phase_synth_zero_order() | ||
| 72 | |||
| 73 | Synthesises phases based on SNR and a rule based approach. No phase | ||
| 74 | parameters are required apart from the SNR (which can be reduced to a | ||
| 75 | 1 bit V/UV decision per frame). | ||
| 76 | |||
| 77 | The phase of each harmonic is modelled as the phase of a synthesis | ||
| 78 | filter excited by an impulse. In many Codec 2 modes the synthesis | ||
| 79 | filter is a LPC filter. Unlike the first order model the position | ||
| 80 | of the impulse is not transmitted, so we create an excitation pulse | ||
| 81 | train using a rule based approach. | ||
| 82 | |||
| 83 | Consider a pulse train with a pulse starting time n=0, with pulses | ||
| 84 | repeated at a rate of Wo, the fundamental frequency. A pulse train | ||
| 85 | in the time domain is equivalent to harmonics in the frequency | ||
| 86 | domain. We can make an excitation pulse train using a sum of | ||
| 87 | sinsusoids: | ||
| 88 | |||
| 89 | for(m=1; m<=L; m++) | ||
| 90 | ex[n] = cos(m*Wo*n) | ||
| 91 | |||
| 92 | Note: the Octave script ../octave/phase.m is an example of this if | ||
| 93 | you would like to try making a pulse train. | ||
| 94 | |||
| 95 | The phase of each excitation harmonic is: | ||
| 96 | |||
| 97 | arg(E[m]) = mWo | ||
| 98 | |||
| 99 | where E[m] are the complex excitation (freq domain) samples, | ||
| 100 | arg(x), just returns the phase of a complex sample x. | ||
| 101 | |||
| 102 | As we don't transmit the pulse position for this model, we need to | ||
| 103 | synthesise it. Now the excitation pulses occur at a rate of Wo. | ||
| 104 | This means the phase of the first harmonic advances by N_SAMP samples | ||
| 105 | over a synthesis frame of N_SAMP samples. For example if Wo is pi/20 | ||
| 106 | (200 Hz), then over a 10ms frame (N_SAMP=80 samples), the phase of the | ||
| 107 | first harmonic would advance (pi/20)*80 = 4*pi or two complete | ||
| 108 | cycles. | ||
| 109 | |||
| 110 | We generate the excitation phase of the fundamental (first | ||
| 111 | harmonic): | ||
| 112 | |||
| 113 | arg[E[1]] = Wo*N_SAMP; | ||
| 114 | |||
| 115 | We then relate the phase of the m-th excitation harmonic to the | ||
| 116 | phase of the fundamental as: | ||
| 117 | |||
| 118 | arg(E[m]) = m*arg(E[1]) | ||
| 119 | |||
| 120 | This E[m] then gets passed through the LPC synthesis filter to | ||
| 121 | determine the final harmonic phase. | ||
| 122 | |||
| 123 | Comparing to speech synthesised using original phases: | ||
| 124 | |||
| 125 | - Through headphones speech synthesised with this model is not as | ||
| 126 | good. Through a loudspeaker it is very close to original phases. | ||
| 127 | |||
| 128 | - If there are voicing errors, the speech can sound clicky or | ||
| 129 | staticy. If V speech is mistakenly declared UV, this model tends to | ||
| 130 | synthesise impulses or clicks, as there is usually very little shift or | ||
| 131 | dispersion through the LPC synthesis filter. | ||
| 132 | |||
| 133 | - When combined with LPC amplitude modelling there is an additional | ||
| 134 | drop in quality. I am not sure why, theory is interformant energy | ||
| 135 | is raised making any phase errors more obvious. | ||
| 136 | |||
| 137 | NOTES: | ||
| 138 | |||
| 139 | 1/ This synthesis model is effectively the same as a simple LPC-10 | ||
| 140 | vocoders, and yet sounds much better. Why? Conventional wisdom | ||
| 141 | (AMBE, MELP) says mixed voicing is required for high quality | ||
| 142 | speech. | ||
| 143 | |||
| 144 | 2/ I am pretty sure the Lincoln Lab sinusoidal coding guys (like xMBE | ||
| 145 | also from MIT) first described this zero phase model, I need to look | ||
| 146 | up the paper. | ||
| 147 | |||
| 148 | 3/ Note that this approach could cause some discontinuities in | ||
| 149 | the phase at the edge of synthesis frames, as no attempt is made | ||
| 150 | to make sure that the phase tracks are continuous (the excitation | ||
| 151 | phases are continuous, but not the final phases after filtering | ||
| 152 | by the LPC spectra). Technically this is a bad thing. However | ||
| 153 | this may actually be a good thing, disturbing the phase tracks a | ||
| 154 | bit. More research needed, e.g. test a synthesis model that adds | ||
| 155 | a small delta-W to make phase tracks line up for voiced | ||
| 156 | harmonics. | ||
| 157 | |||
| 158 | \*---------------------------------------------------------------------------*/ | ||
| 159 | |||
| 160 | void phase_synth_zero_order( | ||
| 161 | int n_samp, | ||
| 162 | MODEL *model, | ||
| 163 | float *ex_phase, /* excitation phase of fundamental */ | ||
| 164 | COMP H[] /* L synthesis filter freq domain samples */ | ||
| 165 | |||
| 166 | ) | ||
| 167 | { | ||
| 168 | int m; | ||
| 169 | float new_phi; | ||
| 170 | COMP Ex[MAX_AMP+1]; /* excitation samples */ | ||
| 171 | COMP A_[MAX_AMP+1]; /* synthesised harmonic samples */ | ||
| 172 | |||
| 173 | /* | ||
| 174 | Update excitation fundamental phase track, this sets the position | ||
| 175 | of each pitch pulse during voiced speech. After much experiment | ||
| 176 | I found that using just this frame's Wo improved quality for UV | ||
| 177 | sounds compared to interpolating two frames Wo like this: | ||
| 178 | |||
| 179 | ex_phase[0] += (*prev_Wo+model->Wo)*N_SAMP/2; | ||
| 180 | */ | ||
| 181 | |||
| 182 | ex_phase[0] += (model->Wo)*n_samp; | ||
| 183 | ex_phase[0] -= TWO_PI*floorf(ex_phase[0]/TWO_PI + 0.5); | ||
| 184 | |||
| 185 | for(m=1; m<=model->L; m++) { | ||
| 186 | |||
| 187 | /* generate excitation */ | ||
| 188 | |||
| 189 | if (model->voiced) { | ||
| 190 | |||
| 191 | Ex[m].real = cosf(ex_phase[0]*m); | ||
| 192 | Ex[m].imag = sinf(ex_phase[0]*m); | ||
| 193 | } | ||
| 194 | else { | ||
| 195 | |||
| 196 | /* When a few samples were tested I found that LPC filter | ||
| 197 | phase is not needed in the unvoiced case, but no harm in | ||
| 198 | keeping it. | ||
| 199 | */ | ||
| 200 | float phi = TWO_PI*(float)codec2_rand()/CODEC2_RAND_MAX; | ||
| 201 | Ex[m].real = cosf(phi); | ||
| 202 | Ex[m].imag = sinf(phi); | ||
| 203 | } | ||
| 204 | |||
| 205 | /* filter using LPC filter */ | ||
| 206 | |||
| 207 | A_[m].real = H[m].real*Ex[m].real - H[m].imag*Ex[m].imag; | ||
| 208 | A_[m].imag = H[m].imag*Ex[m].real + H[m].real*Ex[m].imag; | ||
| 209 | |||
| 210 | /* modify sinusoidal phase */ | ||
| 211 | |||
| 212 | new_phi = atan2f(A_[m].imag, A_[m].real+1E-12); | ||
| 213 | model->phi[m] = new_phi; | ||
| 214 | } | ||
| 215 | |||
| 216 | } | ||
| 217 | |||
| 218 | |||
| 219 | /*---------------------------------------------------------------------------*\ | ||
| 220 | |||
| 221 | FUNCTION....: mag_to_phase | ||
| 222 | AUTHOR......: David Rowe | ||
| 223 | DATE CREATED: Jan 2017 | ||
| 224 | |||
| 225 | Algorithm for http://www.dsprelated.com/showcode/20.php ported to C. See | ||
| 226 | also Octave function mag_to_phase.m | ||
| 227 | |||
| 228 | Given a magnitude spectrum in dB, returns a minimum-phase phase | ||
| 229 | spectra. | ||
| 230 | |||
| 231 | \*---------------------------------------------------------------------------*/ | ||
| 232 | |||
| 233 | void mag_to_phase(float phase[], /* Nfft/2+1 output phase samples in radians */ | ||
| 234 | float Gdbfk[], /* Nfft/2+1 postive freq amplitudes samples in dB */ | ||
| 235 | int Nfft, | ||
| 236 | codec2_fft_cfg fft_fwd_cfg, | ||
| 237 | codec2_fft_cfg fft_inv_cfg | ||
| 238 | ) | ||
| 239 | { | ||
| 240 | COMP Sdb[Nfft], c[Nfft], cf[Nfft], Cf[Nfft]; | ||
| 241 | int Ns = Nfft/2+1; | ||
| 242 | int i; | ||
| 243 | |||
| 244 | /* install negative frequency components, 1/Nfft takes into | ||
| 245 | account kiss fft lack of scaling on ifft */ | ||
| 246 | |||
| 247 | Sdb[0].real = Gdbfk[0]; | ||
| 248 | Sdb[0].imag = 0.0; | ||
| 249 | for(i=1; i<Ns; i++) { | ||
| 250 | Sdb[i].real = Sdb[Nfft-i].real = Gdbfk[i]; | ||
| 251 | Sdb[i].imag = Sdb[Nfft-i].imag = 0.0; | ||
| 252 | } | ||
| 253 | |||
| 254 | /* compute real cepstrum from log magnitude spectrum */ | ||
| 255 | |||
| 256 | codec2_fft(fft_inv_cfg, Sdb, c); | ||
| 257 | for(i=0; i<Nfft; i++) { | ||
| 258 | c[i].real /= (float)Nfft; | ||
| 259 | c[i].imag /= (float)Nfft; | ||
| 260 | } | ||
| 261 | |||
| 262 | /* Fold cepstrum to reflect non-min-phase zeros inside unit circle */ | ||
| 263 | |||
| 264 | cf[0] = c[0]; | ||
| 265 | for(i=1; i<Ns-1; i++) { | ||
| 266 | cf[i] = cadd(c[i],c[Nfft-i]); | ||
| 267 | } | ||
| 268 | cf[Ns-1] = c[Ns-1]; | ||
| 269 | for(i=Ns; i<Nfft; i++) { | ||
| 270 | cf[i].real = 0.0; | ||
| 271 | cf[i].imag = 0.0; | ||
| 272 | } | ||
| 273 | |||
| 274 | /* Cf = dB_magnitude + j * minimum_phase */ | ||
| 275 | |||
| 276 | codec2_fft(fft_fwd_cfg, cf, Cf); | ||
| 277 | |||
| 278 | /* The maths says we are meant to be using log(x), not 20*log10(x), | ||
| 279 | so we need to scale the phase to account for this: | ||
| 280 | log(x) = 20*log10(x)/scale */ | ||
| 281 | |||
| 282 | float scale = (20.0/logf(10.0)); | ||
| 283 | |||
| 284 | for(i=0; i<Ns; i++) { | ||
| 285 | phase[i] = Cf[i].imag/scale; | ||
| 286 | } | ||
| 287 | |||
| 288 | |||
| 289 | } | ||
