]> git.bts.cx Git - benzene.git/blob - third_party/soloud_speech/klatt.cpp
Initial version
[benzene.git] / third_party / soloud_speech / klatt.cpp
1 #include <math.h>
2 #include <stdlib.h>
3 #include "klatt.h"
4 #include "darray.h"
5 #include "resonator.h"
6
/* Single-precision pi, defined only if no earlier header already provides PI. */
#ifndef PI
#define PI 3.1415926535897932384626433832795f
#endif

/* Null pointer constant, defined only if no earlier header already provides NULL. */
#ifndef NULL
#define NULL 0
#endif
14
/* Interpolation data for one synthesis parameter of one element.
   Each Element holds one Interp per Eparm_e slot; set_trans() turns these
   into the boundary (time, value) pairs used between adjacent elements. */
class Interp
{
public:
    float mSteady;     // steady-state value of the parameter for this element
    float mFixed;      // fixed term of the boundary value computed in set_trans()
    char mProportion;  // percentage (0..100) of the other element's mSteady added to mFixed
    char mExtDelay;    // transition time used when this element shapes a neighbour's boundary
    char mIntDelay;    // transition time used when this element shapes its own boundary
};
24
25
/* Index of each interpolated parameter inside Element::mInterpolator.
   ELM_COUNT is the number of parameters and must stay last. */
enum Eparm_e
{
    ELM_FN = 0, /* nasal frequency (copied to the frame's nasal zero frequency) */
    ELM_F1,     /* formant 1 frequency */
    ELM_F2,     /* formant 2 frequency */
    ELM_F3,     /* formant 3 frequency */
    ELM_B1,     /* formant 1 bandwidth */
    ELM_B2,     /* formant 2 bandwidth */
    ELM_B3,     /* formant 3 bandwidth */
    ELM_AN,     /* nasal amplitude (currently not forwarded to the frame) */
    ELM_A1,     /* formant 1 amplitude */
    ELM_A2,     /* formant 2 amplitude */
    ELM_A3,     /* formant 3 amplitude */
    ELM_A4,     /* formant 4 amplitude */
    ELM_A5,     /* formant 5 amplitude */
    ELM_A6,     /* formant 6 amplitude */
    ELM_AB,     /* bypass frication amplitude */
    ELM_AV,     /* voicing amplitude */
    ELM_AVC,    /* voicing breathiness */
    ELM_ASP,    /* aspiration amplitude */
    ELM_AF,     /* frication amplitude */
    ELM_COUNT
};
35
/* One synthesis element (a phoneme, or part of one).
   Instances are aggregate-initialised from "Elements.def", so the member
   order here must match the order of values in that file. */
class Element
{
public:
    const char *mName; // unused
    const char mRK;    // rank: synth() lets the higher-ranked neighbour dominate a transition
    const char mDU;    // duration value, averaged with mUD by StressDur()
    const char mUD;    // duration value, averaged with mDU by StressDur()
    unsigned char mFont; // unused
    const char *mDict; // unused
    const char *mIpa; // unused
    int mFeat; // ELEMENT_FEATURES bit mask; only ELM_FEATURE_VWL is tested in this file
    Interp mInterpolator[ELM_COUNT]; // per-parameter interpolation data, indexed by Eparm_e
};
49
/* Bit flags stored in Element::mFeat. Each flag occupies one bit;
   this file only ever tests ELM_FEATURE_VWL. */
enum ELEMENT_FEATURES
{
    ELM_FEATURE_ALV = 1 << 0,
    ELM_FEATURE_APR = 1 << 1,
    ELM_FEATURE_BCK = 1 << 2,
    ELM_FEATURE_BLB = 1 << 3,
    ELM_FEATURE_CNT = 1 << 4,
    ELM_FEATURE_DNT = 1 << 5,
    ELM_FEATURE_FNT = 1 << 6,
    ELM_FEATURE_FRC = 1 << 7,
    ELM_FEATURE_GLT = 1 << 8,
    ELM_FEATURE_HGH = 1 << 9,
    ELM_FEATURE_LAT = 1 << 10,
    ELM_FEATURE_LBD = 1 << 11,
    ELM_FEATURE_LBV = 1 << 12,
    ELM_FEATURE_LMD = 1 << 13,
    ELM_FEATURE_LOW = 1 << 14,
    ELM_FEATURE_MDL = 1 << 15,
    ELM_FEATURE_NAS = 1 << 16,
    ELM_FEATURE_PAL = 1 << 17,
    ELM_FEATURE_PLA = 1 << 18,
    ELM_FEATURE_RND = 1 << 19,
    ELM_FEATURE_RZD = 1 << 20,
    ELM_FEATURE_SMH = 1 << 21,
    ELM_FEATURE_STP = 1 << 22,
    ELM_FEATURE_UMD = 1 << 23,
    ELM_FEATURE_UNR = 1 << 24,
    ELM_FEATURE_VCD = 1 << 25,
    ELM_FEATURE_VEL = 1 << 26,
    ELM_FEATURE_VLS = 1 << 27,
    ELM_FEATURE_VWL = 1 << 28
};
82
/* Element identifiers. The numeric value is the index into gElement[],
   so the order below must not change. */
enum ELEMENTS
{
    ELM_END = 0,
    ELM_Q,
    ELM_P, ELM_PY, ELM_PZ,
    ELM_T, ELM_TY, ELM_TZ,
    ELM_K, ELM_KY, ELM_KZ,
    ELM_B, ELM_BY, ELM_BZ,
    ELM_D, ELM_DY, ELM_DZ,
    ELM_G, ELM_GY, ELM_GZ,
    ELM_M, ELM_N, ELM_NG,
    ELM_F, ELM_TH, ELM_S, ELM_SH, ELM_X, ELM_H,
    ELM_V, ELM_QQ, ELM_DH, ELM_DI, ELM_Z, ELM_ZZ, ELM_ZH,
    ELM_CH, ELM_CI, ELM_J, ELM_JY,
    ELM_L, ELM_LL, ELM_RX, ELM_R, ELM_W, ELM_Y,
    ELM_I, ELM_E, ELM_AA, ELM_U, ELM_O, ELM_OO, ELM_A,
    ELM_EE, ELM_ER, ELM_AR, ELM_AW, ELM_UU,
    ELM_AI, ELM_IE, ELM_OI, ELM_OU, ELM_OV, ELM_OA,
    ELM_IA, ELM_IB, ELM_AIR, ELM_OOR, ELM_OR
};
97
/* Number of rows in the phoneme_to_elements table. */
#define PHONEME_COUNT 53
/* Offset added to element amplitudes to reach klatt-compatible levels. */
#define AMP_ADJ 14
/* Duration of element `e` (pointer to an object with mDU/mUD members): the
   mean of its two duration fields. The stress argument `s` is evaluated but
   does not influence the result. Both arguments are parenthesised so that
   expressions such as `p + 1` expand correctly. */
#define StressDur(e,s) ((void)(s), (((e)->mDU + (e)->mUD) / 2))
101
102
103
104
/* One row of the phoneme lookup table: maps a one- or two-character
   phoneme code to the sequence of elements that realises it. */
class PhonemeToElements
{
public:
    int mKey;      // first character in the low byte, optional second character in the next byte
    char mData[8]; // mData[0] = element count, followed by up to 7 ELEMENTS ids
};
111
112 /* Order is important - 2 byte phonemes first, otherwise
113 the search function will fail*/
114 static PhonemeToElements phoneme_to_elements[PHONEME_COUNT] =
115 {
116 /* mKey, count, 0-7 elements */
117 /* tS */ 0x5374, 2, ELM_CH, ELM_CI, 0, 0, 0, 0, 0,
118 /* dZ */ 0x5a64, 4, ELM_J, ELM_JY, ELM_QQ, ELM_JY, 0, 0, 0,
119 /* rr */ 0x7272, 3, ELM_R, ELM_QQ, ELM_R, 0, 0, 0, 0,
120 /* eI */ 0x4965, 2, ELM_AI, ELM_I, 0, 0, 0, 0, 0,
121 /* aI */ 0x4961, 2, ELM_IE, ELM_I, 0, 0, 0, 0, 0,
122 /* oI */ 0x496f, 2, ELM_OI, ELM_I, 0, 0, 0, 0, 0,
123 /* aU */ 0x5561, 2, ELM_OU, ELM_OV, 0, 0, 0, 0, 0,
124 /* @U */ 0x5540, 2, ELM_OA, ELM_OV, 0, 0, 0, 0, 0,
125 /* I@ */ 0x4049, 2, ELM_IA, ELM_IB, 0, 0, 0, 0, 0,
126 /* e@ */ 0x4065, 2, ELM_AIR, ELM_IB, 0, 0, 0, 0, 0,
127 /* U@ */ 0x4055, 2, ELM_OOR, ELM_IB, 0, 0, 0, 0, 0,
128 /* O@ */ 0x404f, 2, ELM_OR, ELM_IB, 0, 0, 0, 0, 0,
129 /* oU */ 0x556f, 2, ELM_OI, ELM_OV, 0, 0, 0, 0, 0,
130 /* */ 0x0020, 1, ELM_Q, 0, 0, 0, 0, 0, 0,
131 /* p */ 0x0070, 3, ELM_P, ELM_PY, ELM_PZ, 0, 0, 0, 0,
132 /* t */ 0x0074, 3, ELM_T, ELM_TY, ELM_TZ, 0, 0, 0, 0,
133 /* k */ 0x006b, 3, ELM_K, ELM_KY, ELM_KZ, 0, 0, 0, 0,
134 /* b */ 0x0062, 3, ELM_B, ELM_BY, ELM_BZ, 0, 0, 0, 0,
135 /* d */ 0x0064, 3, ELM_D, ELM_DY, ELM_DZ, 0, 0, 0, 0,
136 /* g */ 0x0067, 3, ELM_G, ELM_GY, ELM_GZ, 0, 0, 0, 0,
137 /* m */ 0x006d, 1, ELM_M, 0, 0, 0, 0, 0, 0,
138 /* n */ 0x006e, 1, ELM_N, 0, 0, 0, 0, 0, 0,
139 /* N */ 0x004e, 1, ELM_NG, 0, 0, 0, 0, 0, 0,
140 /* f */ 0x0066, 1, ELM_F, 0, 0, 0, 0, 0, 0,
141 /* T */ 0x0054, 1, ELM_TH, 0, 0, 0, 0, 0, 0,
142 /* s */ 0x0073, 1, ELM_S, 0, 0, 0, 0, 0, 0,
143 /* S */ 0x0053, 1, ELM_SH, 0, 0, 0, 0, 0, 0,
144 /* h */ 0x0068, 1, ELM_H, 0, 0, 0, 0, 0, 0,
145 /* v */ 0x0076, 3, ELM_V, ELM_QQ, ELM_V, 0, 0, 0, 0,
146 /* D */ 0x0044, 3, ELM_DH, ELM_QQ, ELM_DI, 0, 0, 0, 0,
147 /* z */ 0x007a, 3, ELM_Z, ELM_QQ, ELM_ZZ, 0, 0, 0, 0,
148 /* Z */ 0x005a, 3, ELM_ZH, ELM_QQ, ELM_ZH, 0, 0, 0, 0,
149 /* l */ 0x006c, 1, ELM_L, 0, 0, 0, 0, 0, 0,
150 /* r */ 0x0072, 1, ELM_R, 0, 0, 0, 0, 0, 0,
151 /* R */ 0x0052, 1, ELM_RX, 0, 0, 0, 0, 0, 0,
152 /* w */ 0x0077, 1, ELM_W, 0, 0, 0, 0, 0, 0,
153 /* x */ 0x0078, 1, ELM_X, 0, 0, 0, 0, 0, 0,
154 /* % */ 0x0025, 1, ELM_QQ, 0, 0, 0, 0, 0, 0,
155 /* j */ 0x006a, 1, ELM_Y, 0, 0, 0, 0, 0, 0,
156 /* I */ 0x0049, 1, ELM_I, 0, 0, 0, 0, 0, 0,
157 /* e */ 0x0065, 1, ELM_E, 0, 0, 0, 0, 0, 0,
158 /* & */ 0x0026, 1, ELM_AA, 0, 0, 0, 0, 0, 0,
159 /* V */ 0x0056, 1, ELM_U, 0, 0, 0, 0, 0, 0,
160 /* 0 */ 0x0030, 1, ELM_O, 0, 0, 0, 0, 0, 0,
161 /* U */ 0x0055, 1, ELM_OO, 0, 0, 0, 0, 0, 0,
162 /* @ */ 0x0040, 1, ELM_A, 0, 0, 0, 0, 0, 0,
163 /* i */ 0x0069, 1, ELM_EE, 0, 0, 0, 0, 0, 0,
164 /* 3 */ 0x0033, 1, ELM_ER, 0, 0, 0, 0, 0, 0,
165 /* A */ 0x0041, 1, ELM_AR, 0, 0, 0, 0, 0, 0,
166 /* O */ 0x004f, 1, ELM_AW, 0, 0, 0, 0, 0, 0,
167 /* u */ 0x0075, 1, ELM_UU, 0, 0, 0, 0, 0, 0,
168 /* o */ 0x006f, 1, ELM_OI, 0, 0, 0, 0, 0, 0,
169 /* . */ 0x002e, 1, ELM_END,0, 0, 0, 0, 0, 0,
170 };
171
/* Table of all synthesis elements, indexed by ELEMENTS id.
   The initialisers live in "Elements.def" and are pulled in textually. */
static Element gElement[] =
{
#include "Elements.def"
};
176
/* Convert a float sample to a 16-bit sample, saturating at +/-32767.

   The range check is done on the float value before any integer conversion:
   converting a float whose truncated value does not fit in an int is
   undefined behaviour, and in practice turned large positive overshoots into
   the negative rail. In-range values are truncated toward zero as before.
   NaN input yields 0 (silence). */
static short clip(float input)
{
    /* NaN is the only value that compares unequal to itself */
    if (input != input)
    {
        return 0;
    }

    if (input >= 32767.0f)
    {
        return 32767;
    }

    if (input <= -32767.0f)
    {
        return -32767;
    }

    /* now safely inside (-32767, 32767): truncation cannot overflow */
    return (short)(int)input;
}
196
/* Convert a level in decibels to a linear scale factor.

   The lookup table is quantised to 1 dB steps (roughly the just noticeable
   difference for vowel intensity):
       87 dB -> 32767
       86 dB -> 29491   (1 dB down = 0.5 ** (1/6))
       81 dB -> 16384   (6 dB down = 0.5)
        0 dB -> 0
   The table value is scaled by 0.001 before being returned. Arguments outside
   0..87 are clamped to the nearest table entry. */
static float DBtoLIN(int dB)
{
    static const float kAmpTable[88] =
    {
        0.0f,     0.0f,     0.0f,     0.0f,     0.0f,
        0.0f,     0.0f,     0.0f,     0.0f,     0.0f,
        0.0f,     0.0f,     0.0f,     6.0f,     7.0f,
        8.0f,     9.0f,     10.0f,    11.0f,    13.0f,
        14.0f,    16.0f,    18.0f,    20.0f,    22.0f,
        25.0f,    28.0f,    32.0f,    35.0f,    40.0f,
        45.0f,    51.0f,    57.0f,    64.0f,    71.0f,
        80.0f,    90.0f,    101.0f,   114.0f,   128.0f,
        142.0f,   159.0f,   179.0f,   202.0f,   227.0f,
        256.0f,   284.0f,   318.0f,   359.0f,   405.0f,
        455.0f,   512.0f,   568.0f,   638.0f,   719.0f,
        811.0f,   911.0f,   1024.0f,  1137.0f,  1276.0f,
        1438.0f,  1622.0f,  1823.0f,  2048.0f,  2273.0f,
        2552.0f,  2875.0f,  3244.0f,  3645.0f,  4096.0f,
        4547.0f,  5104.0f,  5751.0f,  6488.0f,  7291.0f,
        8192.0f,  9093.0f,  10207.0f, 11502.0f, 12976.0f,
        14582.0f, 16384.0f, 18350.0f, 20644.0f, 23429.0f,
        26214.0f, 29491.0f, 32767.0f
    };
    const int lastIndex = (int)(sizeof(kAmpTable) / sizeof(kAmpTable[0])) - 1;

    /* Clamp the argument into the table's index range */
    int index = dB;
    if (index < 0)
    {
        index = 0;
    }
    if (index > lastIndex)
    {
        index = lastIndex;
    }

    return kAmpTable[index] * 0.001f;
}
248
249
250
/* Construct a frame filled with default synthesis parameters.
   Formant frequencies/bandwidths get fixed defaults and all formant,
   aspiration and frication amplitudes start at 0 dB.
   NOTE(review): the F0 value appears to be in tenths of Hz (flutter()
   divides it by 10) - confirm against klatt.h. */
klatt_frame::klatt_frame() :
    mF0FundamentalFreq(1330), mVoicingAmpdb(60), mFormant1Freq(500),
    mFormant1Bandwidth(60), mFormant2Freq(1500), mFormant2Bandwidth(90),
    mFormant3Freq(2800), mFormant3Bandwidth(150), mFormant4Freq(3250),
    mFormant4Bandwidth(200), mFormant5Freq(3700), mFormant5Bandwidth(200),
    mFormant6Freq(4990), mFormant6Bandwidth(500), mNasalZeroFreq(270),
    mNasalZeroBandwidth(100), mNasalPoleFreq(270), mNasalPoleBandwidth(100),
    mAspirationAmpdb(0), mNoSamplesInOpenPeriod(30), mVoicingBreathiness(0),
    mVoicingSpectralTiltdb(10), mFricationAmpdb(0), mSkewnessOfAlternatePeriods(0),
    mFormant1Ampdb(0), mFormant1ParallelBandwidth(80), mFormant2Ampdb(0),
    mFormant2ParallelBandwidth(200), mFormant3Ampdb(0), mFormant3ParallelBandwidth(350),
    mFormant4Ampdb(0), mFormant4ParallelBandwidth(500), mFormant5Ampdb(0),
    mFormant5ParallelBandwidth(600), mFormant6Ampdb(0), mFormant6ParallelBandwidth(800),
    mParallelNasalPoleAmpdb(0), mBypassFricationAmpdb(0), mPalallelVoicingAmpdb(0),
    mOverallGaindb(62)
{
};
268
269
/* Construct an idle synthesizer.
   Voice settings get defaults (base F0 1330, speed 10, declination 0.5,
   saw waveform); all running state is zeroed and the noise seed is 5.
   The sample rate and samples-per-frame stay 0 until init() is called. */
klatt::klatt() :
    mBaseF0(1330),
    mBaseSpeed(10.0f),
    mBaseDeclination(0.5f),
    mBaseWaveform(KW_SAW),
    mF0Flutter(0),
    mSampleRate(0),
    mNspFr(0),
    mF0FundamentalFreq(0),
    mVoicingAmpdb(0),
    mSkewnessOfAlternatePeriods(0),
    mTimeCount(0),
    mNPer(0),
    mT0(0),
    mNOpen(0),
    mNMod(0),
    mAmpVoice(0),
    mAmpBypas(0),
    mAmpAspir(0),
    mAmpFrica(0),
    mAmpBreth(0),
    mSkew(0),
    mVLast(0),
    mNLast(0),
    mGlotLast(0),
    mDecay(0),
    mOneMd(0),
    mSeed(5),
    mElementCount(0),
    mElement(0),
    mElementIndex(0),
    mLastElement(0),
    mTStress(0),
    mNTStress(0),
    mTop(0)
{
}
307
308 /*
309 function FLUTTER
310
311 This function adds F0 flutter, as specified in:
312
313 "Analysis, synthesis and perception of voice quality variations among
314 female and male talkers" D.H. Klatt and L.C. Klatt JASA 87(2) February 1990.
315 Flutter is added by applying a quasi-random element constructed from three
316 slowly varying sine waves.
317 */
318 void klatt::flutter()
319 {
320 int original_f0 = mFrame.mF0FundamentalFreq / 10;
321 float fla = (float) mF0Flutter / 50;
322 float flb = (float) original_f0 / 100;
323 float flc = (float)sin(2 * PI * 12.7 * mTimeCount);
324 float fld = (float)sin(2 * PI * 7.1 * mTimeCount);
325 float fle = (float)sin(2 * PI * 4.7 * mTimeCount);
326 float delta_f0 = fla * flb * (flc + fld + fle) * 10;
327 mF0FundamentalFreq += (int) delta_f0;
328 }
329
330 /* Vwave is the differentiated glottal flow waveform, there is a weak
331 spectral zero around 800 Hz, magic constants a,b reset pitch-synch
332 */
333
334 float klatt::natural_source(int aNper)
335 {
336 // See if glottis open
337 if (aNper < mNOpen)
338 {
339 switch (mBaseWaveform)
340 {
341 case KW_TRIANGLE:
342 return ((aNper % 200) - 100) * 81.92f; // triangle
343 case KW_SIN:
344 return (float)(sin(aNper * 0.0314) * 8192); // sin
345 case KW_SQUARE:
346 return ((aNper % 200) - 100) > 0 ? 8192.0f : -8192.0f; // square
347 case KW_PULSE:
348 return ((aNper % 200) - 100) > 50 ? 8192.0f : -8192.0f; // pulse
349 case KW_NOISE:
350 return (int)mNLast & 1 ? -8192.0f : 8192.0f;
351 case KW_WARBLE:
352 return (int)mNLast & 7 ? -8192.0f : 8192.0f;
353 case KW_SAW: // fallthrough
354 default:
355 return (abs((aNper % 200) - 100) - 50) * 163.84f; // saw
356 }
357 }
358 else
359 {
360 // Glottis closed
361 return (0.0);
362 }
363
364 }
365
366 /* Reset selected parameters pitch-synchronously */
367
368 void klatt::pitch_synch_par_reset(int ns)
369 {
370 if (mF0FundamentalFreq > 0)
371 {
372 mT0 = (40 * mSampleRate) / mF0FundamentalFreq;
373
374 /* Period in samp*4 */
375 mAmpVoice = DBtoLIN(mVoicingAmpdb);
376
377 /* Duration of period before amplitude modulation */
378 mNMod = mT0;
379
380 if (mVoicingAmpdb > 0)
381 {
382 mNMod >>= 1;
383 }
384
385 /* Breathiness of voicing waveform */
386
387 mAmpBreth = DBtoLIN(mFrame.mVoicingBreathiness) * 0.1f;
388
389 /* Set open phase of glottal period */
390 /* where 40 <= open phase <= 263 */
391
392 mNOpen = 4 * mFrame.mNoSamplesInOpenPeriod;
393
394 if (mNOpen >= (mT0 - 1))
395 {
396 mNOpen = mT0 - 2;
397 }
398
399 if (mNOpen < 40)
400 {
401 mNOpen = 40; /* F0 max = 1000 Hz */
402 }
403
404 int temp;
405 float temp1;
406
407 temp = mSampleRate / mNOpen;
408 mCritDampedGlotLowPassFilter.initResonator(0L, temp, mSampleRate);
409
410 /* Make gain at F1 about constant */
411
412 temp1 = mNOpen * .00833f;
413 mCritDampedGlotLowPassFilter.setGain(temp1 * temp1);
414
415 /* Truncate skewness so as not to exceed duration of closed phase
416 of glottal period */
417
418 temp = mT0 - mNOpen;
419
420 if (mSkewnessOfAlternatePeriods > temp)
421 {
422 mSkewnessOfAlternatePeriods = temp;
423 }
424
425 if (mSkew >= 0)
426 {
427 mSkew = mSkewnessOfAlternatePeriods; /* Reset mSkew to requested mSkewnessOfAlternatePeriods */
428 }
429 else
430 {
431 mSkew = -mSkewnessOfAlternatePeriods;
432 }
433
434 /* Add skewness to closed portion of voicing period */
435
436 mT0 = mT0 + mSkew;
437 mSkew = -mSkew;
438 }
439 else
440 {
441 mT0 = 4; /* Default for f0 undefined */
442 mAmpVoice = 0.0;
443 mNMod = mT0;
444 mAmpBreth = 0.0;
445 }
446
447 /* Reset these pars pitch synchronously or at update rate if f0=0 */
448
449 if ((mT0 != 4) || (ns == 0))
450 {
451 /* Set one-pole ELM_FEATURE_LOW-pass filter that tilts glottal source */
452 mDecay = (0.033f * mFrame.mVoicingSpectralTiltdb); /* Function of samp_rate ? */
453
454 if (mDecay > 0.0f)
455 {
456 mOneMd = 1.0f - mDecay;
457 }
458 else
459 {
460 mOneMd = 1.0f;
461 }
462 }
463 }
464
465
466 /* Get variable parameters from host computer,
467 initially also get definition of fixed pars
468 */
469
470 void klatt::frame_init()
471 {
472 int mOverallGaindb; /* Overall gain, 60 dB is unity 0 to 60 */
473 float amp_parF1; /* mFormant1Ampdb converted to linear gain */
474 float amp_parFN; /* mParallelNasalPoleAmpdb converted to linear gain */
475 float amp_parF2; /* mFormant2Ampdb converted to linear gain */
476 float amp_parF3; /* mFormant3Ampdb converted to linear gain */
477 float amp_parF4; /* mFormant4Ampdb converted to linear gain */
478 float amp_parF5; /* mFormant5Ampdb converted to linear gain */
479 float amp_parF6; /* mFormant6Ampdb converted to linear gain */
480
481 /* Read speech frame definition into temp store
482 and move some parameters into active use immediately
483 (voice-excited ones are updated pitch synchronously
484 to avoid waveform glitches).
485 */
486
487 mF0FundamentalFreq = mFrame.mF0FundamentalFreq;
488 mVoicingAmpdb = mFrame.mVoicingAmpdb - 7;
489
490 if (mVoicingAmpdb < 0) mVoicingAmpdb = 0;
491
492 mAmpAspir = DBtoLIN(mFrame.mAspirationAmpdb) * .05f;
493 mAmpFrica = DBtoLIN(mFrame.mFricationAmpdb) * 0.25f;
494 mSkewnessOfAlternatePeriods = mFrame.mSkewnessOfAlternatePeriods;
495
496 /* Fudge factors (which comprehend affects of formants on each other?)
497 with these in place ALL_PARALLEL should sound as close as
498 possible to CASCADE_PARALLEL.
499 Possible problem feeding in Holmes's amplitudes given this.
500 */
501 amp_parF1 = DBtoLIN(mFrame.mFormant1Ampdb) * 0.4f; /* -7.96 dB */
502 amp_parF2 = DBtoLIN(mFrame.mFormant2Ampdb) * 0.15f; /* -16.5 dB */
503 amp_parF3 = DBtoLIN(mFrame.mFormant3Ampdb) * 0.06f; /* -24.4 dB */
504 amp_parF4 = DBtoLIN(mFrame.mFormant4Ampdb) * 0.04f; /* -28.0 dB */
505 amp_parF5 = DBtoLIN(mFrame.mFormant5Ampdb) * 0.022f; /* -33.2 dB */
506 amp_parF6 = DBtoLIN(mFrame.mFormant6Ampdb) * 0.03f; /* -30.5 dB */
507 amp_parFN = DBtoLIN(mFrame.mParallelNasalPoleAmpdb) * 0.6f; /* -4.44 dB */
508 mAmpBypas = DBtoLIN(mFrame.mBypassFricationAmpdb) * 0.05f; /* -26.0 db */
509
510 // Set coeficients of nasal resonator and zero antiresonator
511 mNasalPole.initResonator(mFrame.mNasalPoleFreq, mFrame.mNasalPoleBandwidth, mSampleRate);
512
513 mNasalZero.initAntiresonator(mFrame.mNasalZeroFreq, mFrame.mNasalZeroBandwidth, mSampleRate);
514
515 // Set coefficients of parallel resonators, and amplitude of outputs
516 mParallelFormant1.initResonator(mFrame.mFormant1Freq, mFrame.mFormant1ParallelBandwidth, mSampleRate);
517 mParallelFormant1.setGain(amp_parF1);
518
519 mParallelResoNasalPole.initResonator(mFrame.mNasalPoleFreq, mFrame.mNasalPoleBandwidth, mSampleRate);
520 mParallelResoNasalPole.setGain(amp_parFN);
521
522 mParallelFormant2.initResonator(mFrame.mFormant2Freq, mFrame.mFormant2ParallelBandwidth, mSampleRate);
523 mParallelFormant2.setGain(amp_parF2);
524
525 mParallelFormant3.initResonator(mFrame.mFormant3Freq, mFrame.mFormant3ParallelBandwidth, mSampleRate);
526 mParallelFormant3.setGain(amp_parF3);
527
528 mParallelFormant4.initResonator(mFrame.mFormant4Freq, mFrame.mFormant4ParallelBandwidth, mSampleRate);
529 mParallelFormant4.setGain(amp_parF4);
530
531 mParallelFormant5.initResonator(mFrame.mFormant5Freq, mFrame.mFormant5ParallelBandwidth, mSampleRate);
532 mParallelFormant5.setGain(amp_parF5);
533
534 mParallelFormant6.initResonator(mFrame.mFormant6Freq, mFrame.mFormant6ParallelBandwidth, mSampleRate);
535 mParallelFormant6.setGain(amp_parF6);
536
537
538 /* fold overall gain into output resonator */
539 mOverallGaindb = mFrame.mOverallGaindb - 3;
540
541 if (mOverallGaindb <= 0)
542 mOverallGaindb = 57;
543
544 /* output ELM_FEATURE_LOW-pass filter - resonator with freq 0 and BW = globals->mSampleRate
545 Thus 3db point is globals->mSampleRate/2 i.e. Nyquist limit.
546 Only 3db down seems rather mild...
547 */
548 mOutputLowPassFilter.initResonator(0L, (int)mSampleRate, mSampleRate);
549 mOutputLowPassFilter.setGain(DBtoLIN(mOverallGaindb));
550 }
551
/*
  function PARWAV

  Convert one frame of parameter data (mFrame) into a chunk of waveform:
  synthesizes mNspFr samples and stores them consecutively in jwave[].
  The caller must provide room for at least mNspFr samples.
*/

void klatt::parwave(short int *jwave)
{
    /* Load the current frame's parameters into the synthesizer */

    frame_init();

    if (mF0Flutter != 0)
    {
        mTimeCount++; /* used for f0 flutter */
        flutter(); /* add f0 flutter */
    }

    /* MAIN LOOP, for each output sample of current frame: */

    int ns;
    for (ns = 0; ns < mNspFr; ns++)
    {
        float noise;
        int n4;
        float sourc; /* Sound source if all-parallel config used */
        float glotout; /* Output of glottal sound source */
        float par_glotout; /* Output of parallel glottal sound source */
        float voice = 0; /* Current sample of voicing waveform */
        float frics; /* Frication sound source */
        float aspiration; /* Aspiration sound source */
        int nrand; /* Variable used by random number generator */

        /* Our own code like rand(), but portable:
           a linear congruential generator whose state is masked to 32 bits.
        */
        mSeed = mSeed * 1664525 + 1;

        mSeed &= 0xFFFFFFFF;

        /* Shift top bits of seed up to top of int then back down to LS 14 bits */
        /* Assumes 8 bits per sizeof unit i.e. a "byte" */
        /* NOTE(review): relies on signed shifts behaving arithmetically */
        nrand = (((int) mSeed) << (8 * sizeof(int) - 32)) >> (8 * sizeof(int) - 14);

        /* Tilt down noise spectrum by soft low-pass filter having
         * a pole near the origin in the z-plane, i.e.
         * output = input + (0.75 * lastoutput) */

        noise = nrand + (0.75f * mNLast); /* Function of samp_rate ? */

        mNLast = noise;

        /* Amplitude modulate noise (reduce noise amplitude during
           second half of glottal period) if voicing simultaneously present
        */

        if (mNPer > mNMod)
        {
            noise *= 0.5f;
        }

        /* Compute frication noise */
        sourc = frics = mAmpFrica * noise;

        /* Compute voicing waveform : (run glottal source simulation at
           4 times normal sample rate to minimize quantization noise in
           period of female voice)
        */

        for (n4 = 0; n4 < 4; n4++)
        {
            /* use a more-natural-shaped source waveform with excitation
               occurring both upon opening and upon closure, strongest at closure */
            voice = natural_source(mNPer);

            /* Reset period when counter 'mNPer' reaches mT0 */

            if (mNPer >= mT0)
            {
                mNPer = 0;
                pitch_synch_par_reset(ns);
            }

            /* Low-pass filter voicing waveform before downsampling from 4*mSampleRate */
            /* to mSampleRate samples/sec. Resonator f=.09*mSampleRate, bw=.06*mSampleRate */

            voice = mDownSampLowPassFilter.resonate(voice); /* in=voice, out=voice */

            /* Increment counter that keeps track of 4*mSampleRate samples/sec */
            mNPer++;
        }

        /* Tilt spectrum of voicing source down by soft low-pass filtering, amount
           of tilt determined by mVoicingSpectralTiltdb
        */
        voice = (voice * mOneMd) + (mVLast * mDecay);

        mVLast = voice;

        /* Add breathiness during glottal open phase */
        if (mNPer < mNOpen)
        {
            /* Amount of breathiness determined by parameter mVoicingBreathiness */
            /* Use nrand rather than noise because noise is low-passed */
            voice += mAmpBreth * nrand;
        }

        /* Set amplitude of voicing */
        glotout = mAmpVoice * voice;

        /* Compute aspiration amplitude and add to voicing source */
        aspiration = mAmpAspir * noise;

        glotout += aspiration;

        par_glotout = glotout;

        /* NIS - rsynth "hack"
           As Holmes' scheme is weak at nasals and (physically) nasal cavity
           is "back near glottis" feed glottal source through nasal resonators
           Don't think this is quite right, but improves things a bit
        */
        par_glotout = mNasalZero.antiresonate(par_glotout);
        par_glotout = mNasalPole.resonate(par_glotout);
        /* And just use mParallelFormant1 NOT mParallelResoNasalPole */
        float out = mParallelFormant1.resonate(par_glotout);
        /* Sound source for other parallel resonators is frication
           plus first difference of voicing waveform.
        */
        sourc += (par_glotout - mGlotLast);
        mGlotLast = par_glotout;

        /* Standard parallel vocal tract
           Formants F6,F5,F4,F3,F2, outputs added with alternating sign
        */
        out = mParallelFormant6.resonate(sourc) - out;
        out = mParallelFormant5.resonate(sourc) - out;
        out = mParallelFormant4.resonate(sourc) - out;
        out = mParallelFormant3.resonate(sourc) - out;
        out = mParallelFormant2.resonate(sourc) - out;

        /* Add the bypass path, then apply the output low-pass filter and gain */
        out = mAmpBypas * sourc - out;
        out = mOutputLowPassFilter.resonate(out);

        *jwave++ = clip(out); /* Convert back to integer */
    }
}
705
706
707
708 static char * phoneme_to_element_lookup(char *s, void ** data)
709 {
710 int key8 = *s;
711 int key16 = key8 + (s[1] << 8);
712 if (s[1] == 0) key16 = -1; // avoid key8==key16
713 int i;
714 for (i = 0; i < PHONEME_COUNT; i++)
715 {
716 if (phoneme_to_elements[i].mKey == key16)
717 {
718 *data = &phoneme_to_elements[i].mData;
719 return s+2;
720 }
721 if (phoneme_to_elements[i].mKey == key8)
722 {
723 *data = &phoneme_to_elements[i].mData;
724 return s+1;
725 }
726 }
727 // should never happen
728 *data = NULL;
729 return s+1;
730 }
731
732
733
734 int klatt::phone_to_elm(char *aPhoneme, int aCount, darray *aElement)
735 {
736 int stress = 0;
737 char *s = aPhoneme;
738 int t = 0;
739 char *limit = s + aCount;
740
741 while (s < limit && *s)
742 {
743 char *e = NULL;
744 s = phoneme_to_element_lookup(s, (void**)&e);
745
746 if (e)
747 {
748 int n = *e++;
749
750 while (n-- > 0)
751 {
752 int x = *e++;
753 Element * p = &gElement[x];
754 /* This works because only vowels have mUD != mDU,
755 and we set stress just before a vowel
756 */
757 aElement->put(x);
758
759 if (!(p->mFeat & ELM_FEATURE_VWL))
760 stress = 0;
761
762 int stressdur = StressDur(p,stress);
763
764 t += stressdur;
765
766 aElement->put(stressdur);
767 aElement->put(stress);
768 }
769 }
770
771 else
772 {
773 char ch = *s++;
774
775 switch (ch)
776 {
777
778 case '\'': /* Primary stress */
779 stress = 3;
780 break;
781
782 case ',': /* Secondary stress */
783 stress = 2;
784 break;
785
786 case '+': /* Tertiary stress */
787 stress = 1;
788 break;
789
790 case '-': /* hyphen in input */
791 break;
792
793 default:
794 // fprintf(stderr, "Ignoring %c in '%.*s'\n", ch, aCount, aPhoneme);
795 break;
796 }
797 }
798 }
799
800 return t;
801 }
802
803
804
805 /* 'a' is dominant element, 'b' is dominated
806 ext is flag to say to use external times from 'a' rather
807 than internal i.e. ext != 0 if 'a' is NOT current element.
808 */
809
810 static void set_trans(Slope *t, Element * a, Element * b,int ext, int /* e */)
811 {
812 int i;
813
814 for (i = 0; i < ELM_COUNT; i++)
815 {
816 t[i].mTime = ((ext) ? a->mInterpolator[i].mExtDelay : a->mInterpolator[i].mIntDelay);
817
818 if (t[i].mTime)
819 {
820 t[i].mValue = a->mInterpolator[i].mFixed + (a->mInterpolator[i].mProportion * b->mInterpolator[i].mSteady) * 0.01f; // mProportion is in scale 0..100, so *0.01.
821 }
822 else
823 {
824 t[i].mValue = b->mInterpolator[i].mSteady;
825 }
826 }
827 }
828
/* Linear interpolation from a (at t <= 0) to b (at t >= d).
   Outside the open interval (0, d) the nearest end value is returned,
   with a taking precedence when t <= 0. */
static float lerp(float a, float b, int t, int d)
{
    if (t > 0 && t < d)
    {
        const float fraction = (float)t / (float)d;
        return a + (b - a) * fraction;
    }

    return (t <= 0) ? a : b;
}
844
845 static float interpolate(Slope *aStartSlope, Slope *aEndSlope, float aMidValue, int aTime, int aDuration)
846 {
847 int steadyTime = aDuration - (aStartSlope->mTime + aEndSlope->mTime);
848
849 if (steadyTime >= 0)
850 {
851 // Interpolate to a midpoint, stay there for a while, then interpolate to end
852
853 if (aTime < aStartSlope->mTime)
854 {
855 // interpolate to the first value
856 return lerp(aStartSlope->mValue, aMidValue, aTime, aStartSlope->mTime);
857 }
858 // reached midpoint
859
860 aTime -= aStartSlope->mTime;
861
862 if (aTime <= steadyTime)
863 {
864 // still at steady state
865 return aMidValue;
866 }
867
868 // interpolate to the end
869 return lerp(aMidValue, aEndSlope->mValue, aTime - steadyTime, aEndSlope->mTime);
870 }
871 else
872 {
873 // No steady state
874 float f = 1.0f - ((float) aTime / (float) aDuration);
875 float sp = lerp(aStartSlope->mValue, aMidValue, aTime, aStartSlope->mTime);
876 float ep = lerp(aEndSlope->mValue, aMidValue, aDuration - aTime, aEndSlope->mTime);
877 return f * sp + ((float) 1.0 - f) * ep;
878 }
879 }
880
881
882
883 void klatt::initsynth(int aElementCount,unsigned char *aElement)
884 {
885 mElement = aElement;
886 mElementCount = aElementCount;
887 mElementIndex = 0;
888 mLastElement = &gElement[0];
889 mSeed = 5;
890 mTStress = 0;
891 mNTStress = 0;
892 mFrame.mF0FundamentalFreq = mBaseF0;
893 mTop = 1.1f * mFrame.mF0FundamentalFreq;
894 mFrame.mNasalPoleFreq = (int)mLastElement->mInterpolator[ELM_FN].mSteady;
895 mFrame.mFormant1ParallelBandwidth = mFrame.mFormant1Bandwidth = 60;
896 mFrame.mFormant2ParallelBandwidth = mFrame.mFormant2Bandwidth = 90;
897 mFrame.mFormant3ParallelBandwidth = mFrame.mFormant3Bandwidth = 150;
898 // mFrame.mFormant4ParallelBandwidth = (default)
899
900 // Set stress attack/decay slope
901 mStressS.mTime = 40;
902 mStressE.mTime = 40;
903 mStressE.mValue = 0.0;
904 }
905
906 int klatt::synth(int /* aSampleCount */, short *aSamplePointer)
907 {
908 short *samp = aSamplePointer;
909
910 if (mElementIndex >= mElementCount)
911 return -1;
912
913 Element * currentElement = &gElement[mElement[mElementIndex++]];
914 int dur = mElement[mElementIndex++];
915 mElementIndex++; // skip stress
916
917 if (currentElement->mRK == 31) // "END"
918 {
919 // Reset the fundamental frequency top
920 mFrame.mF0FundamentalFreq = mBaseF0;
921 mTop = 1.1f * mFrame.mF0FundamentalFreq;
922 }
923
924 // Skip zero length elements which are only there to affect
925 // boundary values of adjacent elements
926
927 if (dur > 0)
928 {
929 Element * ne = (mElementIndex < mElementCount) ? &gElement[mElement[mElementIndex]] : &gElement[0];
930 Slope start[ELM_COUNT];
931 Slope end[ELM_COUNT];
932 int t;
933
934 if (currentElement->mRK > mLastElement->mRK)
935 {
936 set_trans(start, currentElement, mLastElement, 0, 's');
937 // we dominate last
938 }
939 else
940 {
941 set_trans(start, mLastElement, currentElement, 1, 's');
942 // last dominates us
943 }
944
945 if (ne->mRK > currentElement->mRK)
946 {
947 set_trans(end, ne, currentElement, 1, 'e');
948 // next dominates us
949 }
950 else
951 {
952 set_trans(end, currentElement, ne, 0, 'e');
953 // we dominate next
954 }
955
956 for (t = 0; t < dur; t++, mTStress++)
957 {
958 float base = mTop * 0.8f; // 3 * top / 5
959 float tp[ELM_COUNT];
960
961 if (mTStress == mNTStress)
962 {
963 int j = mElementIndex;
964 mStressS = mStressE;
965 mTStress = 0;
966 mNTStress = dur;
967
968 while (j <= mElementCount)
969 {
970 Element * e = (j < mElementCount) ? &gElement[mElement[j++]] : &gElement[0];
971 int du = (j < mElementCount) ? mElement[j++] : 0;
972 int s = (j < mElementCount) ? mElement[j++] : 3;
973
974 if (s || e->mFeat & ELM_FEATURE_VWL)
975 {
976 int d = 0;
977
978 if (s)
979 mStressE.mValue = (float) s / 3;
980 else
981 mStressE.mValue = (float) 0.1;
982
983 do
984 {
985 d += du;
986 e = (j < mElementCount) ? &gElement[mElement[j++]] : &gElement[0];
987 du = mElement[j++];
988 }
989
990 while ((e->mFeat & ELM_FEATURE_VWL) && mElement[j++] == s);
991
992 mNTStress += d / 2;
993
994 break;
995 }
996
997 mNTStress += du;
998 }
999 }
1000
1001 int j;
1002 for (j = 0; j < ELM_COUNT; j++)
1003 {
1004 tp[j] = interpolate(&start[j], &end[j], (float) currentElement->mInterpolator[j].mSteady, t, dur);
1005 }
1006
1007 // Now call the synth for each frame
1008
1009 mFrame.mF0FundamentalFreq = (int)(base + (mTop - base) * interpolate(&mStressS, &mStressE, (float)0, mTStress, mNTStress));
1010 mFrame.mVoicingAmpdb = mFrame.mPalallelVoicingAmpdb = (int)tp[ELM_AV];
1011 mFrame.mFricationAmpdb = (int)tp[ELM_AF];
1012 mFrame.mNasalZeroFreq = (int)tp[ELM_FN];
1013 mFrame.mAspirationAmpdb = (int)tp[ELM_ASP];
1014 mFrame.mVoicingBreathiness = (int)tp[ELM_AVC];
1015 mFrame.mFormant1ParallelBandwidth = mFrame.mFormant1Bandwidth = (int)tp[ELM_B1];
1016 mFrame.mFormant2ParallelBandwidth = mFrame.mFormant2Bandwidth = (int)tp[ELM_B2];
1017 mFrame.mFormant3ParallelBandwidth = mFrame.mFormant3Bandwidth = (int)tp[ELM_B3];
1018 mFrame.mFormant1Freq = (int)tp[ELM_F1];
1019 mFrame.mFormant2Freq = (int)tp[ELM_F2];
1020 mFrame.mFormant3Freq = (int)tp[ELM_F3];
1021
1022 // AMP_ADJ + is a kludge to get amplitudes up to klatt-compatible levels
1023
1024
1025 //pars.mParallelNasalPoleAmpdb = AMP_ADJ + tp[ELM_AN];
1026
1027 mFrame.mBypassFricationAmpdb = AMP_ADJ + (int)tp[ELM_AB];
1028 mFrame.mFormant5Ampdb = AMP_ADJ + (int)tp[ELM_A5];
1029 mFrame.mFormant6Ampdb = AMP_ADJ + (int)tp[ELM_A6];
1030 mFrame.mFormant1Ampdb = AMP_ADJ + (int)tp[ELM_A1];
1031 mFrame.mFormant2Ampdb = AMP_ADJ + (int)tp[ELM_A2];
1032 mFrame.mFormant3Ampdb = AMP_ADJ + (int)tp[ELM_A3];
1033 mFrame.mFormant4Ampdb = AMP_ADJ + (int)tp[ELM_A4];
1034
1035 parwave(samp);
1036
1037 samp += mNspFr;
1038
1039 // Declination of f0 envelope 0.25Hz / cS
1040 mTop -= mBaseDeclination;// 0.5;
1041 }
1042 }
1043
1044 mLastElement = currentElement;
1045
1046 return (int)(samp - aSamplePointer);
1047 }
1048
1049
1050 void klatt::init(int aBaseFrequency, float aBaseSpeed, float aBaseDeclination, int aBaseWaveform)
1051 {
1052 mBaseF0 = aBaseFrequency;
1053 mBaseSpeed = aBaseSpeed;
1054 mBaseDeclination = aBaseDeclination;
1055 mBaseWaveform = aBaseWaveform;
1056
1057 mSampleRate = 11025;
1058 mF0Flutter = 0;
1059 mF0FundamentalFreq = mBaseF0;
1060 mFrame.mF0FundamentalFreq = mBaseF0;
1061
1062 int FLPhz = (950 * mSampleRate) / 10000;
1063 int BLPhz = (630 * mSampleRate) / 10000;
1064 mNspFr = (int)(mSampleRate * mBaseSpeed) / 1000;
1065
1066 mDownSampLowPassFilter.initResonator(FLPhz, BLPhz, mSampleRate);
1067
1068 mNPer = 0; /* LG */
1069 mT0 = 0; /* LG */
1070
1071 mVLast = 0; /* Previous output of voice */
1072 mNLast = 0; /* Previous output of random number generator */
1073 mGlotLast = 0; /* Previous value of glotout */
1074 }