third_party/soloud_speech/klatt.cpp

   1 #include <math.h>
   2 #include <stdlib.h>
   3 #include "klatt.h"
   4 #include "darray.h"
   5 #include "resonator.h"
   6
   7 #ifndef PI
   8 #define PI 3.1415926535897932384626433832795f
   9 #endif
  10
  11 #ifndef NULL
  12 #define NULL 0
  13 #endif
  14
  15 class Interp
  16 {
  17 public:
  18         float mSteady;
  19         float mFixed;
  20         char  mProportion;
  21         char  mExtDelay;
  22         char  mIntDelay;
  23 };
  24
  25
  26 enum Eparm_e
  27 {
  28   ELM_FN, ELM_F1, ELM_F2, ELM_F3,
  29   ELM_B1, ELM_B2, ELM_B3, ELM_AN,
  30   ELM_A1, ELM_A2, ELM_A3, ELM_A4,
  31   ELM_A5, ELM_A6, ELM_AB, ELM_AV,
  32   ELM_AVC, ELM_ASP, ELM_AF,
  33   ELM_COUNT
  34 };
  35
  36 class Element
  37 {
  38 public:
  39           const char *mName; // unused
  40           const char mRK;
  41           const char mDU;
  42           const char mUD;
  43           unsigned char mFont; // unused
  44           const char  *mDict; // unused
  45           const char  *mIpa; // unused
  46           int   mFeat; // only ELM_FEATURE_VWL
  47           Interp mInterpolator[ELM_COUNT];
  48  };
  49
  50 enum ELEMENT_FEATURES
  51 {
  52         ELM_FEATURE_ALV = 0x00000001,
  53         ELM_FEATURE_APR = 0x00000002,
  54         ELM_FEATURE_BCK = 0x00000004,
  55         ELM_FEATURE_BLB = 0x00000008,
  56         ELM_FEATURE_CNT = 0x00000010,
  57         ELM_FEATURE_DNT = 0x00000020,
  58         ELM_FEATURE_FNT = 0x00000040,
  59         ELM_FEATURE_FRC = 0x00000080,
  60         ELM_FEATURE_GLT = 0x00000100,
  61         ELM_FEATURE_HGH = 0x00000200,
  62         ELM_FEATURE_LAT = 0x00000400,
  63         ELM_FEATURE_LBD = 0x00000800,
  64         ELM_FEATURE_LBV = 0x00001000,
  65         ELM_FEATURE_LMD = 0x00002000,
  66         ELM_FEATURE_LOW = 0x00004000,
  67         ELM_FEATURE_MDL = 0x00008000,
  68         ELM_FEATURE_NAS = 0x00010000,
  69         ELM_FEATURE_PAL = 0x00020000,
  70         ELM_FEATURE_PLA = 0x00040000,
  71         ELM_FEATURE_RND = 0x00080000,
  72         ELM_FEATURE_RZD = 0x00100000,
  73         ELM_FEATURE_SMH = 0x00200000,
  74         ELM_FEATURE_STP = 0x00400000,
  75         ELM_FEATURE_UMD = 0x00800000,
  76         ELM_FEATURE_UNR = 0x01000000,
  77         ELM_FEATURE_VCD = 0x02000000,
  78         ELM_FEATURE_VEL = 0x04000000,
  79         ELM_FEATURE_VLS = 0x08000000,
  80         ELM_FEATURE_VWL = 0x10000000
  81 };
  82
  83 enum ELEMENTS
  84 {
  85         ELM_END = 0,
  86         ELM_Q,  ELM_P,  ELM_PY, ELM_PZ, ELM_T,  ELM_TY,
  87         ELM_TZ, ELM_K,  ELM_KY, ELM_KZ, ELM_B,  ELM_BY, ELM_BZ,
  88         ELM_D,  ELM_DY, ELM_DZ, ELM_G,  ELM_GY, ELM_GZ, ELM_M,
  89         ELM_N,  ELM_NG, ELM_F,  ELM_TH, ELM_S,  ELM_SH, ELM_X,
  90         ELM_H,  ELM_V,  ELM_QQ, ELM_DH, ELM_DI, ELM_Z,  ELM_ZZ,
  91         ELM_ZH, ELM_CH, ELM_CI, ELM_J,  ELM_JY, ELM_L,  ELM_LL,
  92         ELM_RX, ELM_R,  ELM_W,  ELM_Y,  ELM_I,  ELM_E,  ELM_AA,
  93         ELM_U,  ELM_O,  ELM_OO, ELM_A,  ELM_EE, ELM_ER, ELM_AR,
  94         ELM_AW, ELM_UU, ELM_AI, ELM_IE, ELM_OI, ELM_OU, ELM_OV,
  95         ELM_OA, ELM_IA, ELM_IB, ELM_AIR,ELM_OOR,ELM_OR
  96 };
  97
  98 #define PHONEME_COUNT 53
  99 #define AMP_ADJ 14
 100 #define StressDur(e,s) (s,((e->mDU + e->mUD)/2))
 101
 102
 103
 104
 105 class PhonemeToElements
 106 {
 107 public:
 108         int mKey;
 109         char mData[8];
 110 };
 111
 112 /* Order is important - 2 byte phonemes first, otherwise
 113    the search function will fail*/
 114 static PhonemeToElements phoneme_to_elements[PHONEME_COUNT] =
 115 {
 116         /* mKey, count, 0-7 elements */
 117 /* tS */ 0x5374, 2, ELM_CH, ELM_CI, 0, 0, 0, 0, 0,
 118 /* dZ */ 0x5a64, 4, ELM_J, ELM_JY, ELM_QQ, ELM_JY, 0, 0, 0,
 119 /* rr */ 0x7272, 3, ELM_R, ELM_QQ, ELM_R, 0, 0, 0, 0,
 120 /* eI */ 0x4965, 2, ELM_AI, ELM_I, 0, 0, 0, 0, 0,
 121 /* aI */ 0x4961, 2, ELM_IE, ELM_I, 0, 0, 0, 0, 0,
 122 /* oI */ 0x496f, 2, ELM_OI, ELM_I, 0, 0, 0, 0, 0,
 123 /* aU */ 0x5561, 2, ELM_OU, ELM_OV, 0, 0, 0, 0, 0,
 124 /* @U */ 0x5540, 2, ELM_OA, ELM_OV, 0, 0, 0, 0, 0,
 125 /* I@ */ 0x4049, 2, ELM_IA, ELM_IB, 0, 0, 0, 0, 0,
 126 /* e@ */ 0x4065, 2, ELM_AIR, ELM_IB, 0, 0, 0, 0, 0,
 127 /* U@ */ 0x4055, 2, ELM_OOR, ELM_IB, 0, 0, 0, 0, 0,
 128 /* O@ */ 0x404f, 2, ELM_OR, ELM_IB, 0, 0, 0, 0, 0,
 129 /* oU */ 0x556f, 2, ELM_OI, ELM_OV, 0, 0, 0, 0, 0,
 130 /*    */ 0x0020, 1, ELM_Q, 0, 0, 0, 0, 0, 0,
 131 /* p  */ 0x0070, 3, ELM_P, ELM_PY, ELM_PZ, 0, 0, 0, 0,
 132 /* t  */ 0x0074, 3, ELM_T, ELM_TY, ELM_TZ, 0, 0, 0, 0,
 133 /* k  */ 0x006b, 3, ELM_K, ELM_KY, ELM_KZ, 0, 0, 0, 0,
 134 /* b  */ 0x0062, 3, ELM_B, ELM_BY, ELM_BZ, 0, 0, 0, 0,
 135 /* d  */ 0x0064, 3, ELM_D, ELM_DY, ELM_DZ, 0, 0, 0, 0,
 136 /* g  */ 0x0067, 3, ELM_G, ELM_GY, ELM_GZ, 0, 0, 0, 0,
 137 /* m  */ 0x006d, 1, ELM_M, 0, 0, 0, 0, 0, 0,
 138 /* n  */ 0x006e, 1, ELM_N, 0, 0, 0, 0, 0, 0,
 139 /* N  */ 0x004e, 1, ELM_NG, 0, 0, 0, 0, 0, 0,
 140 /* f  */ 0x0066, 1, ELM_F, 0, 0, 0, 0, 0, 0,
 141 /* T  */ 0x0054, 1, ELM_TH, 0, 0, 0, 0, 0, 0,
 142 /* s  */ 0x0073, 1, ELM_S, 0, 0, 0, 0, 0, 0,
 143 /* S  */ 0x0053, 1, ELM_SH, 0, 0, 0, 0, 0, 0,
 144 /* h  */ 0x0068, 1, ELM_H, 0, 0, 0, 0, 0, 0,
 145 /* v  */ 0x0076, 3, ELM_V, ELM_QQ, ELM_V, 0, 0, 0, 0,
 146 /* D  */ 0x0044, 3, ELM_DH, ELM_QQ, ELM_DI, 0, 0, 0, 0,
 147 /* z  */ 0x007a, 3, ELM_Z, ELM_QQ, ELM_ZZ, 0, 0, 0, 0,
 148 /* Z  */ 0x005a, 3, ELM_ZH, ELM_QQ, ELM_ZH, 0, 0, 0, 0,
 149 /* l  */ 0x006c, 1, ELM_L, 0, 0, 0, 0, 0, 0,
 150 /* r  */ 0x0072, 1, ELM_R, 0, 0, 0, 0, 0, 0,
 151 /* R  */ 0x0052, 1, ELM_RX, 0, 0, 0, 0, 0, 0,
 152 /* w  */ 0x0077, 1, ELM_W, 0, 0, 0, 0, 0, 0,
 153 /* x  */ 0x0078, 1, ELM_X, 0, 0, 0, 0, 0, 0,
 154 /* %  */ 0x0025, 1, ELM_QQ, 0, 0, 0, 0, 0, 0,
 155 /* j  */ 0x006a, 1, ELM_Y, 0, 0, 0, 0, 0, 0,
 156 /* I  */ 0x0049, 1, ELM_I, 0, 0, 0, 0, 0, 0,
 157 /* e  */ 0x0065, 1, ELM_E, 0, 0, 0, 0, 0, 0,
 158 /* &  */ 0x0026, 1, ELM_AA, 0, 0, 0, 0, 0, 0,
 159 /* V  */ 0x0056, 1, ELM_U, 0, 0, 0, 0, 0, 0,
 160 /* 0  */ 0x0030, 1, ELM_O, 0, 0, 0, 0, 0, 0,
 161 /* U  */ 0x0055, 1, ELM_OO, 0, 0, 0, 0, 0, 0,
 162 /* @  */ 0x0040, 1, ELM_A, 0, 0, 0, 0, 0, 0,
 163 /* i  */ 0x0069, 1, ELM_EE, 0, 0, 0, 0, 0, 0,
 164 /* 3  */ 0x0033, 1, ELM_ER, 0, 0, 0, 0, 0, 0,
 165 /* A  */ 0x0041, 1, ELM_AR, 0, 0, 0, 0, 0, 0,
 166 /* O  */ 0x004f, 1, ELM_AW, 0, 0, 0, 0, 0, 0,
 167 /* u  */ 0x0075, 1, ELM_UU, 0, 0, 0, 0, 0, 0,
 168 /* o  */ 0x006f, 1, ELM_OI, 0, 0, 0, 0, 0, 0,
 169 /* .  */ 0x002e, 1, ELM_END,0, 0, 0, 0, 0, 0,
 170 };
 171
 172 static Element gElement[] =
 173 {
 174 #include "Elements.def"
 175 };
 176
 177 static short clip(float input)
 178 {
 179         int temp = (int)input;
 180         /* clip on boundaries of 16-bit word */
 181
 182         if (temp < -32767)
 183         {
 184                 //assert?
 185                 temp = -32767;
 186         }
 187         else
 188         if (temp > 32767)
 189         {
 190                 //assert?
 191                 temp = 32767;
 192         }
 193
 194         return (short)(temp);
 195 }
 196
 197 /* Convert from decibels to a linear scale factor */
 198 static float DBtoLIN(int dB)
 199 {
 200         /*
 201         * Convertion table, db to linear, 87 dB --> 32767
 202         *                                 86 dB --> 29491 (1 dB down = 0.5**1/6)
 203         *                                 ...
 204         *                                 81 dB --> 16384 (6 dB down = 0.5)
 205         *                                 ...
 206         *                                  0 dB -->     0
 207         *
 208         * The just noticeable difference for a change in intensity of a vowel
 209         *   is approximately 1 dB.  Thus all amplitudes are quantized to 1 dB
 210         *   steps.
 211         */
 212
 213         static const float amptable[88] =
 214         {
 215                 0.0, 0.0, 0.0, 0.0, 0.0,
 216                 0.0, 0.0, 0.0, 0.0, 0.0,
 217                 0.0, 0.0, 0.0, 6.0, 7.0,
 218                 8.0, 9.0, 10.0, 11.0, 13.0,
 219                 14.0, 16.0, 18.0, 20.0, 22.0,
 220                 25.0, 28.0, 32.0, 35.0, 40.0,
 221                 45.0, 51.0, 57.0, 64.0, 71.0,
 222                 80.0, 90.0, 101.0, 114.0, 128.0,
 223                 142.0, 159.0, 179.0, 202.0, 227.0,
 224                 256.0, 284.0, 318.0, 359.0, 405.0,
 225                 455.0, 512.0, 568.0, 638.0, 719.0,
 226                 811.0, 911.0, 1024.0, 1137.0, 1276.0,
 227                 1438.0, 1622.0, 1823.0, 2048.0, 2273.0,
 228                 2552.0, 2875.0, 3244.0, 3645.0, 4096.0,
 229                 4547.0, 5104.0, 5751.0, 6488.0, 7291.0,
 230                 8192.0, 9093.0, 10207.0, 11502.0, 12976.0,
 231                 14582.0, 16384.0, 18350.0, 20644.0, 23429.0,
 232                 26214.0, 29491.0, 32767.0
 233         };
 234
 235         // Check limits or argument (can be removed in final product)
 236         if (dB < 0)
 237         {
 238                 dB = 0;
 239         }
 240         else
 241         if (dB >= 88)
 242         {
 243                 dB = 87;
 244         }
 245
 246         return amptable[dB] * 0.001f;
 247 }
 248
 249
 250
 251 klatt_frame::klatt_frame() :
 252         mF0FundamentalFreq(1330),       mVoicingAmpdb(60),                              mFormant1Freq(500),
 253         mFormant1Bandwidth(60),         mFormant2Freq(1500),                    mFormant2Bandwidth(90),
 254         mFormant3Freq(2800),            mFormant3Bandwidth(150),                mFormant4Freq(3250),
 255         mFormant4Bandwidth(200),        mFormant5Freq(3700),                    mFormant5Bandwidth(200),
 256         mFormant6Freq(4990),            mFormant6Bandwidth(500),                mNasalZeroFreq(270),
 257         mNasalZeroBandwidth(100),       mNasalPoleFreq(270),                    mNasalPoleBandwidth(100),
 258         mAspirationAmpdb(0),            mNoSamplesInOpenPeriod(30),             mVoicingBreathiness(0),
 259         mVoicingSpectralTiltdb(10), mFricationAmpdb(0),                         mSkewnessOfAlternatePeriods(0),
 260         mFormant1Ampdb(0),                      mFormant1ParallelBandwidth(80), mFormant2Ampdb(0),
 261         mFormant2ParallelBandwidth(200), mFormant3Ampdb(0),                     mFormant3ParallelBandwidth(350),
 262         mFormant4Ampdb(0),                      mFormant4ParallelBandwidth(500), mFormant5Ampdb(0),
 263         mFormant5ParallelBandwidth(600), mFormant6Ampdb(0),                     mFormant6ParallelBandwidth(800),
 264         mParallelNasalPoleAmpdb(0), mBypassFricationAmpdb(0),       mPalallelVoicingAmpdb(0),
 265         mOverallGaindb(62)
 266 {
 267 };
 268
 269
 270 klatt::klatt() :
 271         mBaseF0(1330),
 272         mBaseSpeed(10.0f),
 273         mBaseDeclination(0.5f),
 274         mBaseWaveform(KW_SAW),
 275         mF0Flutter(0),
 276         mSampleRate(0),
 277         mNspFr(0),
 278         mF0FundamentalFreq(0),
 279         mVoicingAmpdb(0),
 280         mSkewnessOfAlternatePeriods(0),
 281         mTimeCount(0),
 282         mNPer(0),
 283         mT0(0),
 284         mNOpen(0),
 285         mNMod(0),
 286         mAmpVoice(0),
 287         mAmpBypas(0),
 288         mAmpAspir(0),
 289         mAmpFrica(0),
 290         mAmpBreth(0),
 291         mSkew(0),
 292         mVLast(0),
 293         mNLast(0),
 294         mGlotLast(0),
 295         mDecay(0),
 296         mOneMd(0),
 297         mSeed(5),
 298         mElementCount(0),
 299         mElement(0),
 300         mElementIndex(0),
 301         mLastElement(0),
 302         mTStress(0),
 303         mNTStress(0),
 304         mTop(0)
 305 {
 306 }
 307
 308 /*
 309 function FLUTTER
 310
 311 This function adds F0 flutter, as specified in:
 312
 313 "Analysis, synthesis and perception of voice quality variations among
 314 female and male talkers" D.H. Klatt and L.C. Klatt JASA 87(2) February 1990.
 315 Flutter is added by applying a quasi-random element constructed from three
 316 slowly varying sine waves.
 317 */
 318 void klatt::flutter()
 319 {
 320         int original_f0 = mFrame.mF0FundamentalFreq / 10;
 321         float fla = (float) mF0Flutter / 50;
 322         float flb = (float) original_f0 / 100;
 323         float flc = (float)sin(2 * PI * 12.7 * mTimeCount);
 324         float fld = (float)sin(2 * PI * 7.1 * mTimeCount);
 325         float fle = (float)sin(2 * PI * 4.7 * mTimeCount);
 326         float delta_f0 = fla * flb * (flc + fld + fle) * 10;
 327         mF0FundamentalFreq += (int) delta_f0;
 328 }
 329
 330 /* Vwave is the differentiated glottal flow waveform, there is a weak
 331 spectral zero around 800 Hz, magic constants a,b reset pitch-synch
 332 */
 333
 334 float klatt::natural_source(int aNper)
 335 {
 336         // See if glottis open
 337         if (aNper < mNOpen)
 338         {
 339                 switch (mBaseWaveform)
 340                 {
 341                 case KW_TRIANGLE:
 342                         return ((aNper % 200) - 100) * 81.92f; // triangle
 343                 case KW_SIN:
 344                         return (float)(sin(aNper * 0.0314) * 8192); // sin
 345                 case KW_SQUARE:
 346                         return ((aNper % 200) - 100) > 0 ? 8192.0f : -8192.0f; // square
 347                 case KW_PULSE:
 348                         return ((aNper % 200) - 100) > 50 ? 8192.0f : -8192.0f; // pulse
 349                 case KW_NOISE:
 350                         return (int)mNLast & 1 ? -8192.0f : 8192.0f;
 351                 case KW_WARBLE:
 352                                 return (int)mNLast & 7 ? -8192.0f : 8192.0f;
 353                 case KW_SAW: // fallthrough
 354                 default:
 355                         return (abs((aNper % 200) - 100) - 50) * 163.84f; // saw
 356                 }
 357         }
 358         else
 359         {
 360                 // Glottis closed
 361                 return (0.0);
 362         }
 363
 364 }
 365
 366 /* Reset selected parameters pitch-synchronously */
 367
 368 void klatt::pitch_synch_par_reset(int ns)
 369 {
 370         if (mF0FundamentalFreq > 0)
 371         {
 372                 mT0 = (40 * mSampleRate) / mF0FundamentalFreq;
 373
 374                 /* Period in samp*4 */
 375                 mAmpVoice = DBtoLIN(mVoicingAmpdb);
 376
 377                 /* Duration of period before amplitude modulation */
 378                 mNMod = mT0;
 379
 380                 if (mVoicingAmpdb > 0)
 381                 {
 382                         mNMod >>= 1;
 383                 }
 384
 385                 /* Breathiness of voicing waveform */
 386
 387                 mAmpBreth = DBtoLIN(mFrame.mVoicingBreathiness) * 0.1f;
 388
 389                 /* Set open phase of glottal period */
 390                 /* where  40 <= open phase <= 263 */
 391
 392                 mNOpen = 4 * mFrame.mNoSamplesInOpenPeriod;
 393
 394                 if (mNOpen >= (mT0 - 1))
 395                 {
 396                         mNOpen = mT0 - 2;
 397                 }
 398
 399                 if (mNOpen < 40)
 400                 {
 401                         mNOpen = 40;                  /* F0 max = 1000 Hz */
 402                 }
 403
 404                 int temp;
 405                 float temp1;
 406
 407                 temp = mSampleRate / mNOpen;
 408                 mCritDampedGlotLowPassFilter.initResonator(0L, temp, mSampleRate);
 409
 410                 /* Make gain at F1 about constant */
 411
 412                 temp1 = mNOpen * .00833f;
 413                 mCritDampedGlotLowPassFilter.setGain(temp1 * temp1);
 414
 415                 /* Truncate skewness so as not to exceed duration of closed phase
 416                 of glottal period */
 417
 418                 temp = mT0 - mNOpen;
 419
 420                 if (mSkewnessOfAlternatePeriods > temp)
 421                 {
 422                         mSkewnessOfAlternatePeriods = temp;
 423                 }
 424
 425                 if (mSkew >= 0)
 426                 {
 427                         mSkew = mSkewnessOfAlternatePeriods;                /* Reset mSkew to requested mSkewnessOfAlternatePeriods */
 428                 }
 429                 else
 430                 {
 431                         mSkew = -mSkewnessOfAlternatePeriods;
 432                 }
 433
 434                 /* Add skewness to closed portion of voicing period */
 435
 436                 mT0 = mT0 + mSkew;
 437                 mSkew = -mSkew;
 438         }
 439         else
 440         {
 441                 mT0 = 4;                        /* Default for f0 undefined */
 442                 mAmpVoice = 0.0;
 443                 mNMod = mT0;
 444                 mAmpBreth = 0.0;
 445         }
 446
 447         /* Reset these pars pitch synchronously or at update rate if f0=0 */
 448
 449         if ((mT0 != 4) || (ns == 0))
 450         {
 451                 /* Set one-pole ELM_FEATURE_LOW-pass filter that tilts glottal source */
 452                 mDecay = (0.033f * mFrame.mVoicingSpectralTiltdb);      /* Function of samp_rate ? */
 453
 454                 if (mDecay > 0.0f)
 455                 {
 456                         mOneMd = 1.0f - mDecay;
 457                 }
 458                 else
 459                 {
 460                         mOneMd = 1.0f;
 461                 }
 462         }
 463 }
 464
 465
 466 /* Get variable parameters from host computer,
 467 initially also get definition of fixed pars
 468 */
 469
 470 void klatt::frame_init()
 471 {
 472         int mOverallGaindb;                       /* Overall gain, 60 dB is unity  0 to   60  */
 473         float amp_parF1;                 /* mFormant1Ampdb converted to linear gain  */
 474         float amp_parFN;                 /* mParallelNasalPoleAmpdb converted to linear gain  */
 475         float amp_parF2;                 /* mFormant2Ampdb converted to linear gain  */
 476         float amp_parF3;                 /* mFormant3Ampdb converted to linear gain  */
 477         float amp_parF4;                 /* mFormant4Ampdb converted to linear gain  */
 478         float amp_parF5;                 /* mFormant5Ampdb converted to linear gain  */
 479         float amp_parF6;                 /* mFormant6Ampdb converted to linear gain  */
 480
 481         /* Read  speech frame definition into temp store
 482        and move some parameters into active use immediately
 483        (voice-excited ones are updated pitch synchronously
 484        to avoid waveform glitches).
 485          */
 486
 487         mF0FundamentalFreq = mFrame.mF0FundamentalFreq;
 488         mVoicingAmpdb = mFrame.mVoicingAmpdb - 7;
 489
 490         if (mVoicingAmpdb < 0) mVoicingAmpdb = 0;
 491
 492         mAmpAspir = DBtoLIN(mFrame.mAspirationAmpdb) * .05f;
 493         mAmpFrica = DBtoLIN(mFrame.mFricationAmpdb) * 0.25f;
 494         mSkewnessOfAlternatePeriods = mFrame.mSkewnessOfAlternatePeriods;
 495
 496         /* Fudge factors (which comprehend affects of formants on each other?)
 497        with these in place ALL_PARALLEL should sound as close as
 498            possible to CASCADE_PARALLEL.
 499            Possible problem feeding in Holmes's amplitudes given this.
 500         */
 501         amp_parF1 = DBtoLIN(mFrame.mFormant1Ampdb) * 0.4f;      /* -7.96 dB */
 502         amp_parF2 = DBtoLIN(mFrame.mFormant2Ampdb) * 0.15f;     /* -16.5 dB */
 503         amp_parF3 = DBtoLIN(mFrame.mFormant3Ampdb) * 0.06f;     /* -24.4 dB */
 504         amp_parF4 = DBtoLIN(mFrame.mFormant4Ampdb) * 0.04f;     /* -28.0 dB */
 505         amp_parF5 = DBtoLIN(mFrame.mFormant5Ampdb) * 0.022f;    /* -33.2 dB */
 506         amp_parF6 = DBtoLIN(mFrame.mFormant6Ampdb) * 0.03f;     /* -30.5 dB */
 507         amp_parFN = DBtoLIN(mFrame.mParallelNasalPoleAmpdb) * 0.6f;     /* -4.44 dB */
 508         mAmpBypas = DBtoLIN(mFrame.mBypassFricationAmpdb) * 0.05f;      /* -26.0 db */
 509
 510         // Set coeficients of nasal resonator and zero antiresonator
 511         mNasalPole.initResonator(mFrame.mNasalPoleFreq, mFrame.mNasalPoleBandwidth, mSampleRate);
 512
 513         mNasalZero.initAntiresonator(mFrame.mNasalZeroFreq, mFrame.mNasalZeroBandwidth, mSampleRate);
 514
 515         // Set coefficients of parallel resonators, and amplitude of outputs
 516         mParallelFormant1.initResonator(mFrame.mFormant1Freq, mFrame.mFormant1ParallelBandwidth, mSampleRate);
 517         mParallelFormant1.setGain(amp_parF1);
 518
 519         mParallelResoNasalPole.initResonator(mFrame.mNasalPoleFreq, mFrame.mNasalPoleBandwidth, mSampleRate);
 520         mParallelResoNasalPole.setGain(amp_parFN);
 521
 522         mParallelFormant2.initResonator(mFrame.mFormant2Freq, mFrame.mFormant2ParallelBandwidth, mSampleRate);
 523         mParallelFormant2.setGain(amp_parF2);
 524
 525         mParallelFormant3.initResonator(mFrame.mFormant3Freq, mFrame.mFormant3ParallelBandwidth, mSampleRate);
 526         mParallelFormant3.setGain(amp_parF3);
 527
 528         mParallelFormant4.initResonator(mFrame.mFormant4Freq, mFrame.mFormant4ParallelBandwidth, mSampleRate);
 529         mParallelFormant4.setGain(amp_parF4);
 530
 531         mParallelFormant5.initResonator(mFrame.mFormant5Freq, mFrame.mFormant5ParallelBandwidth, mSampleRate);
 532         mParallelFormant5.setGain(amp_parF5);
 533
 534         mParallelFormant6.initResonator(mFrame.mFormant6Freq, mFrame.mFormant6ParallelBandwidth, mSampleRate);
 535         mParallelFormant6.setGain(amp_parF6);
 536
 537
 538         /* fold overall gain into output resonator */
 539         mOverallGaindb = mFrame.mOverallGaindb - 3;
 540
 541         if (mOverallGaindb <= 0)
 542                 mOverallGaindb = 57;
 543
 544         /* output ELM_FEATURE_LOW-pass filter - resonator with freq 0 and BW = globals->mSampleRate
 545         Thus 3db point is globals->mSampleRate/2 i.e. Nyquist limit.
 546         Only 3db down seems rather mild...
 547         */
 548         mOutputLowPassFilter.initResonator(0L, (int)mSampleRate, mSampleRate);
 549         mOutputLowPassFilter.setGain(DBtoLIN(mOverallGaindb));
 550 }
 551
 552 /*
 553 function PARWAV
 554
 555 CONVERT FRAME OF PARAMETER DATA TO A WAVEFORM CHUNK
 556 Synthesize globals->mNspFr samples of waveform and store in jwave[].
 557 */
 558
 559 void klatt::parwave(short int *jwave)
 560 {
 561         /* Output of cascade branch, also final output  */
 562
 563         /* Initialize synthesizer and get specification for current speech
 564         frame from host microcomputer */
 565
 566         frame_init();
 567
 568         if (mF0Flutter != 0)
 569         {
 570                 mTimeCount++;                  /* used for f0 flutter */
 571                 flutter();       /* add f0 flutter */
 572         }
 573
 574         /* MAIN LOOP, for each output sample of current frame: */
 575
 576         int ns;
 577         for (ns = 0; ns < mNspFr; ns++)
 578         {
 579                 float noise;
 580                 int n4;
 581                 float sourc;                   /* Sound source if all-parallel config used  */
 582                 float glotout;                 /* Output of glottal sound source  */
 583                 float par_glotout;             /* Output of parallelglottal sound sourc  */
 584                 float voice = 0;               /* Current sample of voicing waveform  */
 585                 float frics;                   /* Frication sound source  */
 586                 float aspiration;              /* Aspiration sound source  */
 587                 int nrand;                    /* Varible used by random number generator  */
 588
 589                 /* Our own code like rand(), but portable
 590                 whole upper 31 bits of seed random
 591                 assumes 32-bit unsigned arithmetic
 592                 with untested code to handle larger.
 593                 */
 594                 mSeed = mSeed * 1664525 + 1;
 595
 596                 mSeed &= 0xFFFFFFFF;
 597
 598                 /* Shift top bits of seed up to top of int then back down to LS 14 bits */
 599                 /* Assumes 8 bits per sizeof unit i.e. a "byte" */
 600                 nrand = (((int) mSeed) << (8 * sizeof(int) - 32)) >> (8 * sizeof(int) - 14);
 601
 602                 /* Tilt down noise spectrum by soft ELM_FEATURE_LOW-pass filter having
 603                 *    a pole near the origin in the z-plane, i.e.
 604                 *    output = input + (0.75 * lastoutput) */
 605
 606                 noise = nrand + (0.75f * mNLast);       /* Function of samp_rate ? */
 607
 608                 mNLast = noise;
 609
 610                 /* Amplitude modulate noise (reduce noise amplitude during
 611                 second half of glottal period) if voicing simultaneously present
 612                 */
 613
 614                 if (mNPer > mNMod)
 615                 {
 616                         noise *= 0.5f;
 617                 }
 618
 619                 /* Compute frication noise */
 620                 sourc = frics = mAmpFrica * noise;
 621
 622                 /* Compute voicing waveform : (run glottal source simulation at
 623                 4 times normal sample rate to minimize quantization noise in
 624                 period of female voice)
 625                 */
 626
 627                 for (n4 = 0; n4 < 4; n4++)
 628                 {
 629                         /* use a more-natural-shaped source waveform with excitation
 630                         occurring both upon opening and upon closure, stronest at closure */
 631                         voice = natural_source(mNPer);
 632
 633                         /* Reset period when counter 'mNPer' reaches mT0 */
 634
 635                         if (mNPer >= mT0)
 636                         {
 637                                 mNPer = 0;
 638                                 pitch_synch_par_reset(ns);
 639                         }
 640
 641                         /* Low-pass filter voicing waveform before downsampling from 4*globals->mSampleRate */
 642                         /* to globals->mSampleRate samples/sec.  Resonator f=.09*globals->mSampleRate, bw=.06*globals->mSampleRate  */
 643
 644                         voice = mDownSampLowPassFilter.resonate(voice); /* in=voice, out=voice */
 645
 646                         /* Increment counter that keeps track of 4*globals->mSampleRate samples/sec */
 647                         mNPer++;
 648                 }
 649
 650                 /* Tilt spectrum of voicing source down by soft ELM_FEATURE_LOW-pass filtering, amount
 651                 of tilt determined by mVoicingSpectralTiltdb
 652                 */
 653                 voice = (voice * mOneMd) + (mVLast * mDecay);
 654
 655                 mVLast = voice;
 656
 657                 /* Add breathiness during glottal open phase */
 658                 if (mNPer < mNOpen)
 659                 {
 660                         /* Amount of breathiness determined by parameter mVoicingBreathiness */
 661                         /* Use nrand rather than noise because noise is ELM_FEATURE_LOW-passed */
 662                         voice += mAmpBreth * nrand;
 663                 }
 664
 665                 /* Set amplitude of voicing */
 666                 glotout = mAmpVoice * voice;
 667
 668                 /* Compute aspiration amplitude and add to voicing source */
 669                 aspiration = mAmpAspir * noise;
 670
 671                 glotout += aspiration;
 672
 673                 par_glotout = glotout;
 674
 675                 /* NIS - rsynth "hack"
 676                 As Holmes' scheme is weak at nasals and (physically) nasal cavity
 677                 is "back near glottis" feed glottal source through nasal resonators
 678                 Don't think this is quite right, but improves things a bit
 679                 */
 680                 par_glotout = mNasalZero.antiresonate(par_glotout);
 681                 par_glotout = mNasalPole.resonate(par_glotout);
 682                 /* And just use mParallelFormant1 NOT mParallelResoNasalPole */
 683                 float out = mParallelFormant1.resonate(par_glotout);
 684                 /* Sound sourc for other parallel resonators is frication
 685                 plus first difference of voicing waveform.
 686                 */
 687                 sourc += (par_glotout - mGlotLast);
 688                 mGlotLast = par_glotout;
 689
 690                 /* Standard parallel vocal tract
 691                 Formants F6,F5,F4,F3,F2, outputs added with alternating sign
 692                 */
 693                 out = mParallelFormant6.resonate(sourc) - out;
 694                 out = mParallelFormant5.resonate(sourc) - out;
 695                 out = mParallelFormant4.resonate(sourc) - out;
 696                 out = mParallelFormant3.resonate(sourc) - out;
 697                 out = mParallelFormant2.resonate(sourc) - out;
 698
 699                 out = mAmpBypas * sourc - out;
 700                 out = mOutputLowPassFilter.resonate(out);
 701
 702                 *jwave++ = clip(out); /* Convert back to integer */
 703         }
 704 }
 705
 706
 707
 708 static char * phoneme_to_element_lookup(char *s, void ** data)
 709 {
 710         int key8 = *s;
 711         int key16 = key8 + (s[1] << 8);
 712         if (s[1] == 0) key16 = -1; // avoid key8==key16
 713         int i;
 714         for (i = 0; i < PHONEME_COUNT; i++)
 715         {
 716                 if (phoneme_to_elements[i].mKey == key16)
 717                 {
 718                         *data = &phoneme_to_elements[i].mData;
 719                         return s+2;
 720                 }
 721                 if (phoneme_to_elements[i].mKey == key8)
 722                 {
 723                         *data = &phoneme_to_elements[i].mData;
 724                         return s+1;
 725                 }
 726         }
 727         // should never happen
 728         *data = NULL;
 729         return s+1;
 730 }
 731
 732
 733
 734 int klatt::phone_to_elm(char *aPhoneme, int aCount, darray *aElement)
 735 {
 736         int stress = 0;
 737         char *s = aPhoneme;
 738         int t = 0;
 739         char *limit = s + aCount;
 740
 741         while (s < limit && *s)
 742         {
 743                 char *e = NULL;
 744                 s = phoneme_to_element_lookup(s, (void**)&e);
 745
 746                 if (e)
 747                 {
 748                         int n = *e++;
 749
 750                         while (n-- > 0)
 751                         {
 752                                 int x = *e++;
 753                                 Element * p = &gElement[x];
 754                                 /* This works because only vowels have mUD != mDU,
 755                                 and we set stress just before a vowel
 756                                 */
 757                                 aElement->put(x);
 758
 759                                 if (!(p->mFeat & ELM_FEATURE_VWL))
 760                                         stress = 0;
 761
 762                                 int stressdur = StressDur(p,stress);
 763
 764                                 t += stressdur;
 765
 766                                 aElement->put(stressdur);
 767                                 aElement->put(stress);
 768                         }
 769                 }
 770
 771                 else
 772                 {
 773                         char ch = *s++;
 774
 775                         switch (ch)
 776                         {
 777
 778                         case '\'':                /* Primary stress */
 779                                 stress = 3;
 780                                 break;
 781
 782                         case ',':                 /* Secondary stress */
 783                                 stress = 2;
 784                                 break;
 785
 786                         case '+':                 /* Tertiary stress */
 787                                 stress = 1;
 788                                 break;
 789
 790                         case '-':                 /* hyphen in input */
 791                                 break;
 792
 793                         default:
 794 //                              fprintf(stderr, "Ignoring %c in '%.*s'\n", ch, aCount, aPhoneme);
 795                                 break;
 796                         }
 797                 }
 798         }
 799
 800         return t;
 801 }
 802
 803
 804
 805 /* 'a' is dominant element, 'b' is dominated
 806     ext is flag to say to use external times from 'a' rather
 807     than internal i.e. ext != 0 if 'a' is NOT current element.
 808  */
 809
 810 static void set_trans(Slope *t, Element * a, Element * b,int ext, int /* e */)
 811 {
 812         int i;
 813
 814         for (i = 0; i < ELM_COUNT; i++)
 815         {
 816                 t[i].mTime = ((ext) ? a->mInterpolator[i].mExtDelay : a->mInterpolator[i].mIntDelay);
 817
 818                 if (t[i].mTime)
 819                 {
 820                         t[i].mValue = a->mInterpolator[i].mFixed + (a->mInterpolator[i].mProportion * b->mInterpolator[i].mSteady) * 0.01f; // mProportion is in scale 0..100, so *0.01.
 821                 }
 822                 else
 823                 {
 824                         t[i].mValue = b->mInterpolator[i].mSteady;
 825                 }
 826         }
 827 }
 828
 829 static float lerp(float a, float b, int t, int d)
 830 {
 831         if (t <= 0)
 832         {
 833                 return a;
 834         }
 835
 836         if (t >= d)
 837         {
 838                 return b;
 839         }
 840
 841         float f = (float)t / (float)d;
 842         return a + (b - a) * f;
 843 }
 844
 845 static float interpolate(Slope *aStartSlope, Slope *aEndSlope, float aMidValue, int aTime, int aDuration)
 846 {
 847         int steadyTime = aDuration - (aStartSlope->mTime + aEndSlope->mTime);
 848
 849         if (steadyTime >= 0)
 850         {
 851                 // Interpolate to a midpoint, stay there for a while, then interpolate to end
 852
 853                 if (aTime < aStartSlope->mTime)
 854                 {
 855                         // interpolate to the first value
 856                         return lerp(aStartSlope->mValue, aMidValue, aTime, aStartSlope->mTime);
 857                 }
 858                 // reached midpoint
 859
 860                 aTime -= aStartSlope->mTime;
 861
 862                 if (aTime <= steadyTime)
 863                 {
 864                         // still at steady state
 865                         return aMidValue;
 866                 }
 867
 868                 // interpolate to the end
 869                 return lerp(aMidValue, aEndSlope->mValue, aTime - steadyTime, aEndSlope->mTime);
 870         }
 871         else
 872         {
 873                 // No steady state
 874                 float f = 1.0f - ((float) aTime / (float) aDuration);
 875                 float sp = lerp(aStartSlope->mValue, aMidValue, aTime, aStartSlope->mTime);
 876                 float ep = lerp(aEndSlope->mValue, aMidValue, aDuration - aTime, aEndSlope->mTime);
 877                 return f * sp + ((float) 1.0 - f) * ep;
 878         }
 879 }
 880
 881
 882
 883 void klatt::initsynth(int aElementCount,unsigned char *aElement)
 884 {
 885         mElement = aElement;
 886         mElementCount = aElementCount;
 887         mElementIndex = 0;
 888         mLastElement = &gElement[0];
 889         mSeed = 5;
 890         mTStress = 0;
 891         mNTStress = 0;
 892         mFrame.mF0FundamentalFreq = mBaseF0;
 893         mTop = 1.1f * mFrame.mF0FundamentalFreq;
 894         mFrame.mNasalPoleFreq = (int)mLastElement->mInterpolator[ELM_FN].mSteady;
 895         mFrame.mFormant1ParallelBandwidth = mFrame.mFormant1Bandwidth = 60;
 896         mFrame.mFormant2ParallelBandwidth = mFrame.mFormant2Bandwidth = 90;
 897         mFrame.mFormant3ParallelBandwidth = mFrame.mFormant3Bandwidth = 150;
 898 //      mFrame.mFormant4ParallelBandwidth = (default)
 899
 900         // Set stress attack/decay slope
 901         mStressS.mTime = 40;
 902         mStressE.mTime = 40;
 903         mStressE.mValue = 0.0;
 904 }
 905
 906 int klatt::synth(int /* aSampleCount */, short *aSamplePointer)
 907 {
 908         short *samp = aSamplePointer;
 909
 910         if (mElementIndex >= mElementCount)
 911                 return -1;
 912
 913         Element * currentElement = &gElement[mElement[mElementIndex++]];
 914         int dur = mElement[mElementIndex++];
 915         mElementIndex++; // skip stress
 916
 917         if (currentElement->mRK == 31) // "END"
 918         {
 919                 // Reset the fundamental frequency top
 920                 mFrame.mF0FundamentalFreq = mBaseF0;
 921                 mTop = 1.1f * mFrame.mF0FundamentalFreq;
 922         }
 923
 924         // Skip zero length elements which are only there to affect
 925         // boundary values of adjacent elements
 926
 927         if (dur > 0)
 928         {
 929                 Element * ne = (mElementIndex < mElementCount) ? &gElement[mElement[mElementIndex]] : &gElement[0];
 930                 Slope start[ELM_COUNT];
 931                 Slope end[ELM_COUNT];
 932                 int t;
 933
 934                 if (currentElement->mRK > mLastElement->mRK)
 935                 {
 936                         set_trans(start, currentElement, mLastElement, 0, 's');
 937                         // we dominate last
 938                 }
 939                 else
 940                 {
 941                         set_trans(start, mLastElement, currentElement, 1, 's');
 942                         // last dominates us
 943                 }
 944
 945                 if (ne->mRK > currentElement->mRK)
 946                 {
 947                         set_trans(end, ne, currentElement, 1, 'e');
 948                         // next dominates us
 949                 }
 950                 else
 951                 {
 952                         set_trans(end, currentElement, ne, 0, 'e');
 953                         // we dominate next
 954                 }
 955
 956                 for (t = 0; t < dur; t++, mTStress++)
 957                 {
 958                         float base = mTop * 0.8f; // 3 * top / 5
 959                         float tp[ELM_COUNT];
 960
 961                         if (mTStress == mNTStress)
 962                         {
 963                                 int j = mElementIndex;
 964                                 mStressS = mStressE;
 965                                 mTStress = 0;
 966                                 mNTStress = dur;
 967
 968                                 while (j <= mElementCount)
 969                                 {
 970                                         Element * e   = (j < mElementCount) ? &gElement[mElement[j++]] : &gElement[0];
 971                                         int du = (j < mElementCount) ? mElement[j++] : 0;
 972                                         int s  = (j < mElementCount) ? mElement[j++] : 3;
 973
 974                                         if (s || e->mFeat & ELM_FEATURE_VWL)
 975                                         {
 976                                                 int d = 0;
 977
 978                                                 if (s)
 979                                                         mStressE.mValue = (float) s / 3;
 980                                                 else
 981                                                         mStressE.mValue = (float) 0.1;
 982
 983                                                 do
 984                                                 {
 985                                                         d += du;
 986                                                         e = (j < mElementCount) ? &gElement[mElement[j++]] : &gElement[0];
 987                                                         du = mElement[j++];
 988                                                 }
 989
 990                                                 while ((e->mFeat & ELM_FEATURE_VWL) && mElement[j++] == s);
 991
 992                                                 mNTStress += d / 2;
 993
 994                                                 break;
 995                                         }
 996
 997                                         mNTStress += du;
 998                                 }
 999                         }
1000
1001                         int j;
1002                         for (j = 0; j < ELM_COUNT; j++)
1003                         {
1004                                 tp[j] = interpolate(&start[j], &end[j], (float) currentElement->mInterpolator[j].mSteady, t, dur);
1005                         }
1006
1007                         // Now call the synth for each frame
1008
1009                         mFrame.mF0FundamentalFreq = (int)(base + (mTop - base) * interpolate(&mStressS, &mStressE, (float)0, mTStress, mNTStress));
1010                         mFrame.mVoicingAmpdb = mFrame.mPalallelVoicingAmpdb = (int)tp[ELM_AV];
1011                         mFrame.mFricationAmpdb = (int)tp[ELM_AF];
1012                         mFrame.mNasalZeroFreq = (int)tp[ELM_FN];
1013                         mFrame.mAspirationAmpdb = (int)tp[ELM_ASP];
1014                         mFrame.mVoicingBreathiness = (int)tp[ELM_AVC];
1015                         mFrame.mFormant1ParallelBandwidth = mFrame.mFormant1Bandwidth = (int)tp[ELM_B1];
1016                         mFrame.mFormant2ParallelBandwidth = mFrame.mFormant2Bandwidth = (int)tp[ELM_B2];
1017                         mFrame.mFormant3ParallelBandwidth = mFrame.mFormant3Bandwidth = (int)tp[ELM_B3];
1018                         mFrame.mFormant1Freq = (int)tp[ELM_F1];
1019                         mFrame.mFormant2Freq = (int)tp[ELM_F2];
1020                         mFrame.mFormant3Freq = (int)tp[ELM_F3];
1021
1022                         // AMP_ADJ + is a kludge to get amplitudes up to klatt-compatible levels
1023
1024
1025                         //pars.mParallelNasalPoleAmpdb  = AMP_ADJ + tp[ELM_AN];
1026
1027                         mFrame.mBypassFricationAmpdb = AMP_ADJ + (int)tp[ELM_AB];
1028                         mFrame.mFormant5Ampdb = AMP_ADJ + (int)tp[ELM_A5];
1029                         mFrame.mFormant6Ampdb = AMP_ADJ + (int)tp[ELM_A6];
1030                         mFrame.mFormant1Ampdb = AMP_ADJ + (int)tp[ELM_A1];
1031                         mFrame.mFormant2Ampdb = AMP_ADJ + (int)tp[ELM_A2];
1032                         mFrame.mFormant3Ampdb = AMP_ADJ + (int)tp[ELM_A3];
1033                         mFrame.mFormant4Ampdb = AMP_ADJ + (int)tp[ELM_A4];
1034
1035                         parwave(samp);
1036
1037                         samp += mNspFr;
1038
1039                         // Declination of f0 envelope 0.25Hz / cS
1040                         mTop -= mBaseDeclination;// 0.5;
1041                 }
1042         }
1043
1044         mLastElement = currentElement;
1045
1046         return (int)(samp - aSamplePointer);
1047 }
1048
1049
1050 void klatt::init(int aBaseFrequency, float aBaseSpeed, float aBaseDeclination, int aBaseWaveform)
1051 {
1052         mBaseF0 = aBaseFrequency;
1053         mBaseSpeed = aBaseSpeed;
1054         mBaseDeclination = aBaseDeclination;
1055         mBaseWaveform = aBaseWaveform;
1056
1057     mSampleRate = 11025;
1058     mF0Flutter = 0;
1059         mF0FundamentalFreq = mBaseF0;
1060         mFrame.mF0FundamentalFreq = mBaseF0;
1061
1062         int FLPhz = (950 * mSampleRate) / 10000;
1063         int BLPhz = (630 * mSampleRate) / 10000;
1064         mNspFr = (int)(mSampleRate * mBaseSpeed) / 1000;
1065
1066         mDownSampLowPassFilter.initResonator(FLPhz, BLPhz, mSampleRate);
1067
1068         mNPer = 0;                        /* LG */
1069         mT0 = 0;                          /* LG */
1070
1071         mVLast = 0;                       /* Previous output of voice  */
1072         mNLast = 0;                       /* Previous output of random number generator  */
1073         mGlotLast = 0;                    /* Previous value of glotout  */
1074 }