48 #include "sphinxbase/byteorder.h"
49 #include "sphinxbase/fixpoint.h"
55 #include "fe_internal.h"
58 static const arg_t fe_args[] = {
59 waveform_to_cepstral_command_line_macro(),
60 { NULL, 0, NULL, NULL }
69 fe->sampling_rate = cmd_ln_float32_r(config,
"-samprate");
70 frate = cmd_ln_int32_r(config,
"-frate");
71 if (frate > MAX_INT16 || frate > fe->sampling_rate || frate < 1) {
73 (
"Frame rate %d can not be bigger than sample rate %.02f\n",
74 frate, fe->sampling_rate);
78 fe->frame_rate = (int16)frate;
81 fe->seed = cmd_ln_int32_r(config,
"-seed");
83 #ifdef WORDS_BIGENDIAN
84 fe->swap = strcmp(
"big",
cmd_ln_str_r(config,
"-input_endian")) == 0 ? 0 : 1;
86 fe->swap = strcmp(
"little",
cmd_ln_str_r(config,
"-input_endian")) == 0 ? 0 : 1;
88 fe->window_length = cmd_ln_float32_r(config,
"-wlen");
89 fe->pre_emphasis_alpha = cmd_ln_float32_r(config,
"-alpha");
91 fe->num_cepstra = (uint8)cmd_ln_int32_r(config,
"-ncep");
92 fe->fft_size = (int16)cmd_ln_int32_r(config,
"-nfft");
95 for (j = fe->fft_size, fe->fft_order = 0; j > 1; j >>= 1, fe->fft_order++) {
96 if (((j % 2) != 0) || (fe->fft_size <= 0)) {
97 E_ERROR(
"fft: number of points must be a power of 2 (is %d)\n",
103 if (fe->fft_size < (
int)(fe->window_length * fe->sampling_rate)) {
104 E_ERROR(
"FFT: Number of points must be greater or equal to frame size (%d samples)\n",
105 (
int)(fe->window_length * fe->sampling_rate));
109 fe->pre_speech = (int16)cmd_ln_int32_r(config,
"-vad_prespeech");
110 fe->post_speech = (int16)cmd_ln_int32_r(config,
"-vad_postspeech");
111 fe->start_speech = (int16)cmd_ln_int32_r(config,
"-vad_startspeech");
112 fe->vad_threshold = cmd_ln_float32_r(config,
"-vad_threshold");
118 if (0 == strcmp(
cmd_ln_str_r(config,
"-transform"),
"dct"))
119 fe->transform = DCT_II;
120 else if (0 == strcmp(
cmd_ln_str_r(config,
"-transform"),
"legacy"))
121 fe->transform = LEGACY_DCT;
122 else if (0 == strcmp(
cmd_ln_str_r(config,
"-transform"),
"htk"))
123 fe->transform = DCT_HTK;
125 E_ERROR(
"Invalid transform type (values are 'dct', 'legacy', 'htk')\n");
130 fe->log_spec = RAW_LOG_SPEC;
132 fe->log_spec = SMOOTH_LOG_SPEC;
140 mel->sampling_rate = fe->sampling_rate;
141 mel->fft_size = fe->fft_size;
142 mel->num_cepstra = fe->num_cepstra;
143 mel->num_filters = cmd_ln_int32_r(config,
"-nfilt");
146 fe->feature_dimension = mel->num_filters;
148 fe->feature_dimension = fe->num_cepstra;
150 mel->upper_filt_freq = cmd_ln_float32_r(config,
"-upperf");
151 mel->lower_filt_freq = cmd_ln_float32_r(config,
"-lowerf");
156 mel->warp_params =
cmd_ln_str_r(config,
"-warp_params");
157 mel->lifter_val = cmd_ln_int32_r(config,
"-lifter");
162 if (fe_warp_set(mel, mel->warp_type) != FE_SUCCESS) {
163 E_ERROR(
"Failed to initialize the warping function.\n");
166 fe_warp_set_parameters(mel, mel->warp_params, mel->sampling_rate);
171 fe_print_current(
fe_t const *fe)
173 E_INFO(
"Current FE Parameters:\n");
174 E_INFO(
"\tSampling Rate: %f\n", fe->sampling_rate);
175 E_INFO(
"\tFrame Size: %d\n", fe->frame_size);
176 E_INFO(
"\tFrame Shift: %d\n", fe->frame_shift);
177 E_INFO(
"\tFFT Size: %d\n", fe->fft_size);
178 E_INFO(
"\tLower Frequency: %g\n",
179 fe->mel_fb->lower_filt_freq);
180 E_INFO(
"\tUpper Frequency: %g\n",
181 fe->mel_fb->upper_filt_freq);
182 E_INFO(
"\tNumber of filters: %d\n", fe->mel_fb->num_filters);
183 E_INFO(
"\tNumber of Overflow Samps: %d\n", fe->num_overflow_samps);
184 E_INFO(
"\tStart Utt Status: %d\n", fe->start_flag);
185 E_INFO(
"Will %sremove DC offset at frame level\n",
186 fe->remove_dc ?
"" :
"not ");
188 E_INFO(
"Will add dither to audio\n");
189 E_INFO(
"Dither seeded with %d\n", fe->seed);
192 E_INFO(
"Will not add dither to audio\n");
194 if (fe->mel_fb->lifter_val) {
195 E_INFO(
"Will apply sine-curve liftering, period %d\n",
196 fe->mel_fb->lifter_val);
198 E_INFO(
"Will %snormalize filters to unit area\n",
199 fe->mel_fb->unit_area ?
"" :
"not ");
200 E_INFO(
"Will %sround filter frequencies to DFT points\n",
201 fe->mel_fb->round_filters ?
"" :
"not ");
202 E_INFO(
"Will %suse double bandwidth in mel filter\n",
203 fe->mel_fb->doublewide ?
"" :
"not ");
216 int prespch_frame_len;
222 if (fe_parse_general_params(
cmd_ln_retain(config), fe) < 0) {
231 fe->frame_shift = (int32) (fe->sampling_rate / fe->frame_rate + 0.5);
232 fe->frame_size = (int32) (fe->window_length * fe->sampling_rate + 0.5);
237 assert (fe->frame_shift > 1);
239 if (fe->frame_size < fe->frame_shift) {
241 (
"Frame size %d (-wlen) must be greater than frame shift %d (-frate)\n",
242 fe->frame_size, fe->frame_shift);
248 if (fe->frame_size > (fe->fft_size)) {
250 (
"Number of FFT points has to be a power of 2 higher than %d, it is %d\n",
251 fe->frame_size, fe->fft_size);
257 fe_init_dither(fe->seed);
260 fe->overflow_samps =
ckd_calloc(fe->frame_size,
sizeof(int16));
261 fe->hamming_window =
ckd_calloc(fe->frame_size/2,
sizeof(window_t));
264 fe_create_hamming(fe->hamming_window, fe->frame_size);
267 fe->mel_fb =
ckd_calloc(1,
sizeof(*fe->mel_fb));
270 fe_parse_melfb_params(config, fe, fe->mel_fb);
272 if (fe->mel_fb->upper_filt_freq > fe->sampling_rate / 2 + 1.0) {
273 E_ERROR(
"Upper frequency %.1f is higher than samprate/2 (%.1f)\n",
274 fe->mel_fb->upper_filt_freq, fe->sampling_rate / 2);
279 fe_build_melfilters(fe->mel_fb);
281 fe_compute_melcosine(fe->mel_fb);
282 if (fe->remove_noise || fe->remove_silence)
283 fe->noise_stats = fe_init_noisestats(fe->mel_fb->num_filters);
286 prespch_frame_len = fe->log_spec != RAW_LOG_SPEC ? fe->num_cepstra : fe->mel_fb->num_filters;
287 fe->vad_data->prespch_buf = fe_prespch_init(fe->pre_speech + 1, prespch_frame_len, fe->frame_shift);
291 fe->spch =
ckd_calloc(fe->frame_size,
sizeof(*fe->spch));
292 fe->frame =
ckd_calloc(fe->fft_size,
sizeof(*fe->frame));
293 fe->spec =
ckd_calloc(fe->fft_size,
sizeof(*fe->spec));
294 fe->mfspec =
ckd_calloc(fe->mel_fb->num_filters,
sizeof(*fe->mfspec));
297 fe->ccc =
ckd_calloc(fe->fft_size / 4,
sizeof(*fe->ccc));
298 fe->sss =
ckd_calloc(fe->fft_size / 4,
sizeof(*fe->sss));
299 fe_create_twiddle(fe);
302 fe_print_current(fe);
317 fe_get_config(
fe_t *fe)
323 fe_init_dither(int32 seed)
325 E_INFO(
"Using %d as the seed.\n", seed);
332 vad_data->in_speech = 0;
333 vad_data->pre_speech_frames = 0;
334 vad_data->post_speech_frames = 0;
335 fe_prespch_reset_cep(vad_data->prespch_buf);
339 fe_start_utt(
fe_t * fe)
341 fe->num_overflow_samps = 0;
342 memset(fe->overflow_samps, 0, fe->frame_size *
sizeof(int16));
345 fe_reset_vad_data(fe->vad_data);
350 fe_start_stream(
fe_t *fe)
352 fe->sample_counter = 0;
353 fe_reset_noisestats(fe->noise_stats);
357 fe_get_output_size(
fe_t *fe)
359 return (
int)fe->feature_dimension;
363 fe_get_input_size(
fe_t *fe,
int *out_frame_shift,
367 *out_frame_shift = fe->frame_shift;
369 *out_frame_size = fe->frame_size;
373 fe_get_vad_state(
fe_t *fe)
375 return fe->vad_data->in_speech;
379 fe_process_frames(
fe_t *fe,
380 int16
const **inout_spch,
381 size_t *inout_nsamps,
383 int32 *inout_nframes,
386 return fe_process_frames_ext(fe, inout_spch, inout_nsamps, buf_cep, inout_nframes, NULL, NULL, out_frameidx);
394 fe_copy_from_prespch(
fe_t *fe, int32 *inout_nframes, mfcc_t **buf_cep,
int outidx)
396 while ((*inout_nframes) > 0 && fe_prespch_read_cep(fe->vad_data->prespch_buf, buf_cep[outidx]) > 0) {
407 fe_check_prespeech(
fe_t *fe, int32 *inout_nframes, mfcc_t **buf_cep,
int outidx, int32 *out_frameidx,
size_t *inout_nsamps,
int orig_nsamps)
409 if (fe->vad_data->in_speech) {
410 if (fe_prespch_ncep(fe->vad_data->prespch_buf) > 0) {
414 outidx = fe_copy_from_prespch(fe, inout_nframes, buf_cep, outidx);
418 *out_frameidx = (fe->sample_counter + orig_nsamps - *inout_nsamps) / fe->frame_shift - fe->pre_speech;
426 if (fe->num_overflow_samps > 0)
427 fe->num_overflow_samps -= fe->frame_shift;
433 fe_process_frames_ext(
fe_t *fe,
434 int16
const **inout_spch,
435 size_t *inout_nsamps,
437 int32 *inout_nframes,
439 int32 *voiced_spch_nsamps,
442 int outidx, n_overflow, orig_n_overflow;
443 int16
const *orig_spch;
452 if (buf_cep == NULL) {
453 if (*inout_nsamps + fe->num_overflow_samps < (
size_t)fe->frame_size)
457 + ((*inout_nsamps + fe->num_overflow_samps - fe->frame_size)
459 if (!fe->vad_data->in_speech)
460 *inout_nframes += fe_prespch_ncep(fe->vad_data->prespch_buf);
461 return *inout_nframes;
468 if (*inout_nsamps + fe->num_overflow_samps < (
size_t)fe->frame_size) {
469 if (*inout_nsamps > 0) {
471 memcpy(fe->overflow_samps + fe->num_overflow_samps,
472 *inout_spch, *inout_nsamps * (
sizeof(int16)));
473 fe->num_overflow_samps += *inout_nsamps;
475 *inout_spch += *inout_nsamps;
484 if (*inout_nframes < 1) {
493 if (fe->vad_data->in_speech && fe_prespch_ncep(fe->vad_data->prespch_buf) > 0) {
494 outidx = fe_copy_from_prespch(fe, inout_nframes, buf_cep, outidx);
495 if ((*inout_nframes) < 1) {
497 *inout_nframes = outidx;
503 orig_spch = *inout_spch;
504 orig_nsamps = *inout_nsamps;
505 orig_n_overflow = fe->num_overflow_samps;
508 if (fe->num_overflow_samps > 0) {
509 int offset = fe->frame_size - fe->num_overflow_samps;
511 memcpy(fe->overflow_samps + fe->num_overflow_samps,
512 *inout_spch, offset *
sizeof(**inout_spch));
513 fe_read_frame(fe, fe->overflow_samps, fe->frame_size);
515 *inout_spch += offset;
516 *inout_nsamps -= offset;
518 fe_read_frame(fe, *inout_spch, fe->frame_size);
520 *inout_spch += fe->frame_size;
521 *inout_nsamps -= fe->frame_size;
524 fe_write_frame(fe, buf_cep[outidx], voiced_spch != NULL);
525 outidx = fe_check_prespeech(fe, inout_nframes, buf_cep, outidx, out_frameidx, inout_nsamps, orig_nsamps);
528 while (*inout_nframes > 0 && *inout_nsamps >= (
size_t)fe->frame_shift) {
529 fe_shift_frame(fe, *inout_spch, fe->frame_shift);
530 fe_write_frame(fe, buf_cep[outidx], voiced_spch != NULL);
532 outidx = fe_check_prespeech(fe, inout_nframes, buf_cep, outidx, out_frameidx, inout_nsamps, orig_nsamps);
535 *inout_spch += fe->frame_shift;
536 *inout_nsamps -= fe->frame_shift;
540 if (fe->num_overflow_samps <= 0) {
542 n_overflow = *inout_nsamps;
543 if (n_overflow > fe->frame_shift)
544 n_overflow = fe->frame_shift;
545 fe->num_overflow_samps = fe->frame_size - fe->frame_shift;
547 if (fe->num_overflow_samps > *inout_spch - orig_spch)
548 fe->num_overflow_samps = *inout_spch - orig_spch;
549 fe->num_overflow_samps += n_overflow;
550 if (fe->num_overflow_samps > 0) {
551 memcpy(fe->overflow_samps,
552 *inout_spch - (fe->frame_size - fe->frame_shift),
553 fe->num_overflow_samps *
sizeof(**inout_spch));
555 *inout_spch += n_overflow;
556 *inout_nsamps -= n_overflow;
561 memmove(fe->overflow_samps,
562 fe->overflow_samps + orig_n_overflow - fe->num_overflow_samps,
563 fe->num_overflow_samps *
sizeof(*fe->overflow_samps));
565 n_overflow = *inout_spch - orig_spch + *inout_nsamps;
566 if (n_overflow > fe->frame_size - fe->num_overflow_samps)
567 n_overflow = fe->frame_size - fe->num_overflow_samps;
568 memcpy(fe->overflow_samps + fe->num_overflow_samps,
569 orig_spch, n_overflow *
sizeof(*orig_spch));
570 fe->num_overflow_samps += n_overflow;
572 if (n_overflow > *inout_spch - orig_spch) {
573 n_overflow -= (*inout_spch - orig_spch);
574 *inout_spch += n_overflow;
575 *inout_nsamps -= n_overflow;
581 *inout_nframes = outidx;
582 fe->sample_counter += orig_nsamps - *inout_nsamps;
588 fe_process_utt(
fe_t * fe, int16
const * spch,
size_t nsamps,
589 mfcc_t *** cep_block, int32 * nframes)
595 fe_process_frames(fe, NULL, &nsamps, NULL, nframes, NULL);
598 cep = (mfcc_t **)
ckd_calloc_2d(*nframes, fe->feature_dimension,
sizeof(**cep));
600 cep = (mfcc_t **)
ckd_calloc_2d(1, fe->feature_dimension,
sizeof(**cep));
602 rv = fe_process_frames(fe, &spch, &nsamps, cep, nframes, NULL);
610 fe_end_utt(
fe_t * fe, mfcc_t * cepvector, int32 * nframes)
614 if (fe->num_overflow_samps > 0) {
615 fe_read_frame(fe, fe->overflow_samps, fe->num_overflow_samps);
616 fe_write_frame(fe, cepvector, FALSE);
617 if (fe->vad_data->in_speech)
622 fe->num_overflow_samps = 0;
640 if (--fe->refcount > 0)
645 if (fe->mel_fb->mel_cosine)
646 fe_free_2d((
void *) fe->mel_fb->mel_cosine);
664 fe_free_noisestats(fe->noise_stats);
667 fe_prespch_free(fe->vad_data->prespch_buf);
681 fe_mfcc_to_float(
fe_t * fe,
682 mfcc_t ** input, float32 ** output, int32 nframes)
687 if ((
void *) input == (
void *) output)
688 return nframes * fe->feature_dimension;
690 for (i = 0; i < nframes * fe->feature_dimension; ++i)
691 output[0][i] = MFCC2FLOAT(input[0][i]);
700 fe_float_to_mfcc(
fe_t * fe,
701 float32 ** input, mfcc_t ** output, int32 nframes)
706 if ((
void *) input == (
void *) output)
707 return nframes * fe->feature_dimension;
709 for (i = 0; i < nframes * fe->feature_dimension; ++i)
710 output[0][i] = FLOAT2MFCC(input[0][i]);
716 fe_logspec_to_mfcc(
fe_t * fe,
const mfcc_t * fr_spec, mfcc_t * fr_cep)
719 fe_spec2cep(fe, fr_spec, fr_cep);
724 powspec =
ckd_malloc(fe->mel_fb->num_filters *
sizeof(powspec_t));
725 for (i = 0; i < fe->mel_fb->num_filters; ++i)
726 powspec[i] = (powspec_t) fr_spec[i];
727 fe_spec2cep(fe, powspec, fr_cep);
734 fe_logspec_dct2(
fe_t * fe,
const mfcc_t * fr_spec, mfcc_t * fr_cep)
737 fe_dct2(fe, fr_spec, fr_cep, 0);
742 powspec =
ckd_malloc(fe->mel_fb->num_filters *
sizeof(powspec_t));
743 for (i = 0; i < fe->mel_fb->num_filters; ++i)
744 powspec[i] = (powspec_t) fr_spec[i];
745 fe_dct2(fe, powspec, fr_cep, 0);
752 fe_mfcc_dct3(
fe_t * fe,
const mfcc_t * fr_cep, mfcc_t * fr_spec)
755 fe_dct3(fe, fr_cep, fr_spec);
760 powspec =
ckd_malloc(fe->mel_fb->num_filters *
sizeof(powspec_t));
761 fe_dct3(fe, fr_cep, powspec);
762 for (i = 0; i < fe->mel_fb->num_filters; ++i)
763 fr_spec[i] = (mfcc_t) powspec[i];
Command-line and other configuration parsing and handling.
SPHINXBASE_EXPORT cmd_ln_t * cmd_ln_retain(cmd_ln_t *cmdln)
Retain ownership of a command-line argument set.
#define E_INFO(...)
Print logging information to standard error stream.
#define ckd_calloc_2d(d1, d2, sz)
Macro for ckd_calloc_2d
#define ckd_calloc(n, sz)
Macros to simplify the use of above functions.
#define E_ERROR(...)
Print error message to error log.
Base struct to hold all structures for MFCC computation.
Sphinx's memory allocation/deallocation routines.
SPHINXBASE_EXPORT int cmd_ln_free_r(cmd_ln_t *cmdln)
Release a command-line argument set and all associated strings.
Basic type definitions used in Sphinx.
SPHINXBASE_EXPORT char const * cmd_ln_str_r(cmd_ln_t *cmdln, char const *name)
Retrieve a string from a command-line object.
#define s3_rand_seed(s)
Macros to simplify calling of random generator function.
SPHINXBASE_EXPORT void ckd_free(void *ptr)
Test and free a 1-D array.
SPHINXBASE_EXPORT cmd_ln_t * cmd_ln_get(void)
Retrieve the global cmd_ln_t object used by non-re-entrant functions.
Implementation of logging routines.
Argument definition structure.
High-performance portable random generator created by Takuji Nishimura and Makoto Matsumoto...
Opaque structure used to hold the results of command-line parsing.
#define ckd_malloc(sz)
Macro for ckd_malloc
#define cmd_ln_boolean_r(c, n)
Retrieve a boolean value from a command-line object.
Structure for the front-end computation.