44 #include <sphinxbase/byteorder.h>
46 #include "ngram_model_trie.h"
48 static const char trie_hdr[] =
"Trie Language Model";
49 static const char dmp_hdr[] =
"Darpa Trigram LM";
56 read_counts_arpa(
lineiter_t ** li, uint32 * counts,
int *order)
58 int32 ngram, prev_ngram;
64 if (strcmp((*li)->buf,
"\\data\\") == 0)
69 if (*li == NULL || strcmp((*li)->buf,
"\\data\\") != 0) {
70 E_INFO(
"No \\data\\ mark in LM file\n");
77 if (sscanf((*li)->buf,
"ngram %d=%d", &ngram, &ngram_cnt) != 2)
79 if (ngram != prev_ngram + 1) {
81 (
"Ngram counts in LM file is not in order. %d goes after %d\n",
86 counts[*order] = ngram_cnt;
91 E_ERROR(
"EOF while reading ngram counts\n");
98 if (strcmp((*li)->buf,
"\\1-grams:") == 0)
111 string_comparator(
const void *a,
const void *b)
113 const char **ia = (
const char **) a;
114 const char **ib = (
const char **) b;
115 return strcmp(*ia, *ib);
128 for (i = 0; i < count; i++) {
132 (
"Unexpected end of ARPA file. Failed to read %dth unigram\n",
137 if ((n =
str2words((*li)->buf, wptr, 3)) < n_parts) {
138 if ((*li)->buf[0] !=
'\0')
139 E_WARN(
"Format error; unigram ignored: %s\n", (*li)->buf);
146 if (unigram->prob > 0) {
147 E_WARN(
"Unigram [%s] has positive probability. Zeroize\n",
151 if (n == n_parts + 1) {
164 for (i = 0; i < count; i++) {
167 (
void *) (
long) i)) != (
void *) (
long) i) {
168 E_WARN(
"Duplicate word in dictionary: %s\n",
175 ngram_model_trie_read_arpa(
cmd_ln_t * config,
184 uint32 counts[NGRAM_MAX_ORDER];
185 uint32 fixed_counts[NGRAM_MAX_ORDER];
189 E_INFO(
"Trying to read LM in arpa format\n");
190 if ((fp =
fopen_comp(path,
"r", &is_pipe)) == NULL) {
191 E_ERROR(
"File %s not found\n", path);
198 if (read_counts_arpa(&li, counts, &order) == -1) {
205 E_INFO(
"LM of order %d\n", order);
206 for (i = 0; i < order; i++) {
207 E_INFO(
"#%d-grams: %d\n", i + 1, counts[i]);
211 ngram_model_init(base, &ngram_model_trie_funcs, lmath, order,
215 model->
trie = lm_trie_create(counts[0], QUANT_16, order);
216 read_1grams_arpa(&li, counts[0], base, model->
trie->unigrams);
220 ngrams_raw_read_arpa(&li, base->
lmath, counts, order,
222 ngrams_raw_fix_counts(raw_ngrams, counts, fixed_counts, order);
223 for (i = 0; i < order; i++) {
224 base->
n_counts[i] = fixed_counts[i];
226 lm_trie_alloc_ngram(model->
trie, fixed_counts, order);
227 lm_trie_build(model->
trie, raw_ngrams, counts, order);
228 ngrams_raw_free(raw_ngrams, counts, order);
241 int n_hist,
int order,
int max_order)
243 if (n_hist > 0 && range.begin == range.end) {
248 for (i = 0; i < counts[0]; i++) {
250 unigram_find(trie->unigrams, i, &node);
252 fill_raw_ngram(trie, lmath, raw_ngrams, raw_ngram_idx, counts,
253 node, hist, 1, order, max_order);
256 else if (n_hist < order - 1) {
261 middle_t *middle = &trie->middle_begin[n_hist - 1];
262 for (ptr = range.begin; ptr < range.end; ptr++) {
263 address.base = middle->base.base;
264 address.offset = ptr * middle->base.total_bits;
267 middle->base.word_mask);
268 hist[n_hist] = new_word;
269 address.offset += middle->base.word_bits + middle->quant_bits;
272 middle->next_mask.mask);
274 (ptr + 1) * middle->base.total_bits +
275 middle->base.word_bits + middle->quant_bits;
278 middle->next_mask.mask);
279 fill_raw_ngram(trie, lmath, raw_ngrams, raw_ngram_idx, counts,
280 node, hist, n_hist + 1, order, max_order);
288 assert(n_hist == order - 1);
289 for (ptr = range.begin; ptr < range.end; ptr++) {
290 ngram_raw_t *raw_ngram = &raw_ngrams[*raw_ngram_idx];
292 (
float *)
ckd_calloc(order == max_order ? 1 : 2,
293 sizeof(*raw_ngram->weights));
294 if (order == max_order) {
296 address.base = longest->base.base;
297 address.offset = ptr * longest->base.total_bits;
300 longest->base.word_mask);
301 address.offset += longest->base.word_bits;
302 prob = lm_trie_quant_lpread(trie->quant, address);
305 middle_t *middle = &trie->middle_begin[n_hist - 1];
306 address.base = middle->base.base;
307 address.offset = ptr * middle->base.total_bits;
310 middle->base.word_mask);
311 address.offset += middle->base.word_bits;
313 lm_trie_quant_mpread(trie->quant, address, n_hist - 1);
315 lm_trie_quant_mboread(trie->quant, address,
317 raw_ngram->weights[1] =
320 raw_ngram->weights[0] =
323 (uint32 *)
ckd_calloc(order,
sizeof(*raw_ngram->words));
324 for (i = 0; i <= n_hist; i++) {
325 raw_ngram->words[i] = hist[n_hist - i];
333 ngram_model_trie_write_arpa(
ngram_model_t * base,
const char *path)
338 FILE *fp = fopen(path,
"w");
340 E_ERROR(
"Unable to open %s to write arpa LM from trie\n", path);
344 "This is an ARPA-format language model file, generated by CMU Sphinx\n");
346 fprintf(fp,
"\\data\\\n");
347 for (i = 0; i < base->
n; ++i) {
348 fprintf(fp,
"ngram %d=%d\n", i + 1, base->
n_counts[i]);
351 fprintf(fp,
"\n\\1-grams:\n");
352 for (j = 0; j < base->
n_counts[0]; j++) {
354 fprintf(fp,
"%.4f\t%s",
358 fprintf(fp,
"\t%.4f",
365 for (i = 2; i <= base->
n; ++i) {
368 sizeof(*raw_ngrams));
369 uint32 raw_ngram_idx;
371 uint32 hist[NGRAM_MAX_ORDER];
374 range.begin = range.end = 0;
376 fill_raw_ngram(model->
trie, base->
lmath, raw_ngrams,
377 &raw_ngram_idx, base->
n_counts, range, hist, 0,
379 assert(raw_ngram_idx == base->
n_counts[i - 1]);
380 ngram_comparator(NULL, &i);
381 qsort(raw_ngrams, (
size_t) base->
n_counts[i - 1],
384 fprintf(fp,
"\n\\%d-grams:\n", i);
385 for (j = 0; j < base->
n_counts[i - 1]; j++) {
387 fprintf(fp,
"%.4f", raw_ngrams[j].weights[0]);
388 for (k = 0; k < i; k++) {
390 base->
word_str[raw_ngrams[j].words[k]]);
394 fprintf(fp,
"\t%.4f", raw_ngrams[j].weights[1]);
402 fprintf(fp,
"\n\\end\\\n");
414 fread(&k,
sizeof(k), 1, fp);
415 tmp_word_str = (
char *)
ckd_calloc((
size_t) k, 1);
416 fread(tmp_word_str, 1, (
size_t) k, fp);
419 for (i = 0, j = 0; i < (uint32) k; i++)
420 if (tmp_word_str[i] ==
'\0')
424 (
"Error reading word strings (%d doesn't match n_unigrams %d)\n",
430 for (i = 0; i < base->
n_counts[0]; i++) {
433 (
void *) (
long) i) != (
void *) (
long) i) {
434 E_WARN(
"Duplicate word in dictionary: %s\n",
443 ngram_model_trie_read_bin(
cmd_ln_t * config,
452 uint32 counts[NGRAM_MAX_ORDER];
456 E_INFO(
"Trying to read LM in trie binary format\n");
457 if ((fp =
fopen_comp(path,
"rb", &is_pipe)) == NULL) {
458 E_ERROR(
"File %s not found\n", path);
461 hdr_size = strlen(trie_hdr);
462 hdr = (
char *)
ckd_calloc(hdr_size + 1,
sizeof(*hdr));
463 fread(hdr,
sizeof(*hdr), hdr_size, fp);
464 cmp_res = strcmp(hdr, trie_hdr);
467 E_INFO(
"Header doesn't match\n");
473 fread(&order,
sizeof(order), 1, fp);
474 for (i = 0; i < order; i++) {
475 fread(&counts[i],
sizeof(counts[i]), 1, fp);
477 ngram_model_init(base, &ngram_model_trie_funcs, lmath, order,
479 for (i = 0; i < order; i++) {
483 model->
trie = lm_trie_read_bin(counts, order, fp);
484 read_word_str(base, fp);
497 for (i = 0; i < model->
n_counts[0]; i++)
498 k += strlen(model->
word_str[i]) + 1;
499 fwrite(&k,
sizeof(k), 1, fp);
500 for (i = 0; i < model->
n_counts[0]; i++)
505 ngram_model_trie_write_bin(
ngram_model_t * base,
const char *path)
512 E_ERROR(
"Unable to open %s to write binary trie LM\n", path);
516 fwrite(trie_hdr,
sizeof(*trie_hdr), strlen(trie_hdr), fp);
517 fwrite(&model->
base.
n,
sizeof(model->
base.
n), 1, fp);
518 for (i = 0; i < model->
base.
n; i++) {
523 write_word_str(fp, base);
529 ngram_model_trie_read_dmp(
cmd_ln_t * config,
530 const char *file_name,
logmath_t * lmath)
539 uint32 fixed_counts[3];
540 uint32 *unigram_next;
548 E_INFO(
"Trying to read LM in DMP format\n");
549 if ((fp =
fopen_comp(file_name,
"rb", &is_pipe)) == NULL) {
550 E_ERROR(
"Dump file %s not found\n", file_name);
555 fread(&k,
sizeof(k), 1, fp);
556 if (k != strlen(dmp_hdr) + 1) {
558 if (k != strlen(dmp_hdr) + 1) {
560 (
"Wrong magic header size number %x: %s is not a dump file\n",
566 if (fread(str, 1, k, fp) != (
size_t) k) {
567 E_ERROR(
"Cannot read header\n");
570 if (strncmp(str, dmp_hdr, k) != 0) {
571 E_ERROR(
"Wrong header %s: %s is not a dump file\n", dmp_hdr);
575 if (fread(&k,
sizeof(k), 1, fp) != 1)
579 if (fread(str, 1, k, fp) != (
size_t) k) {
580 E_ERROR(
"Cannot read LM filename in header\n");
585 if (fread(&vn,
sizeof(vn), 1, fp) != 1)
591 if (fread(&ts,
sizeof(ts), 1, fp) != 1)
598 if (fread(&k,
sizeof(k), 1, fp) != 1)
604 if (fread(str, 1, k, fp) != (
size_t) k) {
605 E_ERROR(
"Failed to read word\n");
610 if (fread(&count,
sizeof(count), 1, fp) != 1)
620 if (fread(&count,
sizeof(count), 1, fp) != 1)
625 if (fread(&count,
sizeof(count), 1, fp) != 1)
630 E_INFO(
"ngrams 1=%d, 2=%d, 3=%d\n", counts[0], counts[1], counts[2]);
636 else if (counts[1] > 0)
640 ngram_model_init(base, &ngram_model_trie_funcs, lmath, order,
643 model->
trie = lm_trie_create(counts[0], QUANT_16, order);
646 (uint32 *)
ckd_calloc((int32) counts[0] + 1,
sizeof(unigram_next));
647 for (j = 0; j <= (int32) counts[0]; j++) {
651 fread(&bigrams,
sizeof(int32), 1, fp);
653 fread(&weight,
sizeof(weight), 1, fp);
655 SWAP_INT32(&weight.l);
657 model->
trie->unigrams[j].prob = weight.f;
658 fread(&weight,
sizeof(weight), 1, fp);
660 SWAP_INT32(&weight.l);
662 model->
trie->unigrams[j].bo = weight.f;
664 fread(&bigrams,
sizeof(int32), 1, fp);
666 SWAP_INT32(&bigrams);
667 model->
trie->unigrams[j].next = bigrams;
668 unigram_next[j] = bigrams;
673 ngrams_raw_read_dmp(fp, lmath, counts, order, unigram_next,
675 ngrams_raw_fix_counts(raw_ngrams, counts, fixed_counts, order);
676 for (i = 0; i < order; i++) {
677 base->
n_counts[i] = fixed_counts[i];
681 lm_trie_alloc_ngram(model->
trie, order > 2 ? fixed_counts : counts,
683 lm_trie_build(model->
trie, raw_ngrams, counts, order);
687 ngrams_raw_free(raw_ngrams, counts, order);
692 read_word_str(base, fp);
702 lm_trie_free(model->
trie);
706 trie_apply_weights(
ngram_model_t * base, float32 lw, float32 wip)
717 return (int32) (score * base->
lw + base->
log_wip);
721 ngram_model_trie_raw_score(
ngram_model_t * base, int32 wid, int32 * hist,
722 int32 n_hist, int32 * n_used)
727 if (n_hist > model->
base.
n - 1)
728 n_hist = model->
base.
n - 1;
729 for (i = 0; i < n_hist; i++) {
736 return (int32) lm_trie_score(model->
trie, model->
base.
n, wid, hist,
741 ngram_model_trie_score(
ngram_model_t * base, int32 wid, int32 * hist,
742 int32 n_hist, int32 * n_used)
744 return weight_score(base,
745 ngram_model_trie_raw_score(base, wid, hist, n_hist,
750 lm_trie_add_ug(
ngram_model_t * base, int32 wid, int32 lweight)
755 assert(!NGRAM_IS_CLASSWID(wid));
758 model->
trie->unigrams =
760 sizeof(*model->
trie->unigrams) *
762 memset(model->
trie->unigrams + (base->
n_counts[0] + 1), 0,
767 model->
trie->unigrams[wid + 1].next = model->
trie->unigrams[wid].next;
768 model->
trie->unigrams[wid].prob = (float) lweight;
771 model->
trie->unigrams[wid].bo = 0;
777 if ((uint32) wid >= base->
n_counts[0])
780 return (int32) weight_score(base, lweight);
788 memset(trie->prev_hist, -1,
sizeof(trie->prev_hist));
789 memset(trie->backoff, 0,
sizeof(trie->backoff));
794 ngram_model_trie_free,
796 ngram_model_trie_score,
797 ngram_model_trie_raw_score,
#define E_ERROR_SYSTEM(...)
Print error text, then call perror("").
lm_trie_t * trie
Trie structure that stores ngram relations and weights.
Miscellaneous useful string functions.
#define E_INFO(...)
Print logging information to standard error stream.
#define ckd_calloc(n, sz)
Macros to simplify the use of above functions.
#define E_ERROR(...)
Print error message to error log.
hash_table_t * wid
Mapping of unigram names to word IDs.
char ** word_str
Unigram names.
ngram_model_t base
Base ngram_model_t structure.
Sphinx's memory allocation/deallocation routines.
SPHINXBASE_EXPORT int logmath_log(logmath_t *lmath, float64 p)
Convert linear floating point number to integer log in base B.
uint8 writable
Are word strings writable?
#define ckd_salloc(ptr)
Macro for ckd_salloc.
SPHINXBASE_EXPORT void ckd_free(void *ptr)
Test and free a 1-D array.
Structure that stores address of certain value in bit array.
SPHINXBASE_EXPORT float64 logmath_log_float_to_log10(logmath_t *lmath, float log_p)
Convert float log in base B to base 10 log.
int32 n_1g_alloc
Number of allocated word strings (for new word addition)
SPHINXBASE_EXPORT double atof_c(char const *str)
Locale independent version of atof().
SPHINXBASE_EXPORT void lineiter_free(lineiter_t *li)
Stop reading lines from a file.
uint32 * n_counts
Counts for 1-grams, 2-grams, 3-grams, ...
SPHINXBASE_EXPORT uint32 bitarr_read_int25(bitarr_address_t address, uint8 length, uint32 mask)
Read uint32 value from bit array.
SPHINXBASE_EXPORT lineiter_t * lineiter_next(lineiter_t *li)
Move to the next line in the file.
SPHINXBASE_EXPORT lineiter_t * lineiter_start(FILE *fh)
Start reading lines from a file.
uint8 n
This is an n-gram model (1, 2, 3, ...).
Implementation of logging routines.
logmath_t * lmath
Log-math object.
SPHINXBASE_EXPORT void * hash_table_enter(hash_table_t *h, const char *key, void *val)
Try to add a new entry with given key and associated value to hash table h.
SPHINXBASE_EXPORT FILE * fopen_comp(const char *file, const char *mode, int32 *ispipe)
Like fopen, but use popen and zcat if it is determined that "file" is compressed (i.e., has a .z, .Z, .gz, or .GZ extension).
#define E_WARN(...)
Print warning message to error log.
SPHINXBASE_EXPORT float logmath_log10_to_log_float(logmath_t *lmath, float64 log_p)
Convert base 10 log (in floating point) to float log in base B.
SPHINXBASE_EXPORT int32 str2words(char *line, char **wptr, int32 n_wptr)
Convert a line to an array of "words", based on whitespace separators.
Opaque structure used to hold the results of command-line parsing.
Implementation-specific functions for operating on ngram_model_t objects.
float32 lw
Language model scaling factor.
SPHINXBASE_EXPORT char * string_trim(char *string, enum string_edge_e which)
Remove whitespace from a string, modifying it in-place.
Common implementation of ngram_model_t.
SPHINXBASE_EXPORT void fclose_comp(FILE *fp, int32 ispipe)
Close a file opened using fopen_comp.
#define ckd_realloc(ptr, sz)
Macro for ckd_realloc.
file IO related operations.
int32 log_wip
Log of word insertion penalty.