SphinxBase  5prealpha
ngram_model_trie.c
1 /* -*- c-basic-offset: 4; indent-tabs-mode: nil -*- */
2 /* ====================================================================
3  * Copyright (c) 2015 Carnegie Mellon University. All rights
4  * reserved.
5  *
6  * Redistribution and use in source and binary forms, with or without
7  * modification, are permitted provided that the following conditions
8  * are met:
9  *
10  * 1. Redistributions of source code must retain the above copyright
11  * notice, this list of conditions and the following disclaimer.
12  *
13  * 2. Redistributions in binary form must reproduce the above copyright
14  * notice, this list of conditions and the following disclaimer in
15  * the documentation and/or other materials provided with the
16  * distribution.
17  *
18  * This work was supported in part by funding from the Defense Advanced
19  * Research Projects Agency and the National Science Foundation of the
20  * United States of America, and the CMU Sphinx Speech Consortium.
21  *
22  * THIS SOFTWARE IS PROVIDED BY CARNEGIE MELLON UNIVERSITY ``AS IS'' AND
23  * ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
24  * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
25  * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY
26  * NOR ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
27  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
28  * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
29  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
30  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
31  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
32  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
33  *
34  * ====================================================================
35  *
36  */
37 
38 #include <string.h>
39 
40 #include <sphinxbase/err.h>
41 #include <sphinxbase/pio.h>
42 #include <sphinxbase/strfuncs.h>
43 #include <sphinxbase/ckd_alloc.h>
44 #include <sphinxbase/byteorder.h>
45 
46 #include "ngram_model_trie.h"
47 
48 static const char trie_hdr[] = "Trie Language Model";
49 static const char dmp_hdr[] = "Darpa Trigram LM";
50 static ngram_funcs_t ngram_model_trie_funcs;
51 
52 /*
53  * Read and return #unigrams, #bigrams, #trigrams as stated in input file.
54  */
55 static int
56 read_counts_arpa(lineiter_t ** li, uint32 * counts, int *order)
57 {
58  int32 ngram, prev_ngram;
59  uint32 ngram_cnt;
60 
61  /* skip file until past the '\data\' marker */
62  while (*li) {
63  string_trim((*li)->buf, STRING_BOTH);
64  if (strcmp((*li)->buf, "\\data\\") == 0)
65  break;
66  *li = lineiter_next(*li);
67  }
68 
69  if (*li == NULL || strcmp((*li)->buf, "\\data\\") != 0) {
70  E_INFO("No \\data\\ mark in LM file\n");
71  return -1;
72  }
73 
74  prev_ngram = 0;
75  *order = 0;
76  while ((*li = lineiter_next(*li))) {
77  if (sscanf((*li)->buf, "ngram %d=%d", &ngram, &ngram_cnt) != 2)
78  break;
79  if (ngram != prev_ngram + 1) {
80  E_ERROR
81  ("Ngram counts in LM file is not in order. %d goes after %d\n",
82  ngram, prev_ngram);
83  return -1;
84  }
85  prev_ngram = ngram;
86  counts[*order] = ngram_cnt;
87  (*order)++;
88  }
89 
90  if (*li == NULL) {
91  E_ERROR("EOF while reading ngram counts\n");
92  return -1;
93  }
94 
95  /* Position iterator to the unigrams header '\1-grams:\' */
96  while ((*li = lineiter_next(*li))) {
97  string_trim((*li)->buf, STRING_BOTH);
98  if (strcmp((*li)->buf, "\\1-grams:") == 0)
99  break;
100  }
101 
102  if (*li == NULL) {
103  E_ERROR_SYSTEM("Failed to read \\1-grams: mark");
104  return -1;
105  }
106 
107  return 0;
108 }
109 
/* qsort-compatible lexicographic comparison of two char* elements. */
int
string_comparator(const void *a, const void *b)
{
    return strcmp(*(const char **) a, *(const char **) b);
}
117 
118 static void
119 read_1grams_arpa(lineiter_t ** li, uint32 count, ngram_model_t * base,
120  unigram_t * unigrams)
121 {
122  uint32 i;
123  int n;
124  int n_parts;
125  char *wptr[3];
126 
127  n_parts = 2;
128  for (i = 0; i < count; i++) {
129  *li = lineiter_next(*li);
130  if (*li == NULL) {
131  E_ERROR
132  ("Unexpected end of ARPA file. Failed to read %dth unigram\n",
133  i + 1);
134  break;
135  }
136  string_trim((*li)->buf, STRING_BOTH);
137  if ((n = str2words((*li)->buf, wptr, 3)) < n_parts) {
138  if ((*li)->buf[0] != '\0')
139  E_WARN("Format error; unigram ignored: %s\n", (*li)->buf);
140  continue;
141  }
142  else {
143  unigram_t *unigram = &unigrams[i];
144  unigram->prob =
145  logmath_log10_to_log_float(base->lmath, atof_c(wptr[0]));
146  if (unigram->prob > 0) {
147  E_WARN("Unigram [%s] has positive probability. Zeroize\n",
148  wptr[1]);
149  unigram->prob = 0;
150  }
151  if (n == n_parts + 1) {
152  unigram->bo =
154  atof_c(wptr[2]));
155  }
156  else {
157  unigram->bo = 0.0f;
158  }
159  //TODO classify float with fpclassify and warn if bad value occurred
160  base->word_str[i] = ckd_salloc(wptr[1]);
161  }
162  }
163  //fill hash-table that maps unigram names to their word ids
164  for (i = 0; i < count; i++) {
165  if ((hash_table_enter
166  (base->wid, base->word_str[i],
167  (void *) (long) i)) != (void *) (long) i) {
168  E_WARN("Duplicate word in dictionary: %s\n",
169  base->word_str[i]);
170  }
171  }
172 }
173 
175 ngram_model_trie_read_arpa(cmd_ln_t * config,
176  const char *path, logmath_t * lmath)
177 {
178  FILE *fp;
179  lineiter_t *li;
180  ngram_model_trie_t *model;
181  ngram_model_t *base;
182  ngram_raw_t **raw_ngrams;
183  int32 is_pipe;
184  uint32 counts[NGRAM_MAX_ORDER];
185  uint32 fixed_counts[NGRAM_MAX_ORDER];
186  int order;
187  int i;
188 
189  E_INFO("Trying to read LM in arpa format\n");
190  if ((fp = fopen_comp(path, "r", &is_pipe)) == NULL) {
191  E_ERROR("File %s not found\n", path);
192  return NULL;
193  }
194 
195  model = (ngram_model_trie_t *) ckd_calloc(1, sizeof(*model));
196  li = lineiter_start(fp);
197  /* Read n-gram counts from file */
198  if (read_counts_arpa(&li, counts, &order) == -1) {
199  ckd_free(model);
200  lineiter_free(li);
201  fclose_comp(fp, is_pipe);
202  return NULL;
203  }
204 
205  E_INFO("LM of order %d\n", order);
206  for (i = 0; i < order; i++) {
207  E_INFO("#%d-grams: %d\n", i + 1, counts[i]);
208  }
209 
210  base = &model->base;
211  ngram_model_init(base, &ngram_model_trie_funcs, lmath, order,
212  (int32) counts[0]);
213  base->writable = TRUE;
214 
215  model->trie = lm_trie_create(counts[0], QUANT_16, order);
216  read_1grams_arpa(&li, counts[0], base, model->trie->unigrams);
217 
218  if (order > 1) {
219  raw_ngrams =
220  ngrams_raw_read_arpa(&li, base->lmath, counts, order,
221  base->wid);
222  ngrams_raw_fix_counts(raw_ngrams, counts, fixed_counts, order);
223  for (i = 0; i < order; i++) {
224  base->n_counts[i] = fixed_counts[i];
225  }
226  lm_trie_alloc_ngram(model->trie, fixed_counts, order);
227  lm_trie_build(model->trie, raw_ngrams, counts, order);
228  ngrams_raw_free(raw_ngrams, counts, order);
229  }
230 
231  lineiter_free(li);
232  fclose_comp(fp, is_pipe);
233 
234  return base;
235 }
236 
/*
 * Recursively walk the trie and materialize every stored ngram of the
 * requested `order` into raw_ngrams[] (words + weights in log10), for
 * ARPA export.
 *
 * hist[] accumulates the word-id path from the root (most recent word at
 * hist[n_hist]); *raw_ngram_idx is the next free slot in raw_ngrams.
 * `range` is the [begin, end) span of records at the current trie level.
 */
static void
fill_raw_ngram(lm_trie_t * trie, logmath_t * lmath,
               ngram_raw_t * raw_ngrams, uint32 * raw_ngram_idx,
               uint32 * counts, node_range_t range, uint32 * hist,
               int n_hist, int order, int max_order)
{
    if (n_hist > 0 && range.begin == range.end) {
        /* empty successor range: nothing stored under this history */
        return;
    }
    if (n_hist == 0) {
        /* Root level: seed the history with every unigram in turn */
        uint32 i;
        for (i = 0; i < counts[0]; i++) {
            node_range_t node;
            unigram_find(trie->unigrams, i, &node);
            hist[0] = i;
            fill_raw_ngram(trie, lmath, raw_ngrams, raw_ngram_idx, counts,
                           node, hist, 1, order, max_order);
        }
    }
    else if (n_hist < order - 1) {
        /* Interior level: scan the packed middle records in [begin, end) */
        uint32 ptr;
        node_range_t node;
        bitarr_address_t address;
        uint32 new_word;
        middle_t *middle = &trie->middle_begin[n_hist - 1];
        for (ptr = range.begin; ptr < range.end; ptr++) {
            address.base = middle->base.base;
            address.offset = ptr * middle->base.total_bits;
            new_word =
                bitarr_read_int25(address, middle->base.word_bits,
                                  middle->base.word_mask);
            hist[n_hist] = new_word;
            /* Child range = this record's next pointer .. the following
             * record's next pointer */
            address.offset += middle->base.word_bits + middle->quant_bits;
            node.begin =
                bitarr_read_int25(address, middle->next_mask.bits,
                                  middle->next_mask.mask);
            address.offset =
                (ptr + 1) * middle->base.total_bits +
                middle->base.word_bits + middle->quant_bits;
            node.end =
                bitarr_read_int25(address, middle->next_mask.bits,
                                  middle->next_mask.mask);
            fill_raw_ngram(trie, lmath, raw_ngrams, raw_ngram_idx, counts,
                           node, hist, n_hist + 1, order, max_order);
        }
    }
    else {
        /* Deepest level for this order: emit one raw ngram per record */
        bitarr_address_t address;
        uint32 ptr;
        float prob, backoff;
        int i;
        assert(n_hist == order - 1);
        for (ptr = range.begin; ptr < range.end; ptr++) {
            ngram_raw_t *raw_ngram = &raw_ngrams[*raw_ngram_idx];
            /* Highest order stores prob only; lower orders prob + backoff */
            raw_ngram->weights =
                (float *) ckd_calloc(order == max_order ? 1 : 2,
                                     sizeof(*raw_ngram->weights));
            if (order == max_order) {
                longest_t *longest = trie->longest;
                address.base = longest->base.base;
                address.offset = ptr * longest->base.total_bits;
                hist[n_hist] =
                    bitarr_read_int25(address, longest->base.word_bits,
                                      longest->base.word_mask);
                address.offset += longest->base.word_bits;
                prob = lm_trie_quant_lpread(trie->quant, address);
            }
            else {
                middle_t *middle = &trie->middle_begin[n_hist - 1];
                address.base = middle->base.base;
                address.offset = ptr * middle->base.total_bits;
                hist[n_hist] =
                    bitarr_read_int25(address, middle->base.word_bits,
                                      middle->base.word_mask);
                address.offset += middle->base.word_bits;
                prob =
                    lm_trie_quant_mpread(trie->quant, address, n_hist - 1);
                backoff =
                    lm_trie_quant_mboread(trie->quant, address,
                                          n_hist - 1);
                raw_ngram->weights[1] =
                    (float) logmath_log_float_to_log10(lmath, backoff);
            }
            raw_ngram->weights[0] =
                (float) logmath_log_float_to_log10(lmath, prob);
            /* hist[] holds the path most-recent-first; reverse into words[] */
            raw_ngram->words =
                (uint32 *) ckd_calloc(order, sizeof(*raw_ngram->words));
            for (i = 0; i <= n_hist; i++) {
                raw_ngram->words[i] = hist[n_hist - i];
            }
            (*raw_ngram_idx)++;
        }
    }
}
331 
/*
 * Export the trie LM in ARPA text format to `path`.
 * Returns -1 if the file cannot be opened, otherwise the fclose()
 * status (0 on success).
 */
int
ngram_model_trie_write_arpa(ngram_model_t * base, const char *path)
{
    int i;
    uint32 j;
    ngram_model_trie_t *model = (ngram_model_trie_t *) base;
    FILE *fp = fopen(path, "w");
    if (!fp) {
        E_ERROR("Unable to open %s to write arpa LM from trie\n", path);
        return -1;
    }
    fprintf(fp,
            "This is an ARPA-format language model file, generated by CMU Sphinx\n");
    /* Write N-gram counts. */
    fprintf(fp, "\\data\\\n");
    for (i = 0; i < base->n; ++i) {
        fprintf(fp, "ngram %d=%d\n", i + 1, base->n_counts[i]);
    }
    /* Write 1-grams */
    fprintf(fp, "\n\\1-grams:\n");
    for (j = 0; j < base->n_counts[0]; j++) {
        unigram_t *unigram = &model->trie->unigrams[j];
        /* Stored weights are in the logmath base; convert back to log10 */
        fprintf(fp, "%.4f\t%s",
                logmath_log_float_to_log10(base->lmath, unigram->prob),
                base->word_str[j]);
        if (base->n > 1) {
            fprintf(fp, "\t%.4f",
                    logmath_log_float_to_log10(base->lmath, unigram->bo));
        }
        fprintf(fp, "\n");
    }
    /* Write ngrams of order 2..n, one section each */
    if (base->n > 1) {
        for (i = 2; i <= base->n; ++i) {
            ngram_raw_t *raw_ngrams =
                (ngram_raw_t *) ckd_calloc((size_t) base->n_counts[i - 1],
                                           sizeof(*raw_ngrams));
            uint32 raw_ngram_idx;
            uint32 j;
            uint32 hist[NGRAM_MAX_ORDER];
            node_range_t range;
            raw_ngram_idx = 0;
            range.begin = range.end = 0;        /* initialize to disable warning */
            /* we need to iterate over a trie here. recursion should do the job */
            fill_raw_ngram(model->trie, base->lmath, raw_ngrams,
                           &raw_ngram_idx, base->n_counts, range, hist, 0,
                           i, base->n);
            assert(raw_ngram_idx == base->n_counts[i - 1]);
            /* NULL call configures the comparator's order before qsort
             * uses it (setting up order in comparator) */
            ngram_comparator(NULL, &i);
            qsort(raw_ngrams, (size_t) base->n_counts[i - 1],
                  sizeof(ngram_raw_t), &ngram_comparator);
            /* now we write sorted ngrams to file */
            fprintf(fp, "\n\\%d-grams:\n", i);
            for (j = 0; j < base->n_counts[i - 1]; j++) {
                int k;
                fprintf(fp, "%.4f", raw_ngrams[j].weights[0]);
                for (k = 0; k < i; k++) {
                    fprintf(fp, "\t%s",
                            base->word_str[raw_ngrams[j].words[k]]);
                }
                ckd_free(raw_ngrams[j].words);
                if (i < base->n) {
                    /* non-highest orders also carry a backoff weight */
                    fprintf(fp, "\t%.4f", raw_ngrams[j].weights[1]);
                }
                ckd_free(raw_ngrams[j].weights);
                fprintf(fp, "\n");
            }
            ckd_free(raw_ngrams);
        }
    }
    fprintf(fp, "\n\\end\\\n");
    return fclose(fp);
}
405 
406 static void
407 read_word_str(ngram_model_t * base, FILE * fp)
408 {
409  int32 k;
410  uint32 i, j;
411  char *tmp_word_str;
412  /* read ascii word strings */
413  base->writable = TRUE;
414  fread(&k, sizeof(k), 1, fp);
415  tmp_word_str = (char *) ckd_calloc((size_t) k, 1);
416  fread(tmp_word_str, 1, (size_t) k, fp);
417 
418  /* First make sure string just read contains n_counts[0] words (PARANOIA!!) */
419  for (i = 0, j = 0; i < (uint32) k; i++)
420  if (tmp_word_str[i] == '\0')
421  j++;
422  if (j != base->n_counts[0]) {
423  E_ERROR
424  ("Error reading word strings (%d doesn't match n_unigrams %d)\n",
425  j, base->n_counts[0]);
426  }
427 
428  /* Break up string just read into words */
429  j = 0;
430  for (i = 0; i < base->n_counts[0]; i++) {
431  base->word_str[i] = ckd_salloc(tmp_word_str + j);
432  if (hash_table_enter(base->wid, base->word_str[i],
433  (void *) (long) i) != (void *) (long) i) {
434  E_WARN("Duplicate word in dictionary: %s\n",
435  base->word_str[i]);
436  }
437  j += strlen(base->word_str[i]) + 1;
438  }
439  free(tmp_word_str);
440 }
441 
443 ngram_model_trie_read_bin(cmd_ln_t * config,
444  const char *path, logmath_t * lmath)
445 {
446  int32 is_pipe;
447  FILE *fp;
448  size_t hdr_size;
449  char *hdr;
450  int cmp_res;
451  uint8 i, order;
452  uint32 counts[NGRAM_MAX_ORDER];
453  ngram_model_trie_t *model;
454  ngram_model_t *base;
455 
456  E_INFO("Trying to read LM in trie binary format\n");
457  if ((fp = fopen_comp(path, "rb", &is_pipe)) == NULL) {
458  E_ERROR("File %s not found\n", path);
459  return NULL;
460  }
461  hdr_size = strlen(trie_hdr);
462  hdr = (char *) ckd_calloc(hdr_size + 1, sizeof(*hdr));
463  fread(hdr, sizeof(*hdr), hdr_size, fp);
464  cmp_res = strcmp(hdr, trie_hdr);
465  ckd_free(hdr);
466  if (cmp_res) {
467  E_INFO("Header doesn't match\n");
468  fclose_comp(fp, is_pipe);
469  return NULL;
470  }
471  model = (ngram_model_trie_t *) ckd_calloc(1, sizeof(*model));
472  base = &model->base;
473  fread(&order, sizeof(order), 1, fp);
474  for (i = 0; i < order; i++) {
475  fread(&counts[i], sizeof(counts[i]), 1, fp);
476  }
477  ngram_model_init(base, &ngram_model_trie_funcs, lmath, order,
478  (int32) counts[0]);
479  for (i = 0; i < order; i++) {
480  base->n_counts[i] = counts[i];
481  }
482 
483  model->trie = lm_trie_read_bin(counts, order, fp);
484  read_word_str(base, fp);
485  fclose_comp(fp, is_pipe);
486 
487  return base;
488 }
489 
490 static void
491 write_word_str(FILE * fp, ngram_model_t * model)
492 {
493  int32 k;
494  uint32 i;
495 
496  k = 0;
497  for (i = 0; i < model->n_counts[0]; i++)
498  k += strlen(model->word_str[i]) + 1;
499  fwrite(&k, sizeof(k), 1, fp);
500  for (i = 0; i < model->n_counts[0]; i++)
501  fwrite(model->word_str[i], 1, strlen(model->word_str[i]) + 1, fp);
502 }
503 
504 int
505 ngram_model_trie_write_bin(ngram_model_t * base, const char *path)
506 {
507  int i;
508  int32 is_pipe;
509  ngram_model_trie_t *model = (ngram_model_trie_t *) base;
510  FILE *fp = fopen_comp(path, "wb", &is_pipe);
511  if (!fp) {
512  E_ERROR("Unable to open %s to write binary trie LM\n", path);
513  return -1;
514  }
515 
516  fwrite(trie_hdr, sizeof(*trie_hdr), strlen(trie_hdr), fp);
517  fwrite(&model->base.n, sizeof(model->base.n), 1, fp);
518  for (i = 0; i < model->base.n; i++) {
519  fwrite(&model->base.n_counts[i], sizeof(model->base.n_counts[i]),
520  1, fp);
521  }
522  lm_trie_write_bin(model->trie, base->n_counts[0], fp);
523  write_word_str(fp, base);
524  fclose_comp(fp, is_pipe);
525  return 0;
526 }
527 
529 ngram_model_trie_read_dmp(cmd_ln_t * config,
530  const char *file_name, logmath_t * lmath)
531 {
532  uint8 do_swap;
533  int32 is_pipe;
534  int32 k;
535  uint32 j;
536  int32 vn, ts;
537  int32 count;
538  uint32 counts[3];
539  uint32 fixed_counts[3];
540  uint32 *unigram_next;
541  int i, order;
542  char str[1024];
543  FILE *fp;
544  ngram_model_trie_t *model;
545  ngram_model_t *base;
546  ngram_raw_t **raw_ngrams;
547 
548  E_INFO("Trying to read LM in DMP format\n");
549  if ((fp = fopen_comp(file_name, "rb", &is_pipe)) == NULL) {
550  E_ERROR("Dump file %s not found\n", file_name);
551  return NULL;
552  }
553 
554  do_swap = FALSE;
555  fread(&k, sizeof(k), 1, fp);
556  if (k != strlen(dmp_hdr) + 1) {
557  SWAP_INT32(&k);
558  if (k != strlen(dmp_hdr) + 1) {
559  E_ERROR
560  ("Wrong magic header size number %x: %s is not a dump file\n",
561  k, file_name);
562  return NULL;
563  }
564  do_swap = 1;
565  }
566  if (fread(str, 1, k, fp) != (size_t) k) {
567  E_ERROR("Cannot read header\n");
568  return NULL;
569  }
570  if (strncmp(str, dmp_hdr, k) != 0) {
571  E_ERROR("Wrong header %s: %s is not a dump file\n", dmp_hdr);
572  return NULL;
573  }
574 
575  if (fread(&k, sizeof(k), 1, fp) != 1)
576  return NULL;
577  if (do_swap)
578  SWAP_INT32(&k);
579  if (fread(str, 1, k, fp) != (size_t) k) {
580  E_ERROR("Cannot read LM filename in header\n");
581  return NULL;
582  }
583 
584  /* read version#, if present (must be <= 0) */
585  if (fread(&vn, sizeof(vn), 1, fp) != 1)
586  return NULL;
587  if (do_swap)
588  SWAP_INT32(&vn);
589  if (vn <= 0) {
590  /* read and don't compare timestamps (we don't care) */
591  if (fread(&ts, sizeof(ts), 1, fp) != 1)
592  return NULL;
593  if (do_swap)
594  SWAP_INT32(&ts);
595 
596  /* read and skip format description */
597  for (;;) {
598  if (fread(&k, sizeof(k), 1, fp) != 1)
599  return NULL;
600  if (do_swap)
601  SWAP_INT32(&k);
602  if (k == 0)
603  break;
604  if (fread(str, 1, k, fp) != (size_t) k) {
605  E_ERROR("Failed to read word\n");
606  return NULL;
607  }
608  }
609  /* read model->ucount */
610  if (fread(&count, sizeof(count), 1, fp) != 1)
611  return NULL;
612  if (do_swap)
613  SWAP_INT32(&count);
614  counts[0] = count;
615  }
616  else {
617  counts[0] = vn;
618  }
619  /* read model->bcount, tcount */
620  if (fread(&count, sizeof(count), 1, fp) != 1)
621  return NULL;
622  if (do_swap)
623  SWAP_INT32(&count);
624  counts[1] = count;
625  if (fread(&count, sizeof(count), 1, fp) != 1)
626  return NULL;
627  if (do_swap)
628  SWAP_INT32(&count);
629  counts[2] = count;
630  E_INFO("ngrams 1=%d, 2=%d, 3=%d\n", counts[0], counts[1], counts[2]);
631 
632  model = (ngram_model_trie_t *) ckd_calloc(1, sizeof(*model));
633  base = &model->base;
634  if (counts[2] > 0)
635  order = 3;
636  else if (counts[1] > 0)
637  order = 2;
638  else
639  order = 1;
640  ngram_model_init(base, &ngram_model_trie_funcs, lmath, order,
641  (int32) counts[0]);
642 
643  model->trie = lm_trie_create(counts[0], QUANT_16, order);
644  //read unigrams. no tricks here
645  unigram_next =
646  (uint32 *) ckd_calloc((int32) counts[0] + 1, sizeof(unigram_next));
647  for (j = 0; j <= (int32) counts[0]; j++) {
648  int32 bigrams;
649  dmp_weight_t weight;
650  /* Skip over the mapping ID, we don't care about it. */
651  fread(&bigrams, sizeof(int32), 1, fp);
652  /* Read the weights from actual unigram structure. */
653  fread(&weight, sizeof(weight), 1, fp);
654  if (do_swap)
655  SWAP_INT32(&weight.l);
656  weight.f = logmath_log10_to_log_float(lmath, weight.f);
657  model->trie->unigrams[j].prob = weight.f;
658  fread(&weight, sizeof(weight), 1, fp);
659  if (do_swap)
660  SWAP_INT32(&weight.l);
661  weight.f = logmath_log10_to_log_float(lmath, weight.f);
662  model->trie->unigrams[j].bo = weight.f;
663  //store pointer to dmp next to recognize wid
664  fread(&bigrams, sizeof(int32), 1, fp);
665  if (do_swap)
666  SWAP_INT32(&bigrams);
667  model->trie->unigrams[j].next = bigrams;
668  unigram_next[j] = bigrams;
669  }
670 
671  if (order > 1) {
672  raw_ngrams =
673  ngrams_raw_read_dmp(fp, lmath, counts, order, unigram_next,
674  do_swap);
675  ngrams_raw_fix_counts(raw_ngrams, counts, fixed_counts, order);
676  for (i = 0; i < order; i++) {
677  base->n_counts[i] = fixed_counts[i];
678  }
679 
680  //build reversed trie
681  lm_trie_alloc_ngram(model->trie, order > 2 ? fixed_counts : counts,
682  order);
683  lm_trie_build(model->trie, raw_ngrams, counts, order);
684  counts[1]++;
685 
686  //free raw ngrams
687  ngrams_raw_free(raw_ngrams, counts, order);
688  }
689  ckd_free(unigram_next);
690 
691  /* read ascii word strings */
692  read_word_str(base, fp);
693 
694  fclose_comp(fp, is_pipe);
695  return base;
696 }
697 
698 static void
699 ngram_model_trie_free(ngram_model_t * base)
700 {
701  ngram_model_trie_t *model = (ngram_model_trie_t *) base;
702  lm_trie_free(model->trie);
703 }
704 
705 static int
706 trie_apply_weights(ngram_model_t * base, float32 lw, float32 wip)
707 {
708  //just update weights that are going to be used on score calculation
709  base->lw = lw;
710  base->log_wip = logmath_log(base->lmath, wip);
711  return 0;
712 }
713 
714 static int32
715 weight_score(ngram_model_t * base, int32 score)
716 {
717  return (int32) (score * base->lw + base->log_wip);
718 }
719 
720 static int32
721 ngram_model_trie_raw_score(ngram_model_t * base, int32 wid, int32 * hist,
722  int32 n_hist, int32 * n_used)
723 {
724  int32 i;
725  ngram_model_trie_t *model = (ngram_model_trie_t *) base;
726 
727  if (n_hist > model->base.n - 1)
728  n_hist = model->base.n - 1;
729  for (i = 0; i < n_hist; i++) {
730  if (hist[i] < 0) {
731  n_hist = i;
732  break;
733  }
734  }
735 
736  return (int32) lm_trie_score(model->trie, model->base.n, wid, hist,
737  n_hist, n_used);
738 }
739 
740 static int32
741 ngram_model_trie_score(ngram_model_t * base, int32 wid, int32 * hist,
742  int32 n_hist, int32 * n_used)
743 {
744  return weight_score(base,
745  ngram_model_trie_raw_score(base, wid, hist, n_hist,
746  n_used));
747 }
748 
/*
 * Add a unigram for word id `wid` with additional log weight `lweight`.
 * Grows the unigram array, installs prob/backoff/next for the new entry
 * and returns the weighted score of the added unigram.
 */
static int32
lm_trie_add_ug(ngram_model_t * base, int32 wid, int32 lweight)
{
    ngram_model_trie_t *model = (ngram_model_trie_t *) base;

    /* This would be very bad if this happened! */
    assert(!NGRAM_IS_CLASSWID(wid));

    /* Reallocate unigram array. */
    model->trie->unigrams =
        (unigram_t *) ckd_realloc(model->trie->unigrams,
                                  sizeof(*model->trie->unigrams) *
                                  (base->n_1g_alloc + 1));
    /* Zero the tail of the array between the old unigram count and the
     * allocated capacity so new entries start from a clean state */
    memset(model->trie->unigrams + (base->n_counts[0] + 1), 0,
           (size_t) (base->n_1g_alloc -
                     base->n_counts[0]) * sizeof(*model->trie->unigrams));
    ++base->n_counts[0];
    /* Fold a uniform prior over the new unigram count into the weight */
    lweight += logmath_log(base->lmath, 1.0 / base->n_counts[0]);
    /* Propagate the next pointer so the successor range stays consistent */
    model->trie->unigrams[wid + 1].next = model->trie->unigrams[wid].next;
    model->trie->unigrams[wid].prob = (float) lweight;
    /* This unigram by definition doesn't participate in any bigrams,
     * so its backoff weight is undefined and next pointer same as in finish unigram*/
    model->trie->unigrams[wid].bo = 0;
    /* Finally, increase the unigram count */
    /* FIXME: Note that this can actually be quite bogus due to the
     * presence of class words. If wid falls outside the unigram
     * count, increase it to compensate, at the cost of no longer
     * really knowing how many unigrams we have :( */
    if ((uint32) wid >= base->n_counts[0])
        base->n_counts[0] = wid + 1;

    return (int32) weight_score(base, lweight);
}
782 
783 static void
784 lm_trie_flush(ngram_model_t * base)
785 {
786  ngram_model_trie_t *model = (ngram_model_trie_t *) base;
787  lm_trie_t *trie = model->trie;
788  memset(trie->prev_hist, -1, sizeof(trie->prev_hist)); //prepare request history
789  memset(trie->backoff, 0, sizeof(trie->backoff));
790  return;
791 }
792 
/* Virtual function table binding the trie implementations to the
 * generic ngram_model_t interface. Field order matches ngram_funcs_t. */
static ngram_funcs_t ngram_model_trie_funcs = {
    ngram_model_trie_free,      /* free */
    trie_apply_weights,         /* apply_weights */
    ngram_model_trie_score,     /* score */
    ngram_model_trie_raw_score, /* raw_score */
    lm_trie_add_ug,             /* add_ug */
    lm_trie_flush               /* flush */
};
#define E_ERROR_SYSTEM(...)
Print error text; Call perror("");.
Definition: err.h:99
lm_trie_t * trie
Trie structure that stores ngram relations and weights.
Miscellaneous useful string functions.
#define E_INFO(...)
Print logging information to standard error stream.
Definition: err.h:114
#define ckd_calloc(n, sz)
Macros to simplify the use of above functions.
Definition: ckd_alloc.h:248
#define E_ERROR(...)
Print error message to error log.
Definition: err.h:104
hash_table_t * wid
Mapping of unigram names to word IDs.
char ** word_str
Unigram names.
ngram_model_t base
Base ngram_model_t structure.
Sphinx's memory allocation/deallocation routines.
SPHINXBASE_EXPORT int logmath_log(logmath_t *lmath, float64 p)
Convert linear floating point number to integer log in base B.
Definition: logmath.c:447
uint8 writable
Are word strings writable?
Line iterator for files.
Definition: pio.h:177
#define ckd_salloc(ptr)
Macro for ckd_salloc
Definition: ckd_alloc.h:264
SPHINXBASE_EXPORT void ckd_free(void *ptr)
Test and free a 1-D array.
Definition: ckd_alloc.c:244
Structure that stores address of certain value in bit array.
Definition: bitarr.h:75
SPHINXBASE_EXPORT float64 logmath_log_float_to_log10(logmath_t *lmath, float log_p)
Convert float log in base B to base 10 log.
Definition: logmath.c:496
int32 n_1g_alloc
Number of allocated word strings (for new word addition)
SPHINXBASE_EXPORT double atof_c(char const *str)
Locale independent version of atof().
Definition: strfuncs.c:55
SPHINXBASE_EXPORT void lineiter_free(lineiter_t *li)
Stop reading lines from a file.
Definition: pio.c:367
uint32 * n_counts
Counts for 1, 2, 3, ...
SPHINXBASE_EXPORT uint32 bitarr_read_int25(bitarr_address_t address, uint8 length, uint32 mask)
Read uint32 value from bit array.
Definition: bitarr.c:116
SPHINXBASE_EXPORT lineiter_t * lineiter_next(lineiter_t *li)
Move to the next line in the file.
Definition: pio.c:347
SPHINXBASE_EXPORT lineiter_t * lineiter_start(FILE *fh)
Start reading lines from a file.
Definition: pio.c:264
uint8 n
This is an n-gram model (1, 2, 3, ...).
Implementation of logging routines.
logmath_t * lmath
Log-math object.
Both ends of string.
Definition: strfuncs.h:73
SPHINXBASE_EXPORT void * hash_table_enter(hash_table_t *h, const char *key, void *val)
Try to add a new entry with given key and associated value to hash table h.
Definition: hash_table.c:508
SPHINXBASE_EXPORT FILE * fopen_comp(const char *file, const char *mode, int32 *ispipe)
Like fopen, but use popen and zcat if it is determined that "file" is compressed (i.e., has a .z, .Z, .gz, or .GZ extension).
Definition: pio.c:107
#define E_WARN(...)
Print warning message to error log.
Definition: err.h:109
SPHINXBASE_EXPORT float logmath_log10_to_log_float(logmath_t *lmath, float64 log_p)
Convert base 10 log (in floating point) to float log in base B.
Definition: logmath.c:480
SPHINXBASE_EXPORT int32 str2words(char *line, char **wptr, int32 n_wptr)
Convert a line to an array of "words", based on whitespace separators.
Definition: strfuncs.c:123
Opaque structure used to hold the results of command-line parsing.
Implementation-specific functions for operating on ngram_model_t objects.
float32 lw
Language model scaling factor.
SPHINXBASE_EXPORT char * string_trim(char *string, enum string_edge_e which)
Remove whitespace from a string, modifying it in-place.
Definition: strfuncs.c:97
Common implementation of ngram_model_t.
SPHINXBASE_EXPORT void fclose_comp(FILE *fp, int32 ispipe)
Close a file opened using fopen_comp.
Definition: pio.c:184
#define ckd_realloc(ptr, sz)
Macro for ckd_realloc
Definition: ckd_alloc.h:258
file IO related operations.
int32 log_wip
Log of word insertion penalty.