SphinxBase  5prealpha
ngrams_raw.c
1 /* -*- c-basic-offset: 4; indent-tabs-mode: nil -*- */
2 /* ====================================================================
3  * Copyright (c) 2015 Carnegie Mellon University. All rights
4  * reserved.
5  *
6  * Redistribution and use in source and binary forms, with or without
7  * modification, are permitted provided that the following conditions
8  * are met:
9  *
10  * 1. Redistributions of source code must retain the above copyright
11  * notice, this list of conditions and the following disclaimer.
12  *
13  * 2. Redistributions in binary form must reproduce the above copyright
14  * notice, this list of conditions and the following disclaimer in
15  * the documentation and/or other materials provided with the
16  * distribution.
17  *
18  * This work was supported in part by funding from the Defense Advanced
19  * Research Projects Agency and the National Science Foundation of the
20  * United States of America, and the CMU Sphinx Speech Consortium.
21  *
22  * THIS SOFTWARE IS PROVIDED BY CARNEGIE MELLON UNIVERSITY ``AS IS'' AND
23  * ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
24  * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
25  * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY
26  * NOR ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
27  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
28  * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
29  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
30  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
31  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
32  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
33  *
34  * ====================================================================
35  *
36  */
37 
38 #include <string.h>
39 #include <assert.h>
40 
41 #include <sphinxbase/err.h>
42 #include <sphinxbase/pio.h>
43 #include <sphinxbase/strfuncs.h>
44 #include <sphinxbase/ckd_alloc.h>
45 #include <sphinxbase/priority_queue.h>
46 #include <sphinxbase/byteorder.h>
47 
48 #include "ngram_model_internal.h"
49 #include "ngrams_raw.h"
50 
51 int
52 ngram_comparator(const void *first_void, const void *second_void)
53 {
54  static int order = -1;
55  uint32 *first, *second, *end;
56 
57  if (first_void == NULL) {
58  //technical usage, setuping order
59  order = *(int *) second_void;
60  return 0;
61  }
62  if (order < 2) {
63  E_ERROR("Order for ngram comprator was not set\n");
64  return 0;
65  }
66  first = ((ngram_raw_t *) first_void)->words;
67  second = ((ngram_raw_t *) second_void)->words;
68  end = first + order;
69  for (; first != end; ++first, ++second) {
70  if (*first < *second)
71  return -1;
72  if (*first > *second)
73  return 1;
74  }
75  return 0;
76 }
77 
78 int
79 ngram_ord_comparator(void *a_raw, void *b_raw)
80 {
81  ngram_raw_ord_t *a = (ngram_raw_ord_t *) a_raw;
82  ngram_raw_ord_t *b = (ngram_raw_ord_t *) b_raw;
83  int a_w_ptr = 0;
84  int b_w_ptr = 0;
85  while (a_w_ptr < a->order && b_w_ptr < b->order) {
86  if (a->instance.words[a_w_ptr] == b->instance.words[b_w_ptr]) {
87  a_w_ptr++;
88  b_w_ptr++;
89  continue;
90  }
91  if (a->instance.words[a_w_ptr] < b->instance.words[b_w_ptr])
92  return 1;
93  else
94  return -1;
95  }
96  return b->order - a->order;
97 }
98 
99 static void
100 read_ngram_instance(lineiter_t ** li, hash_table_t * wid,
101  logmath_t * lmath, int order, int order_max,
102  ngram_raw_t * raw_ngram)
103 {
104  int n;
105  int words_expected;
106  int i;
107  char *wptr[NGRAM_MAX_ORDER + 1];
108  uint32 *word_out;
109 
110  *li = lineiter_next(*li);
111  if (*li == NULL) {
112  E_ERROR("Unexpected end of ARPA file. Failed to read %d-gram\n",
113  order);
114  return;
115  }
116  string_trim((*li)->buf, STRING_BOTH);
117  words_expected = order + 1;
118 
119  if ((n =
120  str2words((*li)->buf, wptr,
121  NGRAM_MAX_ORDER + 1)) < words_expected) {
122  if ((*li)->buf[0] != '\0') {
123  E_WARN("Format error; %d-gram ignored: %s\n", order,
124  (*li)->buf);
125  }
126  }
127  else {
128  if (order == order_max) {
129  raw_ngram->weights =
130  (float *) ckd_calloc(1, sizeof(*raw_ngram->weights));
131  raw_ngram->weights[0] = atof_c(wptr[0]);
132  if (raw_ngram->weights[0] > 0) {
133  E_WARN("%d-gram [%s] has positive probability. Zeroize\n",
134  order, wptr[1]);
135  raw_ngram->weights[0] = 0.0f;
136  }
137  raw_ngram->weights[0] =
138  logmath_log10_to_log_float(lmath, raw_ngram->weights[0]);
139  }
140  else {
141  float weight, backoff;
142  raw_ngram->weights =
143  (float *) ckd_calloc(2, sizeof(*raw_ngram->weights));
144 
145  weight = atof_c(wptr[0]);
146  if (weight > 0) {
147  E_WARN("%d-gram [%s] has positive probability. Zeroize\n",
148  order, wptr[1]);
149  raw_ngram->weights[0] = 0.0f;
150  }
151  else {
152  raw_ngram->weights[0] =
153  logmath_log10_to_log_float(lmath, weight);
154  }
155 
156  if (n == order + 1) {
157  raw_ngram->weights[1] = 0.0f;
158  }
159  else {
160  backoff = atof_c(wptr[order + 1]);
161  raw_ngram->weights[1] =
162  logmath_log10_to_log_float(lmath, backoff);
163  }
164  }
165  raw_ngram->words =
166  (uint32 *) ckd_calloc(order, sizeof(*raw_ngram->words));
167  for (word_out = raw_ngram->words + order - 1, i = 1;
168  word_out >= raw_ngram->words; --word_out, i++) {
169  hash_table_lookup_int32(wid, wptr[i], (int32 *) word_out);
170  }
171  }
172 }
173 
174 static void
175 ngrams_raw_read_order(ngram_raw_t ** raw_ngrams, lineiter_t ** li,
176  hash_table_t * wid, logmath_t * lmath, uint32 count,
177  int order, int order_max)
178 {
179  char expected_header[20];
180  uint32 i;
181 
182  sprintf(expected_header, "\\%d-grams:", order);
183  while ((*li = lineiter_next(*li))) {
184  string_trim((*li)->buf, STRING_BOTH);
185  if (strcmp((*li)->buf, expected_header) == 0)
186  break;
187  }
188  *raw_ngrams = (ngram_raw_t *) ckd_calloc(count, sizeof(ngram_raw_t));
189  for (i = 0; i < count; i++) {
190  read_ngram_instance(li, wid, lmath, order, order_max,
191  &((*raw_ngrams)[i]));
192  }
193 
194  //sort raw ngrams that was read
195  ngram_comparator(NULL, &order); //setting up order in comparator
196  qsort(*raw_ngrams, count, sizeof(ngram_raw_t), &ngram_comparator);
197 }
198 
199 ngram_raw_t **
200 ngrams_raw_read_arpa(lineiter_t ** li, logmath_t * lmath, uint32 * counts,
201  int order, hash_table_t * wid)
202 {
203  ngram_raw_t **raw_ngrams;
204  int order_it;
205 
206  raw_ngrams =
207  (ngram_raw_t **) ckd_calloc(order - 1, sizeof(*raw_ngrams));
208  for (order_it = 2; order_it <= order; order_it++) {
209  ngrams_raw_read_order(&raw_ngrams[order_it - 2], li, wid, lmath,
210  counts[order_it - 1], order_it, order);
211  }
212  //check for end-mark in arpa file
213  *li = lineiter_next(*li);
214  string_trim((*li)->buf, STRING_BOTH);
215  //skip empty lines if any
216  while (*li && strlen((*li)->buf) == 0) {
217  *li = lineiter_next(*li);
218  string_trim((*li)->buf, STRING_BOTH);
219  }
220  //check if we finished reading
221  if (*li == NULL)
222  E_ERROR("ARPA file ends without end-mark\n");
223  //check if we found ARPA end-mark
224  if (strcmp((*li)->buf, "\\end\\") != 0)
225  E_ERROR
226  ("Finished reading ARPA file. Expecting end mark but found [%s]\n",
227  (*li)->buf);
228 
229  return raw_ngrams;
230 }
231 
232 static void
233 read_dmp_weight_array(FILE * fp, logmath_t * lmath, uint8 do_swap,
234  int32 counts, ngram_raw_t * raw_ngrams,
235  int weight_idx)
236 {
237  int32 i, k;
238  dmp_weight_t *tmp_weight_arr;
239 
240  fread(&k, sizeof(k), 1, fp);
241  if (do_swap)
242  SWAP_INT32(&k);
243  tmp_weight_arr =
244  (dmp_weight_t *) ckd_calloc(k, sizeof(*tmp_weight_arr));
245  fread(tmp_weight_arr, sizeof(*tmp_weight_arr), k, fp);
246  for (i = 0; i < k; i++) {
247  if (do_swap)
248  SWAP_INT32(&tmp_weight_arr[i].l);
249  /* Convert values to log. */
250  tmp_weight_arr[i].f =
251  logmath_log10_to_log_float(lmath, tmp_weight_arr[i].f);
252  }
253  //replace indexes with real probs in raw bigrams
254  for (i = 0; i < counts; i++) {
255  raw_ngrams[i].weights[weight_idx] =
256  tmp_weight_arr[(int) raw_ngrams[i].weights[weight_idx]].f;
257  }
258  ckd_free(tmp_weight_arr);
259 }
260 
261 #define BIGRAM_SEGMENT_SIZE 9
262 
263 ngram_raw_t **
264 ngrams_raw_read_dmp(FILE * fp, logmath_t * lmath, uint32 * counts,
265  int order, uint32 * unigram_next, uint8 do_swap)
266 {
267  int i;
268  uint32 j, ngram_idx;
269  uint16 *bigrams_next;
270  ngram_raw_t **raw_ngrams =
271  (ngram_raw_t **) ckd_calloc(order - 1, sizeof(*raw_ngrams));
272 
273  //read bigrams
274  raw_ngrams[0] =
275  (ngram_raw_t *) ckd_calloc((size_t) (counts[1] + 1),
276  sizeof(*raw_ngrams[0]));
277  bigrams_next =
278  (uint16 *) ckd_calloc((size_t) (counts[1] + 1),
279  sizeof(*bigrams_next));
280  ngram_idx = 1;
281  for (j = 0; j <= (int32) counts[1]; j++) {
282  uint16 wid, prob_idx, bo_idx;
283  ngram_raw_t *raw_ngram = &raw_ngrams[0][j];
284 
285  fread(&wid, sizeof(wid), 1, fp);
286  if (do_swap)
287  SWAP_INT16(&wid);
288  raw_ngram->words =
289  (uint32 *) ckd_calloc(2, sizeof(*raw_ngram->words));
290  raw_ngram->words[0] = (uint32) wid;
291  while (ngram_idx < counts[0] && j == unigram_next[ngram_idx]) {
292  ngram_idx++;
293  }
294  raw_ngram->words[1] = (uint32) ngram_idx - 1;
295  raw_ngram->weights =
296  (float *) ckd_calloc(2, sizeof(*raw_ngram->weights));
297  fread(&prob_idx, sizeof(prob_idx), 1, fp);
298  if (do_swap)
299  SWAP_INT16(&prob_idx);
300  raw_ngram->weights[0] = prob_idx + 0.5f; //keep index in float. ugly but avoiding using extra memory
301  fread(&bo_idx, sizeof(bo_idx), 1, fp);
302  if (do_swap)
303  SWAP_INT16(&bo_idx);
304  raw_ngram->weights[1] = bo_idx + 0.5f; //keep index in float. ugly but avoiding using extra memory
305  fread(&bigrams_next[j], sizeof(bigrams_next[j]), 1, fp);
306  if (do_swap)
307  SWAP_INT16(&bigrams_next[j]);
308  }
309  assert(ngram_idx == counts[0]);
310 
311  //read trigrams
312  if (order > 2) {
313  raw_ngrams[1] =
314  (ngram_raw_t *) ckd_calloc((size_t) counts[2],
315  sizeof(*raw_ngrams[1]));
316  for (j = 0; j < (int32) counts[2]; j++) {
317  uint16 wid, prob_idx;
318  ngram_raw_t *raw_ngram = &raw_ngrams[1][j];
319 
320  fread(&wid, sizeof(wid), 1, fp);
321  if (do_swap)
322  SWAP_INT16(&wid);
323  raw_ngram->words =
324  (uint32 *) ckd_calloc(3, sizeof(*raw_ngram->words));
325  raw_ngram->words[0] = (uint32) wid;
326  raw_ngram->weights =
327  (float *) ckd_calloc(1, sizeof(*raw_ngram->weights));
328  fread(&prob_idx, sizeof(prob_idx), 1, fp);
329  if (do_swap)
330  SWAP_INT16(&prob_idx);
331  raw_ngram->weights[0] = prob_idx + 0.5f; //keep index in float. ugly but avoiding using extra memory
332  }
333  }
334 
335  //read prob2
336  read_dmp_weight_array(fp, lmath, do_swap, (int32) counts[1],
337  raw_ngrams[0], 0);
338  //read bo2
339  if (order > 2) {
340  int32 k;
341  int32 *tseg_base;
342  read_dmp_weight_array(fp, lmath, do_swap, (int32) counts[1],
343  raw_ngrams[0], 1);
344  //read prob3
345  read_dmp_weight_array(fp, lmath, do_swap, (int32) counts[2],
346  raw_ngrams[1], 0);
347  /* Read tseg_base size and tseg_base to fill trigram's first words */
348  fread(&k, sizeof(k), 1, fp);
349  if (do_swap)
350  SWAP_INT32(&k);
351  tseg_base = (int32 *) ckd_calloc(k, sizeof(int32));
352  fread(tseg_base, sizeof(int32), k, fp);
353  if (do_swap) {
354  for (j = 0; j < (uint32) k; j++) {
355  SWAP_INT32(&tseg_base[j]);
356  }
357  }
358  ngram_idx = 0;
359  for (j = 1; j <= counts[1]; j++) {
360  uint32 next_ngram_idx =
361  (uint32) (tseg_base[j >> BIGRAM_SEGMENT_SIZE] +
362  bigrams_next[j]);
363  while (ngram_idx < next_ngram_idx) {
364  raw_ngrams[1][ngram_idx].words[1] =
365  raw_ngrams[0][j - 1].words[0];
366  raw_ngrams[1][ngram_idx].words[2] =
367  raw_ngrams[0][j - 1].words[1];
368  ngram_idx++;
369  }
370  }
371  ckd_free(tseg_base);
372  assert(ngram_idx == counts[2]);
373  }
374  ckd_free(bigrams_next);
375 
376  //sort raw ngrams for reverse trie
377  i = 2; //set order
378  ngram_comparator(NULL, &i);
379  qsort(raw_ngrams[0], (size_t) counts[1], sizeof(*raw_ngrams[0]),
380  &ngram_comparator);
381  if (order > 2) {
382  i = 3; //set order
383  ngram_comparator(NULL, &i);
384  qsort(raw_ngrams[1], (size_t) counts[2], sizeof(*raw_ngrams[1]),
385  &ngram_comparator);
386  }
387  return raw_ngrams;
388 }
389 
390 void
391 ngrams_raw_fix_counts(ngram_raw_t ** raw_ngrams, uint32 * counts,
392  uint32 * fixed_counts, int order)
393 {
394  priority_queue_t *ngrams =
395  priority_queue_create(order - 1, &ngram_ord_comparator);
396  uint32 raw_ngram_ptrs[NGRAM_MAX_ORDER - 1];
397  uint32 words[NGRAM_MAX_ORDER];
398  int i;
399 
400  memset(words, -1, sizeof(words)); //since we have unsigned word idx that will give us unreachable maximum word index
401  memcpy(fixed_counts, counts, order * sizeof(*fixed_counts));
402  for (i = 2; i <= order; i++) {
403  ngram_raw_ord_t *tmp_ngram;
404 
405  if (counts[i - 1] <= 0)
406  continue;
407  tmp_ngram =
408  (ngram_raw_ord_t *) ckd_calloc(1, sizeof(*tmp_ngram));
409  tmp_ngram->order = i;
410  raw_ngram_ptrs[i - 2] = 0;
411  tmp_ngram->instance = raw_ngrams[i - 2][0];
412  priority_queue_add(ngrams, tmp_ngram);
413  }
414 
415  for (;;) {
416  int32 to_increment = TRUE;
417  ngram_raw_ord_t *top;
418  if (priority_queue_size(ngrams) == 0) {
419  break;
420  }
421  top = (ngram_raw_ord_t *) priority_queue_poll(ngrams);
422  if (top->order == 2) {
423  memcpy(words, top->instance.words, 2 * sizeof(*words));
424  }
425  else {
426  for (i = 0; i < top->order - 1; i++) {
427  if (words[i] != top->instance.words[i]) {
428  int num;
429  num = (i == 0) ? 1 : i;
430  memcpy(words, top->instance.words,
431  (num + 1) * sizeof(*words));
432  fixed_counts[num]++;
433  to_increment = FALSE;
434  break;
435  }
436  }
437  words[top->order - 1] = top->instance.words[top->order - 1];
438  }
439  if (to_increment) {
440  raw_ngram_ptrs[top->order - 2]++;
441  }
442  if (raw_ngram_ptrs[top->order - 2] < counts[top->order - 1]) {
443  top->instance =
444  raw_ngrams[top->order - 2][raw_ngram_ptrs[top->order - 2]];
445  priority_queue_add(ngrams, top);
446  }
447  else {
448  ckd_free(top);
449  }
450  }
451 
452  assert(priority_queue_size(ngrams) == 0);
453  priority_queue_free(ngrams, NULL);
454 }
455 
456 void
457 ngrams_raw_free(ngram_raw_t ** raw_ngrams, uint32 * counts, int order)
458 {
459  uint32 num;
460  int order_it;
461 
462  for (order_it = 0; order_it < order - 1; order_it++) {
463  for (num = 0; num < counts[order_it + 1]; num++) {
464  ckd_free(raw_ngrams[order_it][num].weights);
465  ckd_free(raw_ngrams[order_it][num].words);
466  }
467  ckd_free(raw_ngrams[order_it]);
468  }
469  ckd_free(raw_ngrams);
470 }
SPHINXBASE_EXPORT int32 hash_table_lookup_int32(hash_table_t *h, const char *key, int32 *val)
Look up a 32-bit integer value in a hash table.
Definition: hash_table.c:329
Miscellaneous useful string functions.
#define ckd_calloc(n, sz)
Macros to simplify the use of above functions.
Definition: ckd_alloc.h:248
#define E_ERROR(...)
Print error message to error log.
Definition: err.h:104
Sphinx's memory allocation/deallocation routines.
Line iterator for files.
Definition: pio.h:177
SPHINXBASE_EXPORT void ckd_free(void *ptr)
Test and free a 1-D array.
Definition: ckd_alloc.c:244
SPHINXBASE_EXPORT double atof_c(char const *str)
Locale independent version of atof().
Definition: strfuncs.c:55
SPHINXBASE_EXPORT lineiter_t * lineiter_next(lineiter_t *li)
Move to the next line in the file.
Definition: pio.c:347
Implementation of logging routines.
Both ends of string.
Definition: strfuncs.h:73
#define E_WARN(...)
Print warning message to error log.
Definition: err.h:109
SPHINXBASE_EXPORT float logmath_log10_to_log_float(logmath_t *lmath, float64 log_p)
Convert base 10 log (in floating point) to float log in base B.
Definition: logmath.c:480
SPHINXBASE_EXPORT int32 str2words(char *line, char **wptr, int32 n_wptr)
Convert a line to an array of "words", based on whitespace separators.
Definition: strfuncs.c:123
SPHINXBASE_EXPORT char * string_trim(char *string, enum string_edge_e which)
Remove whitespace from a string, modifying it in-place.
Definition: strfuncs.c:97
file IO related operations.