SphinxBase  5prealpha
sphinx_lm_convert.c
Go to the documentation of this file.
1 /* -*- c-basic-offset: 4; indent-tabs-mode: nil -*- */
2 /* ====================================================================
3  * Copyright (c) 2009 Carnegie Mellon University. All rights
4  * reserved.
5  *
6  * Redistribution and use in source and binary forms, with or without
7  * modification, are permitted provided that the following conditions
8  * are met:
9  *
10  * 1. Redistributions of source code must retain the above copyright
11  * notice, this list of conditions and the following disclaimer.
12  *
13  * 2. Redistributions in binary form must reproduce the above copyright
14  * notice, this list of conditions and the following disclaimer in
15  * the documentation and/or other materials provided with the
16  * distribution.
17  *
18  * This work was supported in part by funding from the Defense Advanced
19  * Research Projects Agency and the National Science Foundation of the
20  * United States of America, and the CMU Sphinx Speech Consortium.
21  *
22  * THIS SOFTWARE IS PROVIDED BY CARNEGIE MELLON UNIVERSITY ``AS IS'' AND
23  * ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
24  * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
25  * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY
26  * NOR ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
27  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
28  * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
29  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
30  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
31  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
32  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
33  *
34  * ====================================================================
35  *
36  */
41 #include <sphinxbase/logmath.h>
42 #include <sphinxbase/ngram_model.h>
43 #include <sphinxbase/cmd_ln.h>
44 #include <sphinxbase/ckd_alloc.h>
45 #include <sphinxbase/err.h>
46 #include <sphinxbase/pio.h>
47 #include <sphinxbase/strfuncs.h>
48 
49 #include <stdio.h>
50 #include <string.h>
51 #include <math.h>
52 
53 static const arg_t defn[] = {
54  { "-help",
56  "no",
57  "Shows the usage of the tool"},
58 
59  { "-logbase",
61  "1.0001",
62  "Base in which all log-likelihoods calculated" },
63 
64  { "-i",
66  NULL,
67  "Input language model file (required)"},
68 
69  { "-o",
71  NULL,
72  "Output language model file (required)"},
73 
74  { "-ifmt",
75  ARG_STRING,
76  NULL,
77  "Input language model format (will guess if not specified)"},
78 
79  { "-ofmt",
80  ARG_STRING,
81  NULL,
82  "Output language model file (will guess if not specified)"},
83 
84  { "-case",
85  ARG_STRING,
86  NULL,
87  "Ether 'lower' or 'upper' - case fold to lower/upper case (NOT UNICODE AWARE)" },
88 
89  { "-mmap",
91  "no",
92  "Use memory-mapped I/O for reading binary LM files"},
93 
94  { "-lm_trie",
96  "no",
97  "Whether trie structure should be used for model holding during convertion"},
98 
99  { "-debug",
100  ARG_INT32,
101  NULL,
102  "Verbosity level for debugging messages"
103  },
104 
105  { NULL, 0, NULL, NULL }
106 };
107 
/**
 * Emit a brief usage summary through the logging macros, then terminate
 * the process with exit status 0.  This function does not return.
 *
 * @param pgm Program name to show in the usage line.
 */
static void
usagemsg(char *pgm)
{
    /* First line carries the program name; the rest are continuations. */
    E_INFO("Usage: %s -i <input.lm> \\\n", pgm);
    E_INFOCONT("\t[-ifmt txt] [-ofmt dmp]\n");
    E_INFOCONT("\t-o <output.lm.DMP>\n");

    exit(0);
}
117 
118 
119 int
120 main(int argc, char *argv[])
121 {
122  cmd_ln_t *config;
123  ngram_model_t *lm = NULL;
124  logmath_t *lmath;
125  int itype, otype;
126  char const *kase;
127 
128  if ((config = cmd_ln_parse_r(NULL, defn, argc, argv, TRUE)) == NULL)
129  return 1;
130 
131  if (cmd_ln_boolean_r(config, "-help")) {
132  usagemsg(argv[0]);
133  }
134 
135  err_set_debug_level(cmd_ln_int32_r(config, "-debug"));
136 
137  /* Create log math object. */
138  if ((lmath = logmath_init
139  (cmd_ln_float64_r(config, "-logbase"), 0, 0)) == NULL) {
140  E_FATAL("Failed to initialize log math\n");
141  }
142 
143  if (cmd_ln_str_r(config, "-i") == NULL || cmd_ln_str_r(config, "-i") == NULL) {
144  E_ERROR("Please specify both input and output models\n");
145  goto error_out;
146  }
147 
148 
149  /* Load the input language model. */
150  if (cmd_ln_str_r(config, "-ifmt")) {
151  if ((itype = ngram_str_to_type(cmd_ln_str_r(config, "-ifmt")))
152  == NGRAM_INVALID) {
153  E_ERROR("Invalid input type %s\n", cmd_ln_str_r(config, "-ifmt"));
154  goto error_out;
155  }
156  lm = ngram_model_read(config, cmd_ln_str_r(config, "-i"),
157  itype, lmath);
158  }
159  else {
160  lm = ngram_model_read(config, cmd_ln_str_r(config, "-i"),
161  NGRAM_AUTO, lmath);
162  }
163 
164  if (lm == NULL) {
165  E_FATAL("Failed to read the model from the file '%s'", cmd_ln_str_r(config, "-i"));
166  }
167 
168  /* Guess or set the output language model type. */
169  if (cmd_ln_str_r(config, "-ofmt")) {
170  if ((otype = ngram_str_to_type(cmd_ln_str_r(config, "-ofmt")))
171  == NGRAM_INVALID) {
172  E_ERROR("Invalid output type %s\n", cmd_ln_str_r(config, "-ofmt"));
173  goto error_out;
174  }
175  }
176  else {
177  otype = ngram_file_name_to_type(cmd_ln_str_r(config, "-o"));
178  }
179 
180  /* Case fold if requested. */
181  if ((kase = cmd_ln_str_r(config, "-case"))) {
182  if (0 == strcmp(kase, "lower")) {
183  ngram_model_casefold(lm, NGRAM_LOWER);
184  }
185  else if (0 == strcmp(kase, "upper")) {
186  ngram_model_casefold(lm, NGRAM_UPPER);
187  }
188  else {
189  E_ERROR("Unknown value for -case: %s\n", kase);
190  goto error_out;
191  }
192  }
193 
194  /* Write the output language model. */
195  if (ngram_model_write(lm, cmd_ln_str_r(config, "-o"), otype) != 0) {
196  E_ERROR("Failed to write language model in format %s to %s\n",
197  ngram_type_to_str(otype), cmd_ln_str_r(config, "-o"));
198  goto error_out;
199  }
200 
201  /* That's all folks! */
202  ngram_model_free(lm);
203  if (lmath) {
204  logmath_free(lmath);
205  }
206  if (config) {
207  cmd_ln_free_r(config);
208  }
209  return 0;
210 
211 error_out:
212  ngram_model_free(lm);
213  if (lmath) {
214  logmath_free(lmath);
215  }
216  if (config) {
217  cmd_ln_free_r(config);
218  }
219  return 1;
220 }
Command-line and other configuration parsing and handling.
SPHINXBASE_EXPORT ngram_model_t * ngram_model_read(cmd_ln_t *config, const char *file_name, ngram_file_type_t file_type, logmath_t *lmath)
Read an N-Gram model from a file on disk.
Definition: ngram_model.c:124
Miscellaneous useful string functions.
#define E_INFO(...)
Print logging information to standard error stream.
Definition: err.h:114
SPHINXBASE_EXPORT int ngram_model_write(ngram_model_t *model, const char *file_name, ngram_file_type_t format)
Write an N-Gram model to disk.
Definition: ngram_model.c:178
SPHINXBASE_EXPORT int ngram_model_casefold(ngram_model_t *model, int kase)
Case-fold word strings in an N-Gram model.
Definition: ngram_model.c:308
#define E_ERROR(...)
Print error message to error log.
Definition: err.h:104
SPHINXBASE_EXPORT int err_set_debug_level(int level)
Set debugging verbosity level.
Definition: err.c:279
#define ARG_INT32
Definition: cmd_ln.h:144
Sphinx's memory allocation/deallocation routines.
#define E_INFOCONT(...)
Continue printing the information to standard error stream.
Definition: err.h:119
SPHINXBASE_EXPORT int cmd_ln_free_r(cmd_ln_t *cmdln)
Release a command-line argument set and all associated strings.
Definition: cmd_ln.c:1032
SPHINXBASE_EXPORT cmd_ln_t * cmd_ln_parse_r(cmd_ln_t *inout_cmdln, arg_t const *defn, int32 argc, char *argv[], int32 strict)
Parse a list of strings into arguments.
Definition: cmd_ln.c:553
#define ARG_STRING
String argument (optional).
Definition: cmd_ln.h:114
SPHINXBASE_EXPORT int logmath_free(logmath_t *lmath)
Free a log table.
Definition: logmath.c:342
SPHINXBASE_EXPORT int ngram_model_free(ngram_model_t *model)
Release memory associated with an N-Gram model.
Definition: ngram_model.c:263
SPHINXBASE_EXPORT char const * cmd_ln_str_r(cmd_ln_t *cmdln, char const *name)
Retrieve a string from a command-line object.
Definition: cmd_ln.c:945
#define REQARG_STRING
Required string argument.
Definition: cmd_ln.h:135
SPHINXBASE_EXPORT logmath_t * logmath_init(float64 base, int shift, int use_table)
Initialize a log math computation table.
Definition: logmath.c:62
#define ARG_FLOAT64
Definition: cmd_ln.h:152
N-Gram language models.
SPHINXBASE_EXPORT char const * ngram_type_to_str(int type)
Get the canonical name for an N-Gram file type.
Definition: ngram_model.c:110
Implementation of logging routines.
#define ARG_BOOLEAN
Boolean (true/false) argument (optional).
Definition: cmd_ln.h:118
Argument definition structure.
SPHINXBASE_EXPORT ngram_file_type_t ngram_str_to_type(const char *str_name)
Get the N-Gram file type from a string.
Definition: ngram_model.c:99
Opaque structure used to hold the results of command-line parsing.
SPHINXBASE_EXPORT ngram_file_type_t ngram_file_name_to_type(const char *file_name)
Guess the file type for an N-Gram model from the filename.
Definition: ngram_model.c:63
Not a valid file type.
Definition: ngram_model.h:77
#define cmd_ln_boolean_r(c, n)
Retrieve a boolean value from a command-line object.
Definition: cmd_ln.h:334
cmd_ln_t * config
Configuration parameters.
Definition: sphinx_fe.c:74
#define E_FATAL(...)
Exit with non-zero status after error message.
Definition: err.h:81
Common implementation of ngram_model_t.
Fast integer logarithmic addition operations.
Determine file type automatically.
Definition: ngram_model.h:78
file IO related operations.