/* MeCab */
/*
  MeCab -- Yet Another Part-of-Speech and Morphological Analyzer

  Copyright(C) 2001-2011 Taku Kudo <taku@chasen.org>
  Copyright(C) 2004-2006 Nippon Telegraph and Telephone Corporation
*/
/*
  NOTE(review): this header was recovered from a source listing in which the
  original documentation comments were stripped.  The comments below state
  only what the declarations themselves show; anything derived from an
  identifier name is marked "presumably" and should be confirmed against the
  upstream MeCab documentation before being relied upon.
*/
#ifndef MECAB_MECAB_H_
#define MECAB_MECAB_H_

/* C/C++ common data structures */

/*
  Dictionary description record.  Records can be chained through `next`
  (a pointer to another record of the same type).
*/
struct mecab_dictionary_info_t {
  const char *filename;  /* presumably the dictionary file path */
  const char *charset;   /* presumably the dictionary character set name */
  unsigned int size;
  int type;              /* presumably one of the MECAB_*_DIC constants below */
  unsigned int lsize;
  unsigned int rsize;
  unsigned short version;
  struct mecab_dictionary_info_t *next;
};

/*
  Edge record linking two nodes (`lnode`, `rnode`).  `lnext` / `rnext` point
  to further path records, so paths can be chained from either side.
*/
struct mecab_path_t {
  struct mecab_node_t *rnode;
  struct mecab_path_t *rnext;
  struct mecab_node_t *lnode;
  struct mecab_path_t *lnext;
  int cost;
  float prob;
};

/*
  Node record.  Besides the `prev` / `next` links it carries two further
  node links (`enext`, `bnext`) and two path links (`rpath`, `lpath`).
*/
struct mecab_node_t {
  struct mecab_node_t *prev;
  struct mecab_node_t *next;
  struct mecab_node_t *enext;
  struct mecab_node_t *bnext;
  struct mecab_path_t *rpath;
  struct mecab_path_t *lpath;
  const char *surface;    /* NOTE(review): may not be NUL-terminated; see `length` -- confirm */
  const char *feature;
  unsigned int id;
  unsigned short length;
  unsigned short rlength;
  unsigned short rcAttr;
  unsigned short lcAttr;
  unsigned short posid;
  unsigned char char_type;
  unsigned char stat;     /* presumably one of the MECAB_*_NODE constants below */
  unsigned char isbest;
  float alpha;
  float beta;
  float prob;
  short wcost;
  long cost;
};

/* Node kinds (presumably the values stored in mecab_node_t::stat). */
enum {
  MECAB_NOR_NODE = 0,
  MECAB_UNK_NODE = 1,
  MECAB_BOS_NODE = 2,
  MECAB_EOS_NODE = 3,
  MECAB_EON_NODE = 4
};

/* Dictionary kinds (presumably the values stored in mecab_dictionary_info_t::type). */
enum {
  MECAB_SYS_DIC = 0,
  MECAB_USR_DIC = 1,
  MECAB_UNK_DIC = 2
};

/*
  Request types.  Every value is a distinct power of two, so several of them
  fit in the single `int request_type` taken by the *_request_type functions.
*/
enum {
  MECAB_ONE_BEST          = 1,
  MECAB_NBEST             = 2,
  MECAB_PARTIAL           = 4,
  MECAB_MARGINAL_PROB     = 8,
  MECAB_ALTERNATIVE       = 16,
  MECAB_ALL_MORPHS        = 32,
  MECAB_ALLOCATE_SENTENCE = 64
};

/* Boundary constraint kinds (see the *_boundary_constraint functions). */
enum {
  MECAB_ANY_BOUNDARY   = 0,
  MECAB_TOKEN_BOUNDARY = 1,
  MECAB_INSIDE_TOKEN   = 2
};

/* C interface */
#ifdef __cplusplus
#include <cstdio>
#else
#include <stdio.h>
#endif

#ifdef __cplusplus
extern "C" {
#endif

/*
  Symbol visibility.  On Windows the functions are exported when DLL_EXPORT
  is defined and imported otherwise; elsewhere MECAB_DLL_EXTERN is a plain
  `extern`.  MECAB_DLL_CLASS_EXTERN is only non-empty for a Windows export
  build.
*/
#ifdef _WIN32
#include <windows.h>
#  ifdef DLL_EXPORT
#    define MECAB_DLL_EXTERN  __declspec(dllexport)
#    define MECAB_DLL_CLASS_EXTERN  __declspec(dllexport)
#  else
#    define MECAB_DLL_EXTERN  __declspec(dllimport)
#  endif
#endif

#ifndef MECAB_DLL_EXTERN
#  define MECAB_DLL_EXTERN extern
#endif

#ifndef MECAB_DLL_CLASS_EXTERN
#  define MECAB_DLL_CLASS_EXTERN
#endif

/* Opaque handles (only ever used through pointers in this header). */
typedef struct mecab_t         mecab_t;
typedef struct mecab_model_t   mecab_model_t;
typedef struct mecab_lattice_t mecab_lattice_t;

/* Typedef names for the structures defined above. */
typedef struct mecab_dictionary_info_t mecab_dictionary_info_t;
typedef struct mecab_node_t            mecab_node_t;
typedef struct mecab_path_t            mecab_path_t;

#ifndef SWIG
/* C interface */

/* old mecab interface */

/* Construction from an argc/argv pair or from a single argument string. */
MECAB_DLL_EXTERN mecab_t *mecab_new(int argc, char **argv);
MECAB_DLL_EXTERN mecab_t *mecab_new2(const char *arg);

/*
  Declared with (void) rather than an empty list so that C compilers older
  than C23 see a real prototype; the meaning in C++ is unchanged.
*/
MECAB_DLL_EXTERN const char *mecab_version(void);

MECAB_DLL_EXTERN const char *mecab_strerror(mecab_t *mecab);
MECAB_DLL_EXTERN void        mecab_destroy(mecab_t *mecab);

/* Getter / setter pairs on a mecab_t handle. */
MECAB_DLL_EXTERN int   mecab_get_partial(mecab_t *mecab);
MECAB_DLL_EXTERN void  mecab_set_partial(mecab_t *mecab, int partial);
MECAB_DLL_EXTERN float mecab_get_theta(mecab_t *mecab);
MECAB_DLL_EXTERN void  mecab_set_theta(mecab_t *mecab, float theta);
MECAB_DLL_EXTERN int   mecab_get_lattice_level(mecab_t *mecab);
MECAB_DLL_EXTERN void  mecab_set_lattice_level(mecab_t *mecab, int level);
MECAB_DLL_EXTERN int   mecab_get_all_morphs(mecab_t *mecab);
MECAB_DLL_EXTERN void  mecab_set_all_morphs(mecab_t *mecab, int all_morphs);

MECAB_DLL_EXTERN int mecab_parse_lattice(mecab_t *mecab, mecab_lattice_t *lattice);

/*
  String-returning entry points.  The "2" variants add an explicit input
  length; the "3" variants additionally take an output buffer pointer and
  its size (ostr / olen).
*/
MECAB_DLL_EXTERN const char *mecab_sparse_tostr(mecab_t *mecab, const char *str);
MECAB_DLL_EXTERN const char *mecab_sparse_tostr2(mecab_t *mecab,
                                                 const char *str, size_t len);
MECAB_DLL_EXTERN char       *mecab_sparse_tostr3(mecab_t *mecab,
                                                 const char *str, size_t len,
                                                 char *ostr, size_t olen);

/*
  Node-returning entry points.  The parameters were unnamed in the original
  declarations; the names follow the sibling *_tostr functions.
*/
MECAB_DLL_EXTERN const mecab_node_t *mecab_sparse_tonode(mecab_t *mecab,
                                                         const char *str);
MECAB_DLL_EXTERN const mecab_node_t *mecab_sparse_tonode2(mecab_t *mecab,
                                                          const char *str,
                                                          size_t len);

/* N-best entry points; same "2" / "3" suffix convention as above. */
MECAB_DLL_EXTERN const char *mecab_nbest_sparse_tostr(mecab_t *mecab, size_t N,
                                                      const char *str);
MECAB_DLL_EXTERN const char *mecab_nbest_sparse_tostr2(mecab_t *mecab, size_t N,
                                                       const char *str, size_t len);
MECAB_DLL_EXTERN char       *mecab_nbest_sparse_tostr3(mecab_t *mecab, size_t N,
                                                       const char *str, size_t len,
                                                       char *ostr, size_t olen);

MECAB_DLL_EXTERN int mecab_nbest_init(mecab_t *mecab, const char *str);
MECAB_DLL_EXTERN int mecab_nbest_init2(mecab_t *mecab, const char *str, size_t len);

MECAB_DLL_EXTERN const char         *mecab_nbest_next_tostr(mecab_t *mecab);
MECAB_DLL_EXTERN char               *mecab_nbest_next_tostr2(mecab_t *mecab,
                                                             char *ostr, size_t olen);
MECAB_DLL_EXTERN const mecab_node_t *mecab_nbest_next_tonode(mecab_t *mecab);

MECAB_DLL_EXTERN const char *mecab_format_node(mecab_t *mecab,
                                               const mecab_node_t *node);

MECAB_DLL_EXTERN const mecab_dictionary_info_t *mecab_dictionary_info(mecab_t *mecab);

/* lattice interface */

/* (void) for the same reason as mecab_version above. */
MECAB_DLL_EXTERN mecab_lattice_t *mecab_lattice_new(void);
MECAB_DLL_EXTERN void             mecab_lattice_destroy(mecab_lattice_t *lattice);
MECAB_DLL_EXTERN void             mecab_lattice_clear(mecab_lattice_t *lattice);
MECAB_DLL_EXTERN int              mecab_lattice_is_available(mecab_lattice_t *lattice);

/* Node accessors. */
MECAB_DLL_EXTERN mecab_node_t  *mecab_lattice_get_bos_node(mecab_lattice_t *lattice);
MECAB_DLL_EXTERN mecab_node_t  *mecab_lattice_get_eos_node(mecab_lattice_t *lattice);
MECAB_DLL_EXTERN mecab_node_t **mecab_lattice_get_all_begin_nodes(mecab_lattice_t *lattice);
MECAB_DLL_EXTERN mecab_node_t **mecab_lattice_get_all_end_nodes(mecab_lattice_t *lattice);
MECAB_DLL_EXTERN mecab_node_t  *mecab_lattice_get_begin_nodes(mecab_lattice_t *lattice,
                                                              size_t pos);
MECAB_DLL_EXTERN mecab_node_t  *mecab_lattice_get_end_nodes(mecab_lattice_t *lattice,
                                                            size_t pos);

/* Sentence accessors; the "2" setter takes an explicit length. */
MECAB_DLL_EXTERN const char *mecab_lattice_get_sentence(mecab_lattice_t *lattice);
MECAB_DLL_EXTERN void        mecab_lattice_set_sentence(mecab_lattice_t *lattice,
                                                        const char *sentence);
MECAB_DLL_EXTERN void        mecab_lattice_set_sentence2(mecab_lattice_t *lattice,
                                                         const char *sentence,
                                                         size_t len);
MECAB_DLL_EXTERN size_t      mecab_lattice_get_size(mecab_lattice_t *lattice);

/*
  Numeric parameters.  Note that theta is a double here, whereas
  mecab_get_theta / mecab_set_theta on mecab_t use float.
*/
MECAB_DLL_EXTERN double mecab_lattice_get_z(mecab_lattice_t *lattice);
MECAB_DLL_EXTERN void   mecab_lattice_set_z(mecab_lattice_t *lattice, double Z);
MECAB_DLL_EXTERN double mecab_lattice_get_theta(mecab_lattice_t *lattice);
MECAB_DLL_EXTERN void   mecab_lattice_set_theta(mecab_lattice_t *lattice, double theta);

MECAB_DLL_EXTERN int mecab_lattice_next(mecab_lattice_t *lattice);

/* Request type accessors (see the MECAB_ONE_BEST ... enum above). */
MECAB_DLL_EXTERN int  mecab_lattice_get_request_type(mecab_lattice_t *lattice);
MECAB_DLL_EXTERN int  mecab_lattice_has_request_type(mecab_lattice_t *lattice,
                                                     int request_type);
MECAB_DLL_EXTERN void mecab_lattice_set_request_type(mecab_lattice_t *lattice,
                                                     int request_type);
MECAB_DLL_EXTERN void mecab_lattice_add_request_type(mecab_lattice_t *lattice,
                                                     int request_type);
MECAB_DLL_EXTERN void mecab_lattice_remove_request_type(mecab_lattice_t *lattice,
                                                        int request_type);

MECAB_DLL_EXTERN mecab_node_t *mecab_lattice_new_node(mecab_lattice_t *lattice);

/* String output; the "2" variants take a buffer pointer and its size. */
MECAB_DLL_EXTERN const char *mecab_lattice_tostr(mecab_lattice_t *lattice);
MECAB_DLL_EXTERN const char *mecab_lattice_tostr2(mecab_lattice_t *lattice,
                                                  char *buf, size_t size);
MECAB_DLL_EXTERN const char *mecab_lattice_nbest_tostr(mecab_lattice_t *lattice,
                                                       size_t N);
MECAB_DLL_EXTERN const char *mecab_lattice_nbest_tostr2(mecab_lattice_t *lattice,
                                                        size_t N,
                                                        char *buf, size_t size);

/* Constraint accessors (see the MECAB_ANY_BOUNDARY ... enum above). */
MECAB_DLL_EXTERN int         mecab_lattice_has_constraint(mecab_lattice_t *lattice);
MECAB_DLL_EXTERN int         mecab_lattice_get_boundary_constraint(mecab_lattice_t *lattice,
                                                                   size_t pos);
MECAB_DLL_EXTERN const char *mecab_lattice_get_feature_constraint(mecab_lattice_t *lattice,
                                                                  size_t pos);
MECAB_DLL_EXTERN void        mecab_lattice_set_boundary_constraint(mecab_lattice_t *lattice,
                                                                   size_t pos,
                                                                   int boundary_type);
MECAB_DLL_EXTERN void        mecab_lattice_set_feature_constraint(mecab_lattice_t *lattice,
                                                                  size_t begin_pos,
                                                                  size_t end_pos,
                                                                  const char *feature);

MECAB_DLL_EXTERN const char *mecab_lattice_strerror(mecab_lattice_t *lattice);

/* model interface */
MECAB_DLL_EXTERN mecab_model_t   *mecab_model_new(int argc, char **argv);
MECAB_DLL_EXTERN mecab_model_t   *mecab_model_new2(const char *arg);
MECAB_DLL_EXTERN void             mecab_model_destroy(mecab_model_t *model);
MECAB_DLL_EXTERN mecab_t         *mecab_model_new_tagger(mecab_model_t *model);
MECAB_DLL_EXTERN mecab_lattice_t *mecab_model_new_lattice(mecab_model_t *model);
MECAB_DLL_EXTERN int              mecab_model_swap(mecab_model_t *model,
                                                   mecab_model_t *new_model);
MECAB_DLL_EXTERN const mecab_dictionary_info_t *mecab_model_dictionary_info(
    mecab_model_t *model);
MECAB_DLL_EXTERN int              mecab_model_transition_cost(mecab_model_t *model,
                                                              unsigned short rcAttr,
                                                              unsigned short lcAttr);
MECAB_DLL_EXTERN mecab_node_t    *mecab_model_lookup(mecab_model_t *model,
                                                     const char *begin,
                                                     const char *end,
                                                     mecab_lattice_t *lattice);

/* static functions */
MECAB_DLL_EXTERN int mecab_do(int argc, char **argv);
MECAB_DLL_EXTERN int mecab_dict_index(int argc, char **argv);
MECAB_DLL_EXTERN int mecab_dict_gen(int argc, char **argv);
MECAB_DLL_EXTERN int mecab_cost_train(int argc, char **argv);
MECAB_DLL_EXTERN int mecab_system_eval(int argc, char **argv);
MECAB_DLL_EXTERN int mecab_test_gen(int argc, char **argv);
#endif  /* SWIG */

#ifdef __cplusplus
}
#endif

/* C++ interface */
#ifdef __cplusplus

namespace MeCab {
typedef struct mecab_dictionary_info_t DictionaryInfo;
typedef struct mecab_path_t            Path;
typedef struct mecab_node_t            Node;

template <typename N, typename P> class Allocator;
class Tagger;

/*
  Abstract lattice interface.  Every member except create() and the
  destructor is pure virtual.  Overloads wrapped in `#ifndef SWIG` are hidden
  from SWIG.
*/
class MECAB_DLL_CLASS_EXTERN Lattice {
public:
  virtual void clear() = 0;
  virtual bool is_available() const = 0;

  virtual Node *bos_node() const = 0;
  virtual Node *eos_node() const = 0;

#ifndef SWIG
  virtual Node **begin_nodes() const = 0;
  virtual Node **end_nodes() const = 0;
#endif

  virtual Node *end_nodes(size_t pos) const = 0;
  virtual Node *begin_nodes(size_t pos) const = 0;

  virtual const char *sentence() const = 0;
  virtual void set_sentence(const char *sentence) = 0;

#ifndef SWIG
  virtual void set_sentence(const char *sentence, size_t len) = 0;
#endif

  virtual size_t size() const = 0;

  virtual void set_Z(double Z) = 0;
  virtual double Z() const = 0;

  virtual void set_theta(float theta) = 0;
  virtual float theta() const = 0;

  virtual bool next() = 0;

  /* Request type accessors (see the MECAB_ONE_BEST ... enum above). */
  virtual int request_type() const = 0;
  virtual bool has_request_type(int request_type) const = 0;
  virtual void set_request_type(int request_type) = 0;
  virtual void add_request_type(int request_type) = 0;
  virtual void remove_request_type(int request_type) = 0;

#ifndef SWIG
  virtual Allocator<Node, Path> *allocator() const = 0;
#endif

  virtual Node *newNode() = 0;

  virtual const char *toString() = 0;
  virtual const char *toString(const Node *node) = 0;
  virtual const char *enumNBestAsString(size_t N) = 0;

#ifndef SWIG
  /* Overloads taking a buffer pointer and its size. */
  virtual const char *toString(char *buf, size_t size) = 0;
  virtual const char *toString(const Node *node,
                               char *buf, size_t size) = 0;
  virtual const char *enumNBestAsString(size_t N, char *buf, size_t size) = 0;
#endif

  /* Constraint accessors (see the MECAB_ANY_BOUNDARY ... enum above). */
  virtual bool has_constraint() const = 0;
  virtual int boundary_constraint(size_t pos) const = 0;
  virtual const char *feature_constraint(size_t pos) const = 0;
  virtual void set_boundary_constraint(size_t pos,
                                       int boundary_constraint_type) = 0;
  virtual void set_feature_constraint(
      size_t begin_pos, size_t end_pos,
      const char *feature) = 0;

  virtual const char *what() const = 0;
  virtual void set_what(const char *str) = 0;

#ifndef SWIG
  static Lattice *create();
#endif

  virtual ~Lattice() {}
};

/*
  Abstract model interface.  createTagger() / createLattice() return raw
  pointers; this header does not show who owns them.
*/
class MECAB_DLL_CLASS_EXTERN Model {
public:
  virtual const DictionaryInfo *dictionary_info() const = 0;

  virtual int transition_cost(unsigned short rcAttr,
                              unsigned short lcAttr) const = 0;

  virtual Node *lookup(const char *begin, const char *end,
                       Lattice *lattice) const = 0;

  virtual Tagger *createTagger() const = 0;
  virtual Lattice *createLattice() const = 0;

  virtual bool swap(Model *model) = 0;

  static const char *version();

  virtual ~Model() {}

  /*
    NOTE(review): `SIWG` looks like a misspelling of `SWIG`.  Nothing in this
    header defines `SIWG`, so as written these declarations are visible to
    every consumer, SWIG included.  Kept as-is because correcting the spelling
    would change what SWIG-generated bindings expose -- confirm before fixing.
  */
#ifndef SIWG
  static Model *create(int argc, char **argv);
  static Model *create(const char *arg);
#endif
};

/*
  Abstract tagger interface.  The static parse() takes a Model and a Lattice;
  the virtual parse(Lattice *) is the only const parsing member.
*/
class MECAB_DLL_CLASS_EXTERN Tagger {
public:
  static bool parse(const Model &model, Lattice *lattice);

  virtual bool parse(Lattice *lattice) const = 0;

  virtual const char *parse(const char *str) = 0;
  virtual const Node *parseToNode(const char *str) = 0;
  virtual const char *parseNBest(size_t N, const char *str) = 0;
  virtual bool parseNBestInit(const char *str) = 0;
  virtual const Node *nextNode() = 0;
  virtual const char *next() = 0;
  virtual const char *formatNode(const Node *node) = 0;

#ifndef SWIG
  /* Overloads taking explicit lengths and/or an output buffer (ostr / olen). */
  virtual const char *parse(const char *str, size_t len,
                            char *ostr, size_t olen) = 0;
  virtual const char *parse(const char *str, size_t len) = 0;
  virtual const Node *parseToNode(const char *str, size_t len) = 0;
  virtual const char *parseNBest(size_t N, const char *str, size_t len) = 0;
  virtual bool parseNBestInit(const char *str, size_t len) = 0;
  virtual const char *next(char *ostr, size_t olen) = 0;
  virtual const char *parseNBest(size_t N, const char *str,
                                 size_t len, char *ostr, size_t olen) = 0;
  virtual const char *formatNode(const Node *node,
                                 char *ostr, size_t olen) = 0;
#endif

  virtual void set_request_type(int request_type) = 0;
  virtual int request_type() const = 0;

  virtual bool partial() const = 0;
  virtual void set_partial(bool partial) = 0;

  virtual int lattice_level() const = 0;
  virtual void set_lattice_level(int level) = 0;

  virtual bool all_morphs() const = 0;
  virtual void set_all_morphs(bool all_morphs) = 0;

  virtual void set_theta(float theta) = 0;
  virtual float theta() const = 0;

  virtual const DictionaryInfo *dictionary_info() const = 0;

  virtual const char *what() const = 0;

  virtual ~Tagger() {}

  /*
    NOTE(review): `SIWG` looks like a misspelling of `SWIG`; see the matching
    note in class Model.  Kept as-is to avoid changing SWIG-visible API.
  */
#ifndef SIWG
  static Tagger *create(int argc, char **argv);
  static Tagger *create(const char *arg);
#endif

  static const char *version();
};

#ifndef SWIG
/* Free-function factories and their matching delete functions. */
MECAB_DLL_EXTERN Lattice *createLattice();
MECAB_DLL_EXTERN Model   *createModel(int argc, char **argv);
MECAB_DLL_EXTERN Model   *createModel(const char *arg);
MECAB_DLL_EXTERN Tagger  *createTagger(int argc, char **argv);
MECAB_DLL_EXTERN Tagger  *createTagger(const char *arg);

MECAB_DLL_EXTERN void deleteLattice(Lattice *lattice);
MECAB_DLL_EXTERN void deleteModel(Model *model);
MECAB_DLL_EXTERN void deleteTagger(Tagger *tagger);

MECAB_DLL_EXTERN const char *getLastError();
MECAB_DLL_EXTERN const char *getTaggerError();
#endif
}
#endif
#endif  /* MECAB_MECAB_H_ */