tokenizer.h
Go to the documentation of this file.
1 /*
2  * Copyright 2006-2008 The FLWOR Foundation.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #pragma once
18 #ifndef ZORBA_TOKENIZER_API_H
19 #define ZORBA_TOKENIZER_API_H
20 
21 #include <vector>
22 
23 #include <zorba/config.h>
24 #include <zorba/locale.h>
26 #include <zorba/internal/ztd.h>
27 
28 namespace zorba {
29 
30 class Item;
31 
32 ///////////////////////////////////////////////////////////////////////////////
33 
34 /**
35  * A %Tokenizer breaks a string into a stream of word tokens. Each token is
36  * assigned a token, sentence, and paragraph number.
37  *
38  * A %Tokenizer determines word and sentence boundaries automatically, but must
39  * be told when to increment the paragraph number.
40  */
41 class ZORBA_DLL_PUBLIC Tokenizer {
42 public:
44  ptr;
45 
46  typedef unsigned size_type;
47 
48  /////////////////////////////////////////////////////////////////////////////
49 
50  /**
51  * A %State contains inter-Tokenizer state: at present, the current token,
52  * sentence, and paragraph numbers.
53  */
54  struct State {
56 
57  value_type token; ///< Token number.
58  value_type sent; ///< Sentence number.
59  value_type para; ///< Paragraph number.
60 
61  /**
62  * Default constructor.
63  */
64  State();
65  };
66 
67  /////////////////////////////////////////////////////////////////////////////
68 
69  /**
70  * A %Callback is called once per token.
71  * This is only used internally by Zorba.
72  * You do not need to derive from this class.
73  */
74  class Callback {
75  public:
77 
78  virtual ~Callback();
79 
80  /**
81  * This member-function is called whenever an item that is being tokenized
82  * is entered or exited. The default implementation does nothing.
83  *
84  * @param item The item being entered or exited.
85  * @param entering If \c true, the item is being entered; if \c false, the
86  * item is being exited.
87  */
88  virtual void item( Item const &item, bool entering );
89 
90  /**
91  * This member-function is called once per token.
92  *
93  * @param utf8_s The UTF-8 token string. It is not null-terminated.
94  * @param utf8_len The number of bytes in the token string.
95  * @param lang The language of the token.
  * NOTE(review): no \a lang parameter is visible in the declaration below;
  * confirm against the authoritative header.
96  * @param token_no The token number. Token numbers start at 0.
97  * @param sent_no The sentence number. Sentence numbers start at 1.
98  * @param para_no The paragraph number. Paragraph numbers start at 1.
99  * @param item The Item this token is from, if any.
100  */
101  virtual void token( char const *utf8_s, size_type utf8_len,
103  size_type token_no, size_type sent_no,
104  size_type para_no, Item const *item = 0 ) = 0;
105  };
106 
107  /////////////////////////////////////////////////////////////////////////////
108 
109  /**
110  * Various properties of this %Tokenizer.
  *
  * NOTE(review): of the members documented below, only \c uri has a
  * declaration visible here; confirm the remaining member declarations
  * against the authoritative header.
111  */
112  struct Properties {
113  typedef std::vector<locale::iso639_1::type> languages_type;
114 
115  /**
116  * If \c true, XML comments separate tokens. For example,
117  * \c net&lt;!----&gt;work would be 2 tokens instead of 1.
118  */
120 
121  /**
122  * If \c true, XML elements separate tokens. For example,
123  * \c &lt;b&gt;B&lt;/b&gt;old would be 2 tokens instead of 1.
124  */
126 
127  /**
128  * If \c true, XML processing instructions separate tokens. For example,
129  * <code>net&lt;?PI pi?&gt;work</code> would be 2 tokens instead of 1.
130  */
132 
133  /**
134  * The set of languages supported.
135  */
137 
138  /**
139  * The URI that uniquely identifies this %Tokenizer.
140  */
141  char const* uri;
142  };
143 
144  /**
145  * Gets the Properties of this %Tokenizer.
146  *
147  * @param result The Properties to populate.
148  */
149  virtual void properties( Properties *result ) const = 0;
150 
151  /////////////////////////////////////////////////////////////////////////////
152 
153  /**
154  * Destroys this %Tokenizer.
155  * This function is called by Zorba when the %Tokenizer is no longer needed.
156  *
157  * If your TokenizerProvider dynamically allocates %Tokenizer objects, then
158  * the implementation can simply be (and usually is) <code>delete this</code>.
159  *
160  * If your TokenizerProvider returns a pointer to a static %Tokenizer object,
161  * then the implementation should do nothing.
162  */
163  virtual void destroy() const = 0;
164 
165  /**
166  * Gets this %Tokenizer's associated State.
167  *
168  * @return Returns said State.
169  */
170  State& state();
171 
172  /**
173  * Gets this %Tokenizer's associated State (read-only overload).
174  *
175  * @return Returns said State.
176  */
177  State const& state() const;
178 
179  /**
180  * Tokenizes the given node.
181  *
182  * @param node The node to tokenize.
183  * @param lang The default language to use.
184  * @param callback The Callback to call once per token.
185  */
186  void tokenize_node( Item const &node, locale::iso639_1::type lang,
187  Callback &callback );
188 
189  /**
190  * Tokenizes the given string.
191  *
192  * @param utf8_s The UTF-8 string to tokenize. It need not be
193  * null-terminated.
194  * @param utf8_len The number of bytes in the string to be tokenized.
195  * @param lang The language of the string.
196  * @param wildcards If \c true, allows XQuery wildcard syntax characters to
197  * be part of tokens.
198  * @param callback The Callback to call once per token.
199  * @param item The Item this string is from, if any.
200  */
201  virtual void tokenize_string( char const *utf8_s, size_type utf8_len,
202  locale::iso639_1::type lang, bool wildcards,
203  Callback &callback, Item const *item = 0 ) = 0;
204 
205  /////////////////////////////////////////////////////////////////////////////
206 
207 protected:
208  /**
209  * Constructs a %Tokenizer.
210  *
211  * @param state The State to use.
212  */
213  Tokenizer( State &state );
214 
215  /**
216  * Destroys a %Tokenizer.
217  */
218  virtual ~Tokenizer() = 0;
219 
220  /**
221  * Given an element, finds its \c xml:lang attribute, if any, and gets its
222  * value.
223  *
224  * @param element The element to check.
225  * @param lang A pointer to where to put the found language, if any.
226  * @return Returns \c true only if an \c xml:lang attribute is found and the
227  * value is a known language.
228  */
229  bool find_lang_attribute( Item const &element, locale::iso639_1::type *lang );
230 
231  /**
232  * This member-function is called whenever an item that is being tokenized is
233  * entered or exited.
234  *
235  * @param item The item being entered or exited.
236  * @param entering If \c true, the item is being entered; if \c false, the
237  * item is being exited.
238  */
239  virtual void item( Item const &item, bool entering );
240 
241  /**
242  * Tokenizes the given node and all of its child nodes, if any. For each
243  * node, it is required that this function call the item() member function of
244  * both this %Tokenizer and of the Callback twice, once each for entrance and
245  * exit.
246  *
247  * @param node The node to tokenize.
248  * @param lang The default language to use.
  * NOTE(review): no \a lang parameter is visible in the declaration below;
  * confirm against the authoritative header.
249  * @param callback The Callback to call per token.
250  * @param tokenize_acp If \c true, additionally tokenize all attribute,
251  * comment, and processing-instruction nodes encountered;
252  * if \c false, skip them.
253  */
254  virtual void tokenize_node_impl( Item const &node,
256  Callback &callback, bool tokenize_acp );
257 
258 private:
259  State *state_;
260 };
261 
// Binds this Tokenizer to the caller-supplied State: only the address of
// `state` is stored in state_; the State object itself is not copied.
// NOTE(review): presumably `state` must outlive this Tokenizer -- confirm
// against the callers.
262 inline Tokenizer::Tokenizer( State &state ) : state_( &state ) {
263 }
264 
266  return *state_;
267 }
268 
// Const overload of state(): yields a read-only reference to the State object
// that state_ points to.
269 inline Tokenizer::State const& Tokenizer::state() const {
270  return *state_;
271 }
272 
// Public entry point for node tokenization: forwards its arguments to
// tokenize_node_impl() with the final argument (tokenize_acp in that
// function's declaration) fixed to true.
// NOTE(review): `lang` is forwarded in the call below, but no `lang` parameter
// is visible in this signature as shown -- confirm against the authoritative
// header.
273 inline void Tokenizer::tokenize_node( Item const &item,
275  Callback &callback ) {
276  tokenize_node_impl( item, lang, callback, true );
277 }
278 
279 ///////////////////////////////////////////////////////////////////////////////
280 
281 /**
282  * A %TokenizerProvider provides a Tokenizer for a given language.
  * Implementations override getTokenizer(), its only pure virtual member.
283  */
284 class ZORBA_DLL_PUBLIC TokenizerProvider {
285 public:
286  virtual ~TokenizerProvider();
287 
288  /**
289  * Provides a %Tokenizer for the given language.
  *
  * Both \a state and \a t default to null; when called with only \a lang,
  * no %Tokenizer is handed back and the return value alone reports whether
  * \a lang is supported.
290  *
291  * @param lang The language of the text that the tokenizer will tokenize.
292  * @param state The State to use. If \c null, \a t is not set.
293  * @param t If not \c null, set to point to a Tokenizer for \a lang.
294  * @return Returns \c true only if this provider can provide a tokenizer for
295  * \a lang.
296  */
297  virtual bool getTokenizer( locale::iso639_1::type lang,
298  Tokenizer::State *state = 0,
299  Tokenizer::ptr *t = 0 ) const = 0;
300 };
301 
302 ///////////////////////////////////////////////////////////////////////////////
303 
304 } // namespace zorba
305 #endif /* ZORBA_TOKENIZER_API_H */
306 /* vim:set et sw=2 ts=2: */