Hubbub
initial.c
Go to the documentation of this file.
1 /*
2  * This file is part of Hubbub.
3  * Licensed under the MIT License,
4  * http://www.opensource.org/licenses/mit-license.php
5  * Copyright 2008 John-Mark Bell <jmb@netsurf-browser.org>
6  * Copyright 2008 Andrew Sidwell <takkaria@netsurf-browser.org>
7  */
8 
9 #include <assert.h>
10 #include <string.h>
11 
12 #include "treebuilder/modes.h"
13 #include "treebuilder/internal.h"
15 #include "utils/utils.h"
16 #include "utils/string.h"
17 
18 
#define S(s)	{ s, sizeof s - 1 }

/**
 * Public identifier prefixes which select full quirks mode.
 *
 * Each entry pairs a string literal with its length in bytes (the
 * terminating NUL is not counted). lookup_full_quirks() scans the whole
 * table and compares each entry, case-insensitively, against the start of
 * the doctype's public identifier, so the order of entries is not
 * significant.
 *
 * The table is only ever read, and only from this file, hence static const.
 */
static const struct {
	const char *name;	/**< Prefix text */
	size_t len;		/**< Length of prefix, in bytes */
} public_doctypes[] = {
	S("+//Silmaril//dtd html Pro v0r11 19970101//"),
	S("-//AdvaSoft Ltd//DTD HTML 3.0 asWedit + extensions//"),
	S("-//AS//DTD HTML 3.0 asWedit + extensions//"),
	S("-//IETF//DTD HTML 2.0 Level 1//"),
	S("-//IETF//DTD HTML 2.0 Level 2//"),
	S("-//IETF//DTD HTML 2.0 Strict Level 1//"),
	S("-//IETF//DTD HTML 2.0 Strict Level 2//"),
	S("-//IETF//DTD HTML 2.0 Strict//"),
	S("-//IETF//DTD HTML 2.0//"),
	S("-//IETF//DTD HTML 2.1E//"),
	S("-//IETF//DTD HTML 3.0//"),
	S("-//IETF//DTD HTML 3.2 Final//"),
	S("-//IETF//DTD HTML 3.2//"),
	S("-//IETF//DTD HTML 3//"),
	S("-//IETF//DTD HTML Level 0//"),
	S("-//IETF//DTD HTML Level 1//"),
	S("-//IETF//DTD HTML Level 2//"),
	S("-//IETF//DTD HTML Level 3//"),
	S("-//IETF//DTD HTML Strict Level 0//"),
	S("-//IETF//DTD HTML Strict Level 1//"),
	S("-//IETF//DTD HTML Strict Level 2//"),
	S("-//IETF//DTD HTML Strict Level 3//"),
	S("-//IETF//DTD HTML Strict//"),
	S("-//IETF//DTD HTML//"),
	S("-//Metrius//DTD Metrius Presentational//"),
	S("-//Microsoft//DTD Internet Explorer 2.0 HTML Strict//"),
	S("-//Microsoft//DTD Internet Explorer 2.0 HTML//"),
	S("-//Microsoft//DTD Internet Explorer 2.0 Tables//"),
	S("-//Microsoft//DTD Internet Explorer 3.0 HTML Strict//"),
	S("-//Microsoft//DTD Internet Explorer 3.0 HTML//"),
	S("-//Microsoft//DTD Internet Explorer 3.0 Tables//"),
	S("-//Netscape Comm. Corp.//DTD HTML//"),
	S("-//Netscape Comm. Corp.//DTD Strict HTML//"),
	S("-//O'Reilly and Associates//DTD HTML 2.0//"),
	S("-//O'Reilly and Associates//DTD HTML Extended 1.0//"),
	S("-//O'Reilly and Associates//DTD HTML Extended Relaxed 1.0//"),
	S("-//SoftQuad Software//DTD HoTMetaL PRO 6.0::19990601::extensions to HTML 4.0//"),
	S("-//SoftQuad//DTD HoTMetaL PRO 4.0::19971010::extensions to HTML 4.0//"),
	S("-//Spyglass//DTD HTML 2.0 Extended//"),
	S("-//SQ//DTD HTML 2.0 HoTMetaL + extensions//"),
	S("-//Sun Microsystems Corp.//DTD HotJava HTML//"),
	S("-//Sun Microsystems Corp.//DTD HotJava Strict HTML//"),
	S("-//W3C//DTD HTML 3 1995-03-24//"),
	S("-//W3C//DTD HTML 3.2 Draft//"),
	S("-//W3C//DTD HTML 3.2 Final//"),
	S("-//W3C//DTD HTML 3.2//"),
	S("-//W3C//DTD HTML 3.2S Draft//"),
	S("-//W3C//DTD HTML 4.0 Frameset//"),
	S("-//W3C//DTD HTML 4.0 Transitional//"),
	S("-//W3C//DTD HTML Experimental 19960712//"),
	S("-//W3C//DTD HTML Experimental 970421//"),
	S("-//W3C//DTD W3 HTML//"),
	S("-//W3O//DTD W3 HTML 3.0//"),
	/* Also listed as quirks triggers by the HTML parsing specification */
	S("-//WebTechs//DTD Mozilla HTML 2.0//"),
	S("-//WebTechs//DTD Mozilla HTML//"),
};

#undef S
81 
82 
/**
 * Check if one string starts with another.
 *
 * \param a      String to examine
 * \param a_len  Length of a, in bytes
 * \param b      Prefix to look for
 * \param b_len  Length of b, in bytes
 * \return true if the first b_len bytes of a match b case-insensitively,
 *         false otherwise (including when a is shorter than b)
 */
static bool starts_with(const uint8_t *a, size_t a_len, const uint8_t *b,
		size_t b_len)
{
	/* Only the leading b_len bytes of a take part in the comparison,
	 * so a must be at least that long before it is attempted. */
	return a_len >= b_len && hubbub_string_match_ci(a, b_len, b, b_len);
}
100 
101 
109 static bool lookup_full_quirks(hubbub_treebuilder *treebuilder,
110  const hubbub_doctype *cdoc)
111 {
112  size_t i;
113 
114  const uint8_t *name = cdoc->name.ptr;
115  size_t name_len = cdoc->name.len;
116 
117  const uint8_t *public_id = cdoc->public_id.ptr;
118  size_t public_id_len = cdoc->public_id.len;
119 
120  const uint8_t *system_id = cdoc->system_id.ptr;
121  size_t system_id_len = cdoc->system_id.len;
122 
123  UNUSED(treebuilder);
124 
125 #define S(s) (uint8_t *) s, sizeof s - 1
126 
127  /* Check the name is "HTML" (case-insensitively) */
128  if (!hubbub_string_match_ci(name, name_len, S("HTML")))
129  return true;
130 
131  /* No public id means not-quirks */
132  if (cdoc->public_missing)
133  return false;
134 
135  for (i = 0; i < sizeof public_doctypes / sizeof public_doctypes[0]; i++)
136  {
137  if (starts_with(public_id, public_id_len,
138  (uint8_t *) public_doctypes[i].name,
139  public_doctypes[i].len)) {
140  return true;
141  }
142  }
143 
144  if (hubbub_string_match_ci(public_id, public_id_len,
145  S("-//W3O//DTD W3 HTML Strict 3.0//EN//")) ||
146  hubbub_string_match_ci(public_id, public_id_len,
147  S("-/W3C/DTD HTML 4.0 Transitional/EN")) ||
148  hubbub_string_match_ci(public_id, public_id_len,
149  S("HTML")) ||
150  hubbub_string_match_ci(system_id, system_id_len,
151  S("http://www.ibm.com/data/dtd/v11/ibmxhtml1-transitional.dtd"))) {
152  return true;
153  }
154 
155  if (cdoc->system_missing == true &&
156  (starts_with(public_id, public_id_len,
157  S("-//W3C//DTD HTML 4.01 Frameset//")) ||
158  starts_with(public_id, public_id_len,
159  S("-//W3C//DTD HTML 4.01 Transitional//")))) {
160  return true;
161  }
162 
163 #undef S
164 
165  return false;
166 }
167 
168 
176 static bool lookup_limited_quirks(hubbub_treebuilder *treebuilder,
177  const hubbub_doctype *cdoc)
178 {
179  const uint8_t *public_id = cdoc->public_id.ptr;
180  size_t public_id_len = cdoc->public_id.len;
181 
182  UNUSED(treebuilder);
183 
184 #define S(s) (uint8_t *) s, sizeof s - 1
185 
186  if (starts_with(public_id, public_id_len,
187  S("-//W3C//DTD XHTML 1.0 Frameset//")) ||
188  starts_with(public_id, public_id_len,
189  S("-//W3C//DTD XHTML 1.0 Transitional//"))) {
190  return true;
191  }
192 
193  if (cdoc->system_missing == false &&
194  (starts_with(public_id, public_id_len,
195  S("-//W3C//DTD HTML 4.01 Frameset//")) ||
196  starts_with(public_id, public_id_len,
197  S("-//W3C//DTD HTML 4.01 Transitional//")))) {
198  return true;
199  }
200 
201 #undef S
202 
203  return false;
204 }
205 
206 
215  const hubbub_token *token)
216 {
217  hubbub_error err = HUBBUB_OK;
218 
219  switch (token->type) {
221  err = process_characters_expect_whitespace(treebuilder, token,
222  false);
223  if (err == HUBBUB_REPROCESS) {
226  treebuilder->tree_handler->set_quirks_mode(
227  treebuilder->tree_handler->ctx,
229  treebuilder->context.mode = BEFORE_HTML;
230  }
231  break;
233  err = process_comment_append(treebuilder, token,
234  treebuilder->context.document);
235  break;
237  {
238  void *doctype, *appended;
239  const hubbub_doctype *cdoc;
240 
243  err = treebuilder->tree_handler->create_doctype(
244  treebuilder->tree_handler->ctx,
245  &token->data.doctype,
246  &doctype);
247  if (err != HUBBUB_OK)
248  return err;
249 
250  /* Append to Document node */
251  err = treebuilder->tree_handler->append_child(
252  treebuilder->tree_handler->ctx,
253  treebuilder->context.document,
254  doctype, &appended);
255 
256  treebuilder->tree_handler->unref_node(
257  treebuilder->tree_handler->ctx,
258  doctype);
259 
260  if (err != HUBBUB_OK)
261  return err;
262 
263  treebuilder->tree_handler->unref_node(
264  treebuilder->tree_handler->ctx, appended);
265 
266  cdoc = &token->data.doctype;
267 
268  /* Work out whether we need quirks mode or not */
269  if (cdoc->force_quirks == true ||
270  lookup_full_quirks(treebuilder, cdoc)) {
271  treebuilder->tree_handler->set_quirks_mode(
272  treebuilder->tree_handler->ctx,
274  } else if (lookup_limited_quirks(treebuilder, cdoc)) {
275  treebuilder->tree_handler->set_quirks_mode(
276  treebuilder->tree_handler->ctx,
278  }
279 
280  treebuilder->context.mode = BEFORE_HTML;
281  }
282  break;
285  case HUBBUB_TOKEN_EOF:
287  treebuilder->tree_handler->set_quirks_mode(
288  treebuilder->tree_handler->ctx,
290  err = HUBBUB_REPROCESS;
291  break;
292  }
293 
294  if (err == HUBBUB_REPROCESS) {
295  treebuilder->context.mode = BEFORE_HTML;
296  }
297 
298  return err;
299 }
300 
hubbub_error process_comment_append(hubbub_treebuilder *treebuilder, const hubbub_token *token, void *parent)
Process a comment token, appending it to the given parent.
Definition: treebuilder.c:421
hubbub_token_type type
The token type.
Definition: types.h:120
hubbub_tree_create_doctype create_doctype
Create doctype.
Definition: tree.h:275
void * ctx
Context pointer.
Definition: tree.h:292
Token data.
Definition: types.h:119
hubbub_tree_handler * tree_handler
Callback table.
Definition: internal.h:122
const uint8_t * ptr
Pointer to data.
Definition: types.h:77
Data for doctype token.
Definition: types.h:93
hubbub_error process_characters_expect_whitespace(hubbub_treebuilder *treebuilder, const hubbub_token *token, bool insert_into_current_node)
Process a character token in cases where we expect only whitespace.
Definition: treebuilder.c:375
hubbub_string public_id
Doctype public identifier.
Definition: types.h:97
#define UNUSED(x)
Definition: utils.h:25
bool system_missing
Whether the system id is missing.
Definition: types.h:99
insertion_mode mode
The current insertion mode.
Definition: internal.h:75
hubbub_doctype doctype
Definition: types.h:123
size_t len
Byte length of string.
Definition: types.h:78
bool hubbub_string_match_ci(const uint8_t *a, size_t a_len, const uint8_t *b, size_t b_len)
Check that one string is case-insensitively equal to another.
Definition: string.c:40
static bool starts_with(const uint8_t *a, size_t a_len, const uint8_t *b, size_t b_len)
Check if one string starts with another.
Definition: initial.c:91
const char * name
Definition: initial.c:22
hubbub_error handle_initial(hubbub_treebuilder *treebuilder, const hubbub_token *token)
Handle token in initial insertion mode.
Definition: initial.c:214
#define S(s)
Definition: initial.c:19
hubbub_treebuilder_context context
Our context.
Definition: internal.h:120
hubbub_error
Definition: errors.h:18
hubbub_string system_id
Doctype system identifier.
Definition: types.h:100
hubbub_string name
Doctype name.
Definition: types.h:94
hubbub_tree_unref_node unref_node
Unreference node.
Definition: tree.h:279
hubbub_tree_set_quirks_mode set_quirks_mode
Set quirks mode.
Definition: tree.h:289
No error.
Definition: errors.h:19
size_t len
Definition: initial.c:23
void * document
Pointer to the document node.
Definition: internal.h:93
struct @12 public_doctypes[]
union hubbub_token::@3 data
Type-specific data.
static bool lookup_limited_quirks(hubbub_treebuilder *treebuilder, const hubbub_doctype *cdoc)
Determine whether this doctype triggers limited quirks mode.
Definition: initial.c:176
bool force_quirks
Doctype force-quirks flag.
Definition: types.h:102
Treebuilder object.
Definition: internal.h:116
hubbub_tree_append_child append_child
Append child.
Definition: tree.h:280
bool public_missing
Whether the public id is missing.
Definition: types.h:96
static bool lookup_full_quirks(hubbub_treebuilder *treebuilder, const hubbub_doctype *cdoc)
Determine whether this doctype triggers full quirks mode.
Definition: initial.c:109