Package cssutils :: Module codec
[hide private]
[frames | no frames]

Source Code for Module cssutils.codec

  1  #!/usr/bin/env python 
  2  """Python codec for CSS.""" 
  3  __docformat__ = 'restructuredtext' 
  4  __author__ = 'Walter Doerwald' 
  5  __version__ = '$Id: util.py 1114 2008-03-05 13:22:59Z cthedot $' 
  6   
  7  import codecs, marshal 
  8   
  9  # We're using bits to store all possible candidate encodings (or variants, i.e. 
 10  # we have two bits for the variants of UTF-16 and two for the 
 11  # variants of UTF-32). 
 12  # 
 13  # Prefixes for various CSS encodings 
 14  # UTF-8-SIG   xEF  xBB  xBF 
 15  # UTF-16 (LE) xFF  xFE ~x00|~x00 
 16  # UTF-16 (BE) xFE  xFF 
 17  # UTF-16-LE    @   x00   @   x00 
 18  # UTF-16-BE   x00   @ 
 19  # UTF-32 (LE) xFF  xFE  x00  x00 
 20  # UTF-32 (BE) x00  x00  xFE  xFF 
 21  # UTF-32-LE    @   x00  x00  x00 
 22  # UTF-32-BE   x00  x00  x00   @ 
 23  # CHARSET      @    c    h    a  ... 
 24   
 25   
def detectencoding_str(input, final=False):
    """
    Detect the encoding of the byte string ``input``, which contains the
    beginning of a CSS file. This function returns the detected encoding (or
    ``None`` if it hasn't got enough data), and a flag that indicates whether
    the encoding has been detected explicitly or implicitly. To detect the
    encoding the first few bytes are used (or if ``input`` is ASCII compatible
    and starts with a charset rule the encoding name from the rule). "Explicit"
    detection means that the bytes start with a BOM or a charset rule.

    If the encoding can't be detected yet, ``None`` is returned as the encoding.
    ``final`` specifies whether more data is available in later calls or not.
    If ``final`` is true, ``detectencoding_str()`` will never return ``None``
    as the encoding.

    As soon as ``input`` can no longer turn into a ``@charset "`` rule (it
    starts with ``@cha`` but then diverges), ``("utf-8", False)`` is returned
    immediately instead of waiting for the final call.
    """

    # A bit for every candidate
    CANDIDATE_UTF_8_SIG = 1
    CANDIDATE_UTF_16_AS_LE = 2
    CANDIDATE_UTF_16_AS_BE = 4
    CANDIDATE_UTF_16_LE = 8
    CANDIDATE_UTF_16_BE = 16
    CANDIDATE_UTF_32_AS_LE = 32
    CANDIDATE_UTF_32_AS_BE = 64
    CANDIDATE_UTF_32_LE = 128
    CANDIDATE_UTF_32_BE = 256
    CANDIDATE_CHARSET = 512

    candidates = 1023  # all candidates

    li = len(input)
    if li >= 1:
        # Check first byte
        c = input[0]
        if c != "\xef":
            candidates &= ~CANDIDATE_UTF_8_SIG
        if c != "\xff":
            candidates &= ~(CANDIDATE_UTF_32_AS_LE | CANDIDATE_UTF_16_AS_LE)
        if c != "\xfe":
            candidates &= ~CANDIDATE_UTF_16_AS_BE
        if c != "@":
            candidates &= ~(CANDIDATE_UTF_32_LE | CANDIDATE_UTF_16_LE | CANDIDATE_CHARSET)
        if c != "\x00":
            candidates &= ~(CANDIDATE_UTF_32_AS_BE | CANDIDATE_UTF_32_BE | CANDIDATE_UTF_16_BE)
        if li >= 2:
            # Check second byte
            c = input[1]
            if c != "\xbb":
                candidates &= ~CANDIDATE_UTF_8_SIG
            if c != "\xfe":
                candidates &= ~(CANDIDATE_UTF_16_AS_LE | CANDIDATE_UTF_32_AS_LE)
            if c != "\xff":
                candidates &= ~CANDIDATE_UTF_16_AS_BE
            if c != "\x00":
                candidates &= ~(CANDIDATE_UTF_16_LE | CANDIDATE_UTF_32_AS_BE | CANDIDATE_UTF_32_LE | CANDIDATE_UTF_32_BE)
            if c != "@":
                candidates &= ~CANDIDATE_UTF_16_BE
            if c != "c":
                candidates &= ~CANDIDATE_CHARSET
            if li >= 3:
                # Check third byte
                c = input[2]
                if c != "\xbf":
                    candidates &= ~CANDIDATE_UTF_8_SIG
                if c != "c":
                    candidates &= ~CANDIDATE_UTF_16_LE
                if c != "\x00":
                    candidates &= ~(CANDIDATE_UTF_32_AS_LE | CANDIDATE_UTF_32_LE | CANDIDATE_UTF_32_BE)
                if c != "\xfe":
                    candidates &= ~CANDIDATE_UTF_32_AS_BE
                if c != "h":
                    candidates &= ~CANDIDATE_CHARSET
                if li >= 4:
                    # Check fourth byte
                    c = input[3]
                    # A UTF-16 LE BOM followed by two zero bytes is really
                    # the UTF-32 LE BOM
                    if input[2:4] == "\x00\x00":
                        candidates &= ~CANDIDATE_UTF_16_AS_LE
                    if c != "\x00":
                        candidates &= ~(CANDIDATE_UTF_16_LE | CANDIDATE_UTF_32_AS_LE | CANDIDATE_UTF_32_LE)
                    if c != "\xff":
                        candidates &= ~CANDIDATE_UTF_32_AS_BE
                    if c != "@":
                        candidates &= ~CANDIDATE_UTF_32_BE
                    if c != "a":
                        candidates &= ~CANDIDATE_CHARSET
    if candidates == 0:
        # Nothing matched: neither a BOM nor the start of a charset rule
        return ("utf-8", False)
    if not (candidates & (candidates - 1)):  # only one candidate remaining
        if candidates == CANDIDATE_UTF_8_SIG and li >= 3:
            return ("utf-8-sig", True)
        elif candidates == CANDIDATE_UTF_16_AS_LE and li >= 2:
            return ("utf-16", True)
        elif candidates == CANDIDATE_UTF_16_AS_BE and li >= 2:
            return ("utf-16", True)
        elif candidates == CANDIDATE_UTF_16_LE and li >= 4:
            return ("utf-16-le", False)
        elif candidates == CANDIDATE_UTF_16_BE and li >= 2:
            return ("utf-16-be", False)
        elif candidates == CANDIDATE_UTF_32_AS_LE and li >= 4:
            return ("utf-32", True)
        elif candidates == CANDIDATE_UTF_32_AS_BE and li >= 4:
            return ("utf-32", True)
        elif candidates == CANDIDATE_UTF_32_LE and li >= 4:
            return ("utf-32-le", False)
        elif candidates == CANDIDATE_UTF_32_BE and li >= 4:
            return ("utf-32-be", False)
        elif candidates == CANDIDATE_CHARSET and li >= 4:
            prefix = '@charset "'
            head = input[:len(prefix)]
            if head == prefix:
                pos = input.find('"', len(prefix))
                if pos >= 0:
                    return (input[len(prefix):pos], True)
                # closing quote not seen yet => fall through
            elif not prefix.startswith(head):
                # The input has diverged from the prefix, so it can never
                # become a charset rule: no need to wait for more data
                return ("utf-8", False)
    # if this is the last call, and we haven't determined an encoding yet,
    # we default to UTF-8
    if final:
        return ("utf-8", False)
    return (None, False)  # don't know yet


def detectencoding_unicode(input, final=False):
    """
    Detect the encoding of the unicode string ``input``, which contains the
    beginning of a CSS file. The encoding is detected from the charset rule
    at the beginning of ``input``. If there is no charset rule, ``"utf-8"``
    will be returned.

    The return value is a tuple ``(encoding, explicit)``; ``explicit`` is
    true only if the name was taken from a complete charset rule.

    If the encoding can't be detected yet, ``None`` is returned. ``final``
    specifies whether more data will be available in later calls or not. If
    ``final`` is true, ``detectencoding_unicode()`` will never return ``None``.
    """
    prefix = u'@charset "'
    if input.startswith(prefix):
        pos = input.find(u'"', len(prefix))
        if pos >= 0:
            return (input[len(prefix):pos], True)
        if final:
            # The rule was started but its closing quote will never arrive,
            # so honour the "never None when final" contract
            return ("utf-8", False)
    elif final or not prefix.startswith(input):
        # if this is the last call, and we haven't determined an encoding yet,
        # (or the string definitely doesn't start with prefix) we default to UTF-8
        return ("utf-8", False)
    return (None, False)  # don't know yet


168 -def _fixencoding(input, encoding, final=False):
169 """ 170 Replace the name of the encoding in the charset rule at the beginning of 171 ``input`` with ``encoding``. If ``input`` doesn't starts with a charset 172 rule, ``input`` will be returned unmodified. 173 174 If the encoding can't be found yet, ``None`` is returned. ``final`` 175 specifies whether more data will be available in later calls or not. 176 If ``final`` is true, ``_fixencoding()`` will never return ``None``. 177 """ 178 prefix = u'@charset "' 179 if len(input) > len(prefix): 180 if input.startswith(prefix): 181 pos = input.find(u'"', len(prefix)) 182 if pos >= 0: 183 if encoding.replace("_", "-").lower() == "utf-8-sig": 184 encoding = u"utf-8" 185 return prefix + encoding + input[pos:] 186 # we haven't seen the end of the encoding name yet => fall through 187 else: 188 return input # doesn't start with prefix, so nothing to fix 189 elif not prefix.startswith(input) or final: 190 # can't turn out to be a @charset rule later (or there is no "later") 191 return input 192 if final: 193 return input 194 return None # don't know yet
195 196
def decode(input, errors="strict", encoding=None, force=True):
    """
    Decode the complete byte string ``input`` and return a tuple
    ``(unicode_output, consumed)``.

    If ``encoding`` is ``None`` the encoding is detected from ``input``. If
    ``force`` is false, an encoding that is declared explicitly in ``input``
    (BOM or charset rule) takes precedence over ``encoding``. The charset
    rule in the output is rewritten to name the encoding that was used.

    Raises ``ValueError`` if the detected encoding is ``css`` (in any case).
    """
    if encoding is None or not force:
        (_encoding, explicit) = detectencoding_str(input, True)
        # Codec names are looked up case-insensitively, so "CSS" would
        # resolve to this codec again and recurse
        if _encoding.lower() == "css":
            raise ValueError("css not allowed as encoding name")
        if (explicit and not force) or encoding is None:  # Take the encoding from the input
            encoding = _encoding
    (input, consumed) = codecs.getdecoder(encoding)(input, errors)
    return (_fixencoding(input, unicode(encoding), True), consumed)


def encode(input, errors="strict", encoding=None):
    """
    Encode the complete unicode string ``input`` and return a tuple
    ``(encoded_output, consumed)``.

    If ``encoding`` is ``None`` the encoding named in the charset rule of
    ``input`` is used (UTF-8 if there is none); otherwise the charset rule
    in ``input`` is rewritten to name ``encoding``. For ``utf-8-sig`` the
    rule is written as ``utf-8``.

    Raises ``ValueError`` if the encoding is ``css`` (in any case).
    """
    consumed = len(input)
    if encoding is None:
        encoding = detectencoding_unicode(input, True)[0]
        if encoding is None:
            # Defensive: an unterminated charset rule yields no name
            encoding = "utf-8"
        if encoding.replace("_", "-").lower() == "utf-8-sig":
            input = _fixencoding(input, u"utf-8", True)
    else:
        input = _fixencoding(input, unicode(encoding), True)
    # Codec names are looked up case-insensitively, so "CSS" would resolve
    # to this codec again and recurse
    if encoding.lower() == "css":
        raise ValueError("css not allowed as encoding name")
    encoder = codecs.getencoder(encoding)
    return (encoder(input, errors)[0], consumed)


222 -def _bytes2int(bytes):
223 # Helper: convert an 8 bit string into an ``int``. 224 i = 0 225 for byte in bytes: 226 i = (i<<8) + ord(byte) 227 return i
228 229
230 -def _int2bytes(i):
231 # Helper: convert an ``int`` into an 8-bit string. 232 v = [] 233 while i: 234 v.insert(0, chr(i&0xff)) 235 i >>= 8 236 return "".join(v)


if hasattr(codecs, "IncrementalDecoder"):
    class IncrementalDecoder(codecs.IncrementalDecoder):
        """
        Incremental decoder for CSS: detects the encoding from the first
        bytes of the input (unless one is forced), delegates to the
        incremental decoder of that encoding and rewrites the charset rule
        in the decoded output to name the encoding that was used.
        """

        def __init__(self, errors="strict", encoding=None, force=True):
            # ``self.decoder`` must exist before the base class constructor
            # runs, because that assigns ``self.errors`` and the property
            # setter below reads ``self.decoder``
            self.decoder = None
            self.encoding = encoding
            self.force = force
            codecs.IncrementalDecoder.__init__(self, errors)
            # Store ``errors`` somewhere else,
            # because we have to hide it in a property
            self._errors = errors
            self.buffer = ""
            self.headerfixed = False

        def iterdecode(self, input):
            """
            Decode the byte strings produced by the iterable ``input`` and
            yield the non-empty decoded chunks.
            """
            for part in input:
                result = self.decode(part, False)
                if result:
                    yield result
            result = self.decode("", True)
            if result:
                yield result

        def decode(self, input, final=False):
            """
            Decode ``input`` and return the decoded unicode string, which is
            empty as long as the encoding or the charset rule is undecided.
            Raises ``ValueError`` if the detected encoding is ``css``.
            """
            # We're doing basically the same as a ``BufferedIncrementalDecoder``,
            # but since the buffer is only relevant until the encoding has been
            # detected (in which case the buffer of the underlying codec might
            # kick in), we're implementing buffering ourselves to avoid some
            # overhead.
            if self.decoder is None:
                input = self.buffer + input
                # Do we have to detect the encoding from the input?
                if self.encoding is None or not self.force:
                    (encoding, explicit) = detectencoding_str(input, final)
                    if encoding is None:  # no encoding determined yet
                        self.buffer = input  # retry the complete input on the next call
                        return u""  # no encoding determined yet, so no output
                    elif encoding.lower() == "css":
                        # codec lookup is case-insensitive, so any spelling
                        # would resolve to this codec again
                        raise ValueError("css not allowed as encoding name")
                    if (explicit and not self.force) or self.encoding is None:  # Take the encoding from the input
                        self.encoding = encoding
                self.buffer = ""  # drop buffer, as the decoder might keep its own
                decoder = codecs.getincrementaldecoder(self.encoding)
                self.decoder = decoder(self._errors)
            if self.headerfixed:
                return self.decoder.decode(input, final)
            # If we haven't fixed the header yet,
            # the content of ``self.buffer`` is a ``unicode`` object
            output = self.buffer + self.decoder.decode(input, final)
            encoding = self.encoding
            if encoding.replace("_", "-").lower() == "utf-8-sig":
                encoding = "utf-8"
            newoutput = _fixencoding(output, unicode(encoding), final)
            if newoutput is None:
                # retry fixing the @charset rule (but keep the decoded stuff)
                self.buffer = output
                return u""
            self.headerfixed = True
            return newoutput

        def reset(self):
            """Drop the wrapped decoder, the buffer and the header flag."""
            # NOTE(review): ``self.encoding`` keeps a previously detected
            # value, so detection may be skipped after a reset -- confirm
            # whether that is intended
            codecs.IncrementalDecoder.reset(self)
            self.decoder = None
            self.buffer = ""
            self.headerfixed = False

        def _geterrors(self):
            return self._errors

        def _seterrors(self, errors):
            # Setting ``errors`` must be done on the real decoder too
            if self.decoder is not None:
                self.decoder.errors = errors
            self._errors = errors
        errors = property(_geterrors, _seterrors)

        def getstate(self):
            """
            Return the state as ``("", number)``; the number is the
            marshalled tuple ``(encoding, buffer, headerfixed, hasdecoder,
            decoderstate)`` converted to an ``int``.
            """
            if self.decoder is not None:
                state = (self.encoding, self.buffer, self.headerfixed, True, self.decoder.getstate())
            else:
                state = (self.encoding, self.buffer, self.headerfixed, False, None)
            return ("", _bytes2int(marshal.dumps(state)))

        def setstate(self, state):
            """Restore a state that has been returned by ``getstate()``."""
            # Reverse ``getstate()``: number -> marshalled bytes -> tuple
            state = marshal.loads(_int2bytes(state[1]))  # ignore buffered input
            self.encoding = state[0]
            self.buffer = state[1]
            self.headerfixed = state[2]
            if state[3]:  # flag is ``True``/``False``, never ``None``
                self.decoder = codecs.getincrementaldecoder(self.encoding)(self._errors)
                self.decoder.setstate(state[4])
            else:
                self.decoder = None


if hasattr(codecs, "IncrementalEncoder"):
    class IncrementalEncoder(codecs.IncrementalEncoder):
        """
        Incremental encoder for CSS: takes the encoding from the charset
        rule of the input (unless one is given), rewrites the rule where
        needed and delegates to the incremental encoder of that encoding.
        """

        def __init__(self, errors="strict", encoding=None):
            # ``self.encoder`` must exist before the base class constructor
            # runs, because that assigns ``self.errors`` and the property
            # setter below reads ``self.encoder``
            self.encoder = None
            self.encoding = encoding
            codecs.IncrementalEncoder.__init__(self, errors)
            # Store ``errors`` somewhere else,
            # because we have to hide it in a property
            self._errors = errors
            self.buffer = u""

        def iterencode(self, input):
            """
            Encode the unicode strings produced by the iterable ``input``
            and yield the non-empty encoded chunks.
            """
            for part in input:
                result = self.encode(part, False)
                if result:
                    yield result
            result = self.encode(u"", True)
            if result:
                yield result

        def encode(self, input, final=False):
            """
            Encode ``input`` and return the encoded string, which is empty
            as long as the charset rule is incomplete. Raises
            ``ValueError`` if the encoding is ``css``.
            """
            if self.encoder is None:
                input = self.buffer + input
                if self.encoding is not None:
                    # Replace encoding in the @charset rule with the specified one
                    encoding = self.encoding
                    if encoding.replace("_", "-").lower() == "utf-8-sig":
                        encoding = "utf-8"
                    newinput = _fixencoding(input, unicode(encoding), final)
                    if newinput is None:  # @charset rule incomplete => Retry next time
                        self.buffer = input
                        return ""
                    input = newinput
                else:
                    # Use encoding from the @charset declaration
                    self.encoding = detectencoding_unicode(input, final)[0]
                    if self.encoding is None and final:
                        # There is no next call that could flush the
                        # buffer, so fall back to the default encoding
                        self.encoding = "utf-8"
                if self.encoding is not None:
                    if self.encoding.lower() == "css":
                        # codec lookup is case-insensitive, so any spelling
                        # would resolve to this codec again
                        raise ValueError("css not allowed as encoding name")
                    info = codecs.lookup(self.encoding)
                    if self.encoding.replace("_", "-").lower() == "utf-8-sig":
                        input = _fixencoding(input, u"utf-8", True)
                    self.encoder = info.incrementalencoder(self._errors)
                    self.buffer = u""
                else:
                    self.buffer = input
                    return ""
            return self.encoder.encode(input, final)

        def reset(self):
            """Drop the wrapped encoder and the buffer."""
            codecs.IncrementalEncoder.reset(self)
            self.encoder = None
            self.buffer = u""

        def _geterrors(self):
            return self._errors

        def _seterrors(self, errors):
            # Setting ``errors`` must be done on the real encoder too
            if self.encoder is not None:
                self.encoder.errors = errors
            self._errors = errors
        errors = property(_geterrors, _seterrors)

        def getstate(self):
            """
            Return the state as a number: the marshalled tuple ``(encoding,
            buffer, hasencoder, encoderstate)`` converted to an ``int``.
            """
            if self.encoder is not None:
                state = (self.encoding, self.buffer, True, self.encoder.getstate())
            else:
                state = (self.encoding, self.buffer, False, None)
            return _bytes2int(marshal.dumps(state))

        def setstate(self, state):
            """Restore a state that has been returned by ``getstate()``."""
            # Reverse ``getstate()``: number -> marshalled bytes -> tuple
            state = marshal.loads(_int2bytes(state))
            self.encoding = state[0]
            self.buffer = state[1]
            if state[2]:  # flag is ``True``/``False``, never ``None``
                self.encoder = codecs.getincrementalencoder(self.encoding)(self._errors)
                # the state tuple has four items: the encoder state is the last
                self.encoder.setstate(state[3])
            else:
                self.encoder = None


class StreamWriter(codecs.StreamWriter):
    """
    Stream writer for CSS: takes the encoding from the charset rule of the
    data written (unless one is given), rewrites the rule where needed and
    delegates to the stream writer of that encoding.
    """

    def __init__(self, stream, errors="strict", encoding=None, header=False):
        # ``header`` is accepted but not used by this class.
        # Initialize our own attributes first: the base class constructor
        # assigns ``self.errors``, and where the ``errors`` property below
        # takes effect its setter reads ``self.streamwriter``
        self.streamwriter = None
        self.encoding = encoding
        self._errors = errors
        self.buffer = u""
        codecs.StreamWriter.__init__(self, stream, errors)

    def encode(self, input, errors='strict'):
        """
        Encode ``input`` and return ``(encoded_output, consumed)``. As long
        as the charset rule is incomplete the input is buffered and
        ``("", 0)`` is returned. Raises ``ValueError`` if the encoding is
        ``css``.
        """
        li = len(input)
        if self.streamwriter is None:
            input = self.buffer + input
            li = len(input)
            if self.encoding is not None:
                # Replace encoding in the @charset rule with the specified one
                encoding = self.encoding
                if encoding.replace("_", "-").lower() == "utf-8-sig":
                    encoding = "utf-8"
                newinput = _fixencoding(input, unicode(encoding), False)
                if newinput is None:  # @charset rule incomplete => Retry next time
                    self.buffer = input
                    return ("", 0)
                input = newinput
            else:
                # Use encoding from the @charset declaration
                self.encoding = detectencoding_unicode(input, False)[0]
            if self.encoding is not None:
                if self.encoding.lower() == "css":
                    # codec lookup is case-insensitive, so any spelling
                    # would resolve to this codec again
                    raise ValueError("css not allowed as encoding name")
                self.streamwriter = codecs.getwriter(self.encoding)(self.stream, self._errors)
                if self.encoding.replace("_", "-").lower() == "utf-8-sig":
                    input = _fixencoding(input, u"utf-8", True)
                self.buffer = u""
            else:
                self.buffer = input
                return ("", 0)
        return (self.streamwriter.encode(input, errors)[0], li)

    def _geterrors(self):
        return self._errors

    def _seterrors(self, errors):
        # Setting ``errors`` must be done on the streamwriter too
        if self.streamwriter is not None:
            self.streamwriter.errors = errors
        self._errors = errors
    errors = property(_geterrors, _seterrors)


class StreamReader(codecs.StreamReader):
    """
    Stream reader for CSS: detects the encoding from the first bytes read
    (unless one is forced), delegates to the stream reader of that encoding
    and rewrites the charset rule in the decoded output to name the
    encoding that was used.
    """

    def __init__(self, stream, errors="strict", encoding=None, force=True):
        # Initialize our own attributes first: the base class constructor
        # assigns ``self.errors``, and where the ``errors`` property below
        # takes effect its setter reads ``self.streamreader``
        self.streamreader = None
        self.encoding = encoding
        self.force = force
        self._errors = errors
        codecs.StreamReader.__init__(self, stream, errors)

    def decode(self, input, errors='strict'):
        """
        Decode ``input`` and return ``(unicode_output, consumed)``. As long
        as the encoding or the charset rule is undecided ``(u"", 0)`` is
        returned. Raises ``ValueError`` if the detected encoding is ``css``.
        """
        if self.streamreader is None:
            if self.encoding is None or not self.force:
                (encoding, explicit) = detectencoding_str(input, False)
                if encoding is None:  # no encoding determined yet
                    return (u"", 0)  # no encoding determined yet, so no output
                elif encoding.lower() == "css":
                    # codec lookup is case-insensitive, so any spelling
                    # would resolve to this codec again
                    raise ValueError("css not allowed as encoding name")
                if (explicit and not self.force) or self.encoding is None:  # Take the encoding from the input
                    self.encoding = encoding
            streamreader = codecs.getreader(self.encoding)
            streamreader = streamreader(self.stream, self._errors)
            (output, consumed) = streamreader.decode(input, errors)
            encoding = self.encoding
            if encoding.replace("_", "-").lower() == "utf-8-sig":
                encoding = "utf-8"
            newoutput = _fixencoding(output, unicode(encoding), False)
            if newoutput is not None:
                # Keep the reader only once the header has been fixed
                self.streamreader = streamreader
                return (newoutput, consumed)
            return (u"", 0)  # we will create a new streamreader on the next call
        return self.streamreader.decode(input, errors)

    def _geterrors(self):
        return self._errors

    def _seterrors(self, errors):
        # Setting ``errors`` must be done on the streamreader too
        if self.streamreader is not None:
            self.streamreader.errors = errors
        self._errors = errors
    errors = property(_geterrors, _seterrors)


if hasattr(codecs, "CodecInfo"):
    # We're running on Python 2.5 or better
    def search_function(name):
        """
        Codec search function: return the ``CodecInfo`` of the CSS codec
        for the name ``css`` and ``None`` for every other name.
        """
        if name != "css":
            return None
        return codecs.CodecInfo(
            name="css",
            encode=encode,
            decode=decode,
            incrementalencoder=IncrementalEncoder,
            incrementaldecoder=IncrementalDecoder,
            streamwriter=StreamWriter,
            streamreader=StreamReader,
        )
else:
    # If we're running on Python 2.4, define the utf-8-sig codec here
    def utf8sig_encode(input, errors='strict'):
        """Encode ``input`` as UTF-8 with the BOM in front."""
        encoded = codecs.utf_8_encode(input, errors)[0]
        return (codecs.BOM_UTF8 + encoded, len(input))

    def utf8sig_decode(input, errors='strict'):
        """Decode UTF-8 ``input``, skipping a BOM at the start."""
        skipped = 0
        if input[:3] == codecs.BOM_UTF8:
            skipped = 3
            input = input[skipped:]
        (output, consumed) = codecs.utf_8_decode(input, errors, True)
        # the skipped BOM counts as consumed input
        return (output, consumed + skipped)

    class UTF8SigStreamWriter(codecs.StreamWriter):
        """Stream writer that puts the BOM in front of its first output."""

        def reset(self):
            codecs.StreamWriter.reset(self)
            # Remove the instance attribute set in ``encode()``, so that
            # the method of the class is used again
            try:
                del self.encode
            except AttributeError:
                pass

        def encode(self, input, errors='strict'):
            # All later calls go to the plain UTF-8 encoder
            self.encode = codecs.utf_8_encode
            return utf8sig_encode(input, errors)

    class UTF8SigStreamReader(codecs.StreamReader):
        """Stream reader that skips a BOM at the start of its input."""

        def reset(self):
            codecs.StreamReader.reset(self)
            # Remove the instance attribute set in ``decode()``, so that
            # the method of the class is used again
            try:
                del self.decode
            except AttributeError:
                pass

        def decode(self, input, errors='strict'):
            if len(input) < 3 and codecs.BOM_UTF8.startswith(input):
                # not enough data to decide if this is a BOM
                # => try again on the next call
                return (u"", 0)
            # All later calls go to the plain UTF-8 decoder
            self.decode = codecs.utf_8_decode
            return utf8sig_decode(input, errors)

    def search_function(name):
        """
        Codec search function: return the codec tuple for ``css`` or
        ``utf_8_sig`` and ``None`` for every other name.
        """
        import encodings
        normalized = encodings.normalize_encoding(name)
        if normalized == "css":
            return (encode, decode, StreamReader, StreamWriter)
        if normalized == "utf_8_sig":
            return (utf8sig_encode, utf8sig_decode, UTF8SigStreamReader, UTF8SigStreamWriter)
        return None


codecs.register(search_function)


# Error handler for CSS escaping

def cssescape(exc):
    """
    Codec error handler: replace every unencodable character with a CSS
    escape (a backslash followed by the six digit hexadecimal code point).

    Returns the replacement and the position where encoding continues.
    Raises ``TypeError`` for anything but a ``UnicodeEncodeError``.
    """
    if not isinstance(exc, UnicodeEncodeError):
        raise TypeError("don't know how to handle %r" % exc)
    unencodable = exc.object[exc.start:exc.end]
    escapes = [u"\\%06x" % ord(char) for char in unencodable]
    return (u"".join(escapes), exc.end)


codecs.register_error("cssescape", cssescape)
