Package cssutils :: Module tokenize2

Source Code for Module cssutils.tokenize2

#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""New CSS Tokenizer (a generator)
"""
__all__ = ['Tokenizer', 'CSSProductions']
__docformat__ = 'restructuredtext'
__version__ = '$Id: tokenize2.py 1420 2008-08-09 19:28:34Z cthedot $'

import re
from helper import normalize
from cssproductions import *

class Tokenizer(object):
    """
    generates a list of Token tuples:
        (Tokenname, value, startline, startcolumn)
    """
    _atkeywords = {
        u'@font-face': CSSProductions.FONT_FACE_SYM,
        u'@import': CSSProductions.IMPORT_SYM,
        u'@media': CSSProductions.MEDIA_SYM,
        u'@namespace': CSSProductions.NAMESPACE_SYM,
        u'@page': CSSProductions.PAGE_SYM
        }
    _linesep = u'\n'

    def __init__(self, macros=None, productions=None):
        """
        inits tokenizer with given macros and productions which default to
        cssutils own macros and productions
        """
        if not macros:
            macros = MACROS
        if not productions:
            productions = PRODUCTIONS
        self.tokenmatches = self._compile_productions(
            self._expand_macros(macros, productions))
        self.commentmatcher = [x[1] for x in self.tokenmatches if x[0] == 'COMMENT'][0]
        self.urimatcher = [x[1] for x in self.tokenmatches if x[0] == 'URI'][0]
        self.unicodesub = re.compile(r'\\[0-9a-fA-F]{1,6}(?:\r\n|[\t|\r|\n|\f|\x20])?').sub

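    # Editor's note (illustration, not in the original source): ``unicodesub``
    # is the bound ``sub`` method of a pattern matching one CSS unicode
    # escape: a backslash, 1 to 6 hex digits and at most one terminating
    # whitespace character. For example ``self.unicodesub(_repl, u'\\26 B')``,
    # with the ``_repl`` defined in ``tokenize`` below, yields u'&B'.
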
    def _expand_macros(self, macros, productions):
        """returns macro expanded productions, order of productions is kept"""
        def macro_value(m):
            return '(?:%s)' % macros[m.groupdict()['macro']]
        expanded = []
        for key, value in productions:
            while re.search(r'{[a-zA-Z][a-zA-Z0-9-]*}', value):
                value = re.sub(r'{(?P<macro>[a-zA-Z][a-zA-Z0-9-]*)}',
                               macro_value, value)
            expanded.append((key, value))
        return expanded

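    # Editor's illustration (not part of the original module): given
    # hypothetical macros {'h': r'[0-9a-f]', 'unicode': r'\\{h}{1,6}'},
    # a production value r'{unicode}' expands in two passes to
    # r'(?:\\(?:[0-9a-f]){1,6})'; the ``while`` above reruns the
    # substitution until no {macro} reference is left, so macros may
    # refer to other macros.
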
    def _compile_productions(self, expanded_productions):
        """compile productions into callable match objects, order is kept"""
        compiled = []
        for key, value in expanded_productions:
            compiled.append((key, re.compile('^(?:%s)' % value, re.U).match))
        return compiled

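    # Editor's note (not in the original source): wrapping each production
    # in '^(?:...)' anchors it at the start of the input, so every stored
    # ``match`` callable can only ever consume a prefix of the remaining
    # text, which is exactly what ``tokenize`` below relies on when it
    # slices ``text`` by ``len(found)``.
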
    def tokenize(self, text, fullsheet=False):
        """Generator: Tokenize text and yield tokens, each token is a tuple
        of::

            (name, value, line, col)

        The token value will contain a normal string, meaning CSS unicode
        escapes have been resolved to normal characters. The serializer
        escapes needed characters back to unicode escapes depending on
        the stylesheet target encoding.

        text
            to be tokenized
        fullsheet
            if ``True`` appends EOF token as last one and completes incomplete
            COMMENT or INVALID (to STRING) tokens
        """
        def _repl(m):
            "used by unicodesub"
            num = int(m.group(0)[1:], 16)
            if num < 0x10000:
                return unichr(num)
            else:
                return m.group(0)

        def _normalize(value):
            "normalize and do unicodesub"
            return normalize(self.unicodesub(_repl, value))

        line = col = 1

        # check for BOM first as it should only be max one at the start
        (BOM, matcher), productions = self.tokenmatches[0], self.tokenmatches[1:]
        match = matcher(text)
        if match:
            found = match.group(0)
            yield (BOM, found, line, col)
            text = text[len(found):]

        # check for @charset which is valid only at start of CSS
        if text.startswith('@charset '):
            found = '@charset '  # production has trailing S!
            yield (CSSProductions.CHARSET_SYM, found, line, col)
            text = text[len(found):]
            col += len(found)

        while text:
            # speed test for most used CHARs
            c = text[0]
            if c in '{}:;,':
                yield ('CHAR', c, line, col)
                col += 1
                text = text[1:]

            else:
                # check all other productions, at least CHAR must match
                for name, matcher in productions:
                    if fullsheet and name == 'CHAR' and text.startswith(u'/*'):
                        # before CHAR production test for incomplete comment
                        possiblecomment = u'%s*/' % text
                        match = self.commentmatcher(possiblecomment)
                        if match:
                            yield ('COMMENT', possiblecomment, line, col)
                            text = None  # eats all remaining text
                            break

                    match = matcher(text)  # if no match try next production
                    if match:
                        found = match.group(0)  # needed later for line/col
                        if fullsheet:
                            # check if found may be completed into a full token
                            if 'INVALID' == name and text == found:
                                # complete INVALID to STRING with start char " or '
                                name, found = 'STRING', '%s%s' % (found, found[0])

                            elif 'FUNCTION' == name and\
                                 u'url(' == _normalize(found):
                                # FUNCTION url( is fixed to URI if fullsheet
                                # FUNCTION production MUST BE after URI production!
                                for end in (u"')", u'")', u')'):
                                    possibleuri = '%s%s' % (text, end)
                                    match = self.urimatcher(possibleuri)
                                    if match:
                                        name, found = 'URI', match.group(0)
                                        break

                        if name in ('DIMENSION', 'IDENT', 'STRING', 'URI',
                                    'HASH', 'COMMENT', 'FUNCTION', 'INVALID'):
                            # may contain unicode escape, replace with normal char
                            # but do not _normalize (?)
                            value = self.unicodesub(_repl, found)

                        else:
                            if 'ATKEYWORD' == name:
                                # get actual ATKEYWORD SYM
                                if '@charset' == found and ' ' == text[len(found):len(found)+1]:
                                    # only this syntax!
                                    name = CSSProductions.CHARSET_SYM
                                    found += ' '
                                else:
                                    name = self._atkeywords.get(_normalize(found), 'ATKEYWORD')

                            value = found  # should not contain unicode escape (?)

                        yield (name, value, line, col)
                        text = text[len(found):]
                        nls = found.count(self._linesep)
                        line += nls
                        if nls:
                            col = len(found[found.rfind(self._linesep):])
                        else:
                            col += len(found)
                        break

        if fullsheet:
            yield ('EOF', u'', line, col)
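
A minimal usage sketch (editor's addition, not part of the generated page;
it assumes the Python 2 era cssutils package is importable):

    from cssutils.tokenize2 import Tokenizer

    tokenizer = Tokenizer()
    # each token is a (name, value, line, col) tuple
    for name, value, line, col in tokenizer.tokenize(u'a { color: red }'):
        print name, repr(value), line, col

    # with fullsheet=True an unterminated comment is completed to a full
    # COMMENT token and a final ('EOF', u'', line, col) token is appended
    for token in tokenizer.tokenize(u'/* unclosed', fullsheet=True):
        print token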