Source Code for Module translate.misc.sparse

#!/usr/bin/env python
# -*- coding: utf-8 -*-

"""simple parser / string tokenizer

rather than returning a list of token types etc., we simply return a list of tokens;
each tokenizing function takes a string as input and returns a list of tokens
"""

# Copyright 2002, 2003 St James Software
#
# This file is part of translate.
#
# translate is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# translate is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with translate; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA


def stringeval(text):
    """takes away repeated quotes (escapes) and returns the string represented by the text"""
    stringchar = text[0]
    if text[-1] != stringchar or stringchar not in ("'", '"'):
        # the text is not a validly quoted string, so we cannot unescape it
        raise ValueError("error parsing escaped string: %r" % text)
    return text[1:-1].replace(stringchar + stringchar, stringchar)


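# A minimal usage sketch (illustrative; the _demo_* helpers are not part of the
# original module): stringeval strips the enclosing quotes and collapses the
# doubled-quote escapes back into single quote characters.
def _demo_stringeval():
    assert stringeval("'it''s'") == "it's"
    assert stringeval('"say ""hi"""') == 'say "hi"'

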
def stringquote(text):
    """escapes quotes as necessary and returns a string representing the text"""
    if "'" in text:
        if '"' in text:
            return '"' + text.replace('"', '""') + '"'
        else:
            return '"' + text + '"'
    else:
        return "'" + text + "'"


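# A minimal sketch (illustrative, not part of the original module) showing that
# stringquote and stringeval round-trip: quoting a text and evaluating the
# result returns the original text.
def _demo_stringquote_roundtrip():
    for text in ("plain", "it's", 'say "hi"', 'both \' and "'):
        assert stringeval(stringquote(text)) == text

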
class ParserError(ValueError):
    """Intelligent parser error"""

    def __init__(self, parser, message, tokennum):
        """takes a message and the number of the token that caused the error"""
        tokenpos = parser.findtokenpos(tokennum)
        line, charpos = parser.getlinepos(tokenpos)
        ValueError.__init__(self, "%s at line %d, char %d (token %r)"
                            % (message, line, charpos, parser.tokens[tokennum]))
        self.parser = parser
        self.tokennum = tokennum


class SimpleParser:
    """this is a simple parser"""

    def __init__(self, defaulttokenlist=None, whitespacechars=" \t\r\n", includewhitespacetokens=0):
        if defaulttokenlist is None:
            self.defaulttokenlist = ['<=', '>=', '==', '!=', '+=', '-=', '*=', '/=', '<>']
            self.defaulttokenlist.extend('(),[]:=+-')
        else:
            self.defaulttokenlist = defaulttokenlist
        self.whitespacechars = whitespacechars
        self.includewhitespacetokens = includewhitespacetokens
        self.standardtokenizers = [self.stringtokenize, self.removewhitespace, self.separatetokens]
        self.quotechars = ('"', "'")
        self.endquotechars = {'"': '"', "'": "'"}
        self.stringescaping = 1

    def stringtokenize(self, text):
        """makes quoted strings in the text into tokens..."""
        tokens = []
        laststart = 0
        instring = 0
        endstringchar, escapechar = '', '\\'
        gotclose, gotescape = 0, 0
        for pos in range(len(text)):
            char = text[pos]
            if instring:
                if self.stringescaping and (gotescape or char == escapechar) and not gotclose:
                    gotescape = not gotescape
                elif char == endstringchar:
                    # a second end-quote straight after toggles this back off,
                    # so a doubled quote stays inside the string
                    gotclose = not gotclose
                elif gotclose:
                    # the string ended at the previous character: emit it as one token
                    tokens.append(text[laststart:pos])
                    instring, laststart, endstringchar = 0, pos, ''
            if not instring:
                if char in self.quotechars:
                    if pos > laststart:
                        tokens.append(text[laststart:pos])
                    instring, laststart, endstringchar, gotclose = 1, pos, self.endquotechars[char], 0
        if laststart < len(text):
            tokens.append(text[laststart:])
        return tokens

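    # Illustrative behaviour (worked by hand from the loop above, not original
    # documentation): the quoted string survives as a single token, with the
    # doubled quotes left in place for stringeval to collapse later:
    #   SimpleParser().stringtokenize('x = "a ""b"" c" + y')
    #   returns ['x = ', '"a ""b"" c"', ' + y']
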
    def keeptogether(self, text):
        """checks whether a token should be kept together"""
        return self.isstringtoken(text)

    def isstringtoken(self, text):
        """checks whether a token is a string token"""
        return text[:1] in self.quotechars

    def separatetokens(self, text, tokenlist=None):
        """this separates out tokens in tokenlist from whitespace etc."""
        if self.keeptogether(text):
            return [text]
        if tokenlist is None:
            tokenlist = self.defaulttokenlist
        # loop through and put tokens into a list
        tokens = []
        pos = 0
        laststart = 0
        lentext = len(text)
        while pos < lentext:
            foundtoken = 0
            for token in tokenlist:
                lentoken = len(token)
                if text[pos:pos+lentoken] == token:
                    if laststart < pos:
                        tokens.append(text[laststart:pos])
                    tokens.append(token)
                    pos += lentoken
                    foundtoken, laststart = 1, pos
                    break
            if not foundtoken:
                pos += 1
        if laststart < lentext:
            tokens.append(text[laststart:])
        return tokens

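    # Illustrative behaviour (worked by hand, not original documentation): the
    # multi-character operators come first in defaulttokenlist, so '==' is
    # matched before the single '=' can be:
    #   SimpleParser().separatetokens('a==b') returns ['a', '==', 'b'],
    #   not ['a', '=', '=', 'b']
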
    def removewhitespace(self, text):
        """this removes whitespace, using it to split the text into separate tokens"""
        if self.keeptogether(text):
            return [text]
        # loop through and put tokens into a list
        tokens = []
        inwhitespace = 0
        laststart = 0
        for pos in range(len(text)):
            char = text[pos]
            if inwhitespace:
                if char not in self.whitespacechars:
                    if laststart < pos and self.includewhitespacetokens:
                        tokens.append(text[laststart:pos])
                    inwhitespace, laststart = 0, pos
            else:
                if char in self.whitespacechars:
                    if laststart < pos:
                        tokens.append(text[laststart:pos])
                    inwhitespace, laststart = 1, pos
        if laststart < len(text) and (not inwhitespace or self.includewhitespacetokens):
            tokens.append(text[laststart:])
        return tokens

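    # Illustrative behaviour (worked by hand, not original documentation):
    #   SimpleParser().removewhitespace('a  b') returns ['a', 'b']
    # With includewhitespacetokens=1 the run of spaces comes back as its own
    # token: ['a', '  ', 'b'].
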
    def applytokenizer(self, inputlist, tokenizer):
        """apply a tokenizer to a set of text, flattening the result"""
        tokenizedlists = [tokenizer(text) for text in inputlist]
        joined = []
        # an explicit loop rather than map(joined.extend, ...), which is never
        # evaluated under Python 3 because map is lazy there
        for tokenizedlist in tokenizedlists:
            joined.extend(tokenizedlist)
        return joined

    def applytokenizers(self, inputlist, tokenizers):
        """apply a set of tokenizers to a set of text, flattening each time"""
        for tokenizer in tokenizers:
            inputlist = self.applytokenizer(inputlist, tokenizer)
        return inputlist

    def tokenize(self, source, tokenizers=None):
        """tokenize the text string with the standard tokenizers"""
        self.source = source
        if tokenizers is None:
            tokenizers = self.standardtokenizers
        self.tokens = self.applytokenizers([self.source], tokenizers)
        return self.tokens

    def findtokenpos(self, tokennum):
        """finds the position of the given token in the text"""
        currenttokenpos = 0
        for currenttokennum in range(tokennum+1):
            currenttokenpos = self.source.find(self.tokens[currenttokennum], currenttokenpos)
        return currenttokenpos

    def getlinepos(self, tokenpos):
        """finds the line and character position of the given character"""
        sourcecut = self.source[:tokenpos]
        line = sourcecut.count("\n") + 1
        charpos = tokenpos - sourcecut.rfind("\n")
        return line, charpos

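    # Illustrative behaviour (worked by hand): both values are 1-based, so for
    # source "ab\ncd" the character at position 4 (the 'd') is reported as
    # line 2, char 2.
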
    def raiseerror(self, message, tokennum):
        """raises a ParserError"""
        raise ParserError(self, message, tokennum)


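# A minimal end-to-end sketch (illustrative; _demo_simpleparser is not part of
# the original module): tokenize a two-line snippet and show how ParserError
# reports the position of an offending token.
def _demo_simpleparser():
    parser = SimpleParser()
    tokens = parser.tokenize('total = price * 2\nname = "it\'s"')
    assert tokens == ['total', '=', 'price', '*', '2', 'name', '=', '"it\'s"']
    try:
        parser.raiseerror("unexpected token", 3)
    except ParserError as e:
        # token 3 is '*', which sits at line 1, char 15 of the source
        assert "line 1, char 15" in str(e)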