from libxyz.parser import SourceData
from libxyz.exceptions import LexerError


class Lexer(object):
    """
    Lexical analyzer

    Lexer rules:
    ------------
    * Whitespace characters are ignored, except inside quotes.
    * A quote can be single-line: "quoted value", or multi-line:
      '''quoted value1,
      quoted value2,
      '''
    * A newline character terminates a comment, if one is open.
    * Values can be provided as plain literals or quoted ones.
    * If a value contains spaces or other non-alphanumeric characters,
      it should be quoted or escaped using the escape character.
    * A variable can take a list of values separated by commas.
    * Escaping can only be used in the rvalue position.

    Macros:
    -------
    Macros are special internal variables that are expanded during parsing.
    A macro definition is similar to a variable definition, but the macro
    character (default '&') is prepended to the variable name:
    &macro = value
    var = &macro
    """
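
    # For reference, a hypothetical input fragment matching the rules
    # above (names and values are made up for illustration):
    #
    #   &colors = red, green, blue
    #   path = "/some/path with spaces"
    #   banner = '''line one,
    #   line two,
    #   '''
    #   var = &colors     # expands to: red, green, blue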

    # Token type identifiers
    TOKEN_IDT = 0
    TOKEN_MACRO = 1

    def __init__(self, source, tokens, comment=u"#", macro=u"&"):
        """
        @param source: Parsing source. If a file object is passed, it must
        be closed by the caller after parsing completes.
        @type source: string, file-like object or SourceData object

        @param tokens: List of tokens
        @type tokens: sequence

        @param comment: Comment character
        @param macro: Macro character
        """

        if isinstance(source, SourceData):
            self.sdata = source
        else:
            self.sdata = SourceData(source)

        self.tokens = tokens
        self.comment = comment
        self.macro = macro

        self._escapechar = u"\\"
        # Multi-line (extended) quote character and the number of times
        # it must be repeated to open or close an xquote
        self._xqchar = u"'"
        self._xqcount = 3
        # Consecutive xquote chars seen so far
        self._xqtotal = 0
        # Chars to re-read without xquote interpretation after an unget
        self._skip_next = 0

        self._done = False

        self._can_escape = False
        self._escaped = False
        self._in_quote = False
        self._in_xquote = False
        self._in_comment = False

        # Current token buffer
        self._idt = []

    def lexer(self):
        """
        Scan input for lexemes and return them to the parser

        @return: tuple (token_type, token_value), or None when the input
        is exhausted
        """

        def _token_type(tok):
            """
            Determine token type
            """

            _type = self.TOKEN_IDT
            _tok = tok

            # A leading macro char marks a macro token,
            # e.g. u"&name" -> (TOKEN_MACRO, u"name")
            if tok and self.macro and tok[0] == self.macro:
                _type = self.TOKEN_MACRO
                _tok = tok[1:]

            return (_type, _tok)

        _quoted = False

        for char in self.sdata:
            if self._done:
                self.unget(char)
                return None

            # Inside a comment, skip everything up to the end of line
            if self._in_comment and char != u"\n":
                continue

            if self._skip_next == 0:
                if 0 < self._xqtotal < self._xqcount:
                    if char != self._xqchar:
                        # Incomplete xquote run: push the run and the
                        # current char back to be re-read as plain chars
                        _back_tk = u"%s%s" % (self._xqchar * self._xqtotal,
                                              char)
                        self.unget(_back_tk)
                        self._skip_next = len(_back_tk)
                        self._xqtotal = 0
                        continue

                if char == self._xqchar:
                    self._xqtotal += 1

                    if self._xqtotal == self._xqcount:
                        # A complete run toggles multi-line quote mode
                        if self._in_xquote:
                            self._in_xquote = False
                        else:
                            self._in_xquote = True
                            _quoted = True

                        self._xqtotal = 0

                    continue
            else:
                self._skip_next -= 1

            if self._in_xquote:
                self._idt.append(char)
                continue

            if self._can_escape:
                # An escaped char is taken literally
                if self._escaped:
                    self._idt.append(char)
                    self._escaped = False
                    continue

                if char == self._escapechar:
                    self._escaped = True
                    continue

            if char == u"\n":
                if self._in_quote:
                    raise LexerError(_(u"Unterminated quote"))

                # A newline always terminates a comment
                self._in_comment = False

                _token = None

                if self._idt or _quoted:
                    _token = u"".join(self._idt)
                    self._idt = []
                    _quoted = False

                if char in self.tokens:
                    if _token is not None:
                        self.unget(char)
                    else:
                        _token = char

                if _token is not None:
                    return _token_type(_token)
                else:
                    continue

            # A double quote toggles single-line quote mode
            if char == u'"':
                if self._in_quote:
                    self._in_quote = False
                else:
                    self._in_quote = True
                    _quoted = True

                continue

            if self._in_quote:
                self._idt.append(char)
                continue

            if char in self.tokens or char.isspace():
                _token = None

                if self._idt or _quoted:
                    _token = u"".join(self._idt)
                    self._idt = []
                    _quoted = False

                # A token char is itself a lexeme; whitespace is only
                # a separator
                if not char.isspace():
                    if _token is not None:
                        self.unget(char)
                    else:
                        _token = char

                if _token is not None:
                    return _token_type(_token)
                else:
                    continue

            if char == self.comment and not self._in_xquote:
                self._in_comment = True
                continue

            self._idt.append(char)

        # Input exhausted: flush whatever is left in the buffer
        if self._idt:
            _token = u"".join(self._idt)
            self._idt = []
            return _token_type(_token)

    def get_idt(self):
        """
        Return the current state of the token buffer
        """

        return self._idt

    def done(self):
        """
        Order the lexer to stop processing
        """

        self._done = True

    def unget(self, token):
        """
        Put a read token back into the input stream
        """

        self.sdata.unget(token)

    def escaping_on(self):
        """
        Enable escaping
        """

        self._can_escape = True

    def escaping_off(self):
        """
        Disable escaping
        """

        self._can_escape = False
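
# A minimal usage sketch: feed the lexer a small fragment and read
# tokens until it returns None.  The sample input and token characters
# here are illustrative; per __init__'s docstring, SourceData also
# accepts a plain string source.
if __name__ == "__main__":
    _lexer = Lexer(u"var = value1, value2 # comment\n",
                   tokens=(u"=", u",", u"\n"))

    # Escaping only applies in rvalue position, so a real parser would
    # toggle it via escaping_on()/escaping_off(); here it stays enabled.
    _lexer.escaping_on()

    _result = _lexer.lexer()

    while _result is not None:
        print(_result)
        _result = _lexer.lexer()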