blob: 5187a718f1a551d848a55e3c6f361f492f6329c1 (
plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
|
from enum import Enum
import io
from typing import Union
import pdb
class State(Enum):
    """Modes the Lexer's character scanner can be in.

    Member order and numeric values are part of the enum's identity and
    are kept exactly as declared.
    """

    # Inside a quoted span; spaces do not end the token here.
    QUOTE = 1
    # The previous character was a backslash; the next character is taken
    # literally and the scanner then returns to the state it came from.
    ESCAPE = 2
    # Plain, unquoted text; a space ends the current token.
    TEXT = 3
class Lexer:
    """Pull space-delimited tokens out of a string or text stream.

    Only the space character separates tokens.  A span enclosed in single
    or double quotes may contain spaces, and a backslash makes the next
    character literal.  Quote characters and backslashes are kept verbatim
    in the returned token; nothing is unescaped or stripped.

    The lexer is its own iterator: iterating yields one token per step and
    stops once the input is exhausted.
    """

    def __init__(self, text: Union[str, io.StringIO]):
        """Wrap *text* for tokenizing.

        Args:
            text: The input.  A plain string is wrapped in ``io.StringIO``;
                any other object is used as-is and is read one character
                at a time through its ``read(1)`` method.
        """
        # The next three attributes are not used by the tokenizer itself;
        # they are retained so existing attribute access keeps working.
        self._tokens = []
        self._count = 0
        self._parsed = False

        self._state = State.TEXT    # current scanner state
        self._states = []           # states to return to after QUOTE/ESCAPE
        self._text_pos = 0          # number of characters consumed so far
        self._quote_start_pos = 0   # 1-based position of the last opening quote
        self._token = ''            # most recently scanned token

        if isinstance(text, str):
            self.text = io.StringIO(text)
        else:
            self.text = text

    def __iter__(self) -> "Lexer":
        """Return the lexer itself; it implements ``__next__``."""
        return self

    def __next__(self) -> str:
        """Return the next token, raising StopIteration at end of input.

        Raises:
            ValueError: Propagated from ``parse`` on an unterminated quote.
        """
        token = self.get_token()
        # An empty token is only produced once the input has run out.
        if token == '':
            raise StopIteration
        return token

    def get_token(self) -> str:
        """Return the next token, or ``''`` when the input is exhausted.

        Raises:
            ValueError: Propagated from ``parse`` on an unterminated quote.
        """
        return self.parse()

    def parse(self) -> str:
        """Scan and return the next token from the input.

        Leading spaces are skipped.  Scanning stops at the first space seen
        outside quotes after at least one character was collected, or at
        end of input.  The separating space is consumed.

        Returns:
            The token text, or ``''`` if only spaces (or nothing) remained.

        Raises:
            ValueError: If the input ends while a quote is still open,
                including when it ends right after a backslash inside the
                quote.
            RuntimeError: If the scanner state is not a known ``State``
                (only possible if ``_state`` was corrupted).
        """
        pieces = []   # characters of the token, joined once at the end
        quote = ''    # the quote character that opened the current span

        # Every token starts from a clean slate so that leftovers from a
        # previous, aborted scan cannot leak into this one.
        self._state = State.TEXT
        self._states = []

        while True:
            ch = self.text.read(1)
            if not ch:
                # End of input: whatever was collected is the token.
                break
            # Count only characters that were actually read.
            self._text_pos += 1

            if self._state == State.QUOTE:
                pieces.append(ch)
                if ch == '\\':
                    self._states.append(State.QUOTE)
                    self._state = State.ESCAPE
                elif ch == quote:
                    # Closing quote: return to the state we came from.
                    self._state = self._states.pop()
            elif self._state == State.ESCAPE:
                # The escaped character is taken literally.
                pieces.append(ch)
                self._state = self._states.pop()
            elif self._state == State.TEXT:
                if ch == ' ':
                    if pieces:
                        break
                    # Nothing collected yet: skip leading spaces.
                    continue
                pieces.append(ch)
                if ch == '"' or ch == "'":
                    quote = ch
                    self._quote_start_pos = self._text_pos
                    self._states.append(State.TEXT)
                    self._state = State.QUOTE
                elif ch == '\\':
                    # TODO: Does it make sense to go to State.ESCAPE from
                    # State.TEXT?
                    self._states.append(State.TEXT)
                    self._state = State.ESCAPE
            else:
                raise RuntimeError(
                    "Lexer is in an unknown state: %r" % (self._state,))

        self._token = ''.join(pieces)

        # A quote is still open if we are inside it, or inside an escape
        # that was entered from it (QUOTE is then pending on the stack).
        if self._state == State.QUOTE or State.QUOTE in self._states:
            raise ValueError(
                "No closing quotation for quote in position %d"
                % self._quote_start_pos)
        return self._token
if __name__ == '__main__':
    # Ad-hoc demonstration: tokenize a handful of sample inputs and print
    # the resulting token list together with its length.
    cases = [
        r'abc',
        r'Hello World',
        r'"Hello \" World"',
        r"'Hello \' World'",
        r'"\""',
        r'abc "def\" \x bla \z \\ \e \ " xpto',
        r'',
        r' ',
        r' ',
        r' ',
        r' ',
        r'Hello World ',
    ]

    for sample in cases:
        found = list(Lexer(sample))
        # Singular wording for exactly one token, plural otherwise.
        template = '%s = %d token' if len(found) == 1 else '%s = %d tokens'
        print(template % (str(found), len(found)))
|