Skip to content

Commit 5edbade

Browse files
committed
modure: Import needed files from re1.5 v0.5.
https://github.com/pfalcon/re1.5
1 parent c71e045 commit 5edbade

File tree

4 files changed

+472
-0
lines changed

4 files changed

+472
-0
lines changed

extmod/re1.5/compilecode.c

Lines changed: 213 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,213 @@
1+
// Copyright 2014 Paul Sokolovsky.
2+
// Use of this source code is governed by a BSD-style
3+
// license that can be found in the LICENSE file.
4+
5+
#include "regexp.h"
6+
7+
// Open a gap of `num` bytes at offset `at` in the bytecode buffer `code`.
// The bytes currently in [at, *pc) are shifted up by `num` and the running
// write position `*pc` is advanced by the same amount. The caller is expected
// to fill the gap afterwards and to have reserved room for the extra bytes.
static void insert_code(char *code, int at, int num, int *pc)
{
    char *gap = code + at;
    int tail_len = *pc - at;

    // Source and destination overlap, so memmove (not memcpy) is required.
    memmove(gap + num, gap, tail_len);
    *pc = *pc + num;
}
12+
13+
// Operand for a 2-byte branch instruction located at byte offset `at` that
// should transfer control to byte offset `to`. The offset is measured from
// the end of the branch instruction, hence the "- 2". Arguments are
// parenthesized so that expression arguments (e.g. `pc + 1`) expand correctly.
#define REL(at, to) ((to) - (at) - 2)
14+
15+
int re1_5_sizecode(const char *re)
16+
{
17+
int pc = 5 + NON_ANCHORED_PREFIX; // Save 0, Save 1, Match; more bytes for "search" (vs "match") prefix code
18+
19+
for (; *re; re++) {
20+
switch (*re) {
21+
case '\\':
22+
re++;
23+
default:
24+
pc += 2;
25+
break;
26+
case '+':
27+
// Skip entire "+?"
28+
if (re[1] == '?')
29+
re++;
30+
case '?':
31+
pc += 2;
32+
break;
33+
case '.':
34+
case '^':
35+
case '$':
36+
pc++;
37+
break;
38+
case '*':
39+
// Skip entire "*?"
40+
if (re[1] == '?')
41+
re++;
42+
case '|':
43+
case '(':
44+
pc += 4;
45+
break;
46+
case ')':
47+
break;
48+
}
49+
}
50+
51+
return pc;
52+
}
53+
54+
// Store `byte` at offset `at` of the bytecode buffer. Relies on a variable
// named `code` being in scope at the point of use. The expansion is fully
// parenthesized so it behaves as a single expression wherever it is used.
#define EMIT(at, byte) (code[at] = (byte))
55+
56+
// Compile one nesting level of the pattern `re` into `prog`, appending
// bytecode at prog->bytelen. Compilation of this level stops at the first
// unescaped ')' or at the terminating NUL; groups recurse into this function.
//
// Bookkeeping:
//   start     - offset where this level's code begins (target for '|' inserts)
//   term      - offset of the most recent term, i.e. what a following
//               '?', '*' or '+' applies to
//   alt_label - offset of the operand of the pending Jmp emitted by the last
//               '|' (0 when there is none); patched once the end of the
//               alternative is known
//
// prog->bytelen, prog->len and prog->sub are updated as code is emitted.
// Branch operands are stored in a single byte of the instruction stream.
//
// Returns a pointer to the character that ended this level (')' or NUL).
const char *_compilecode(const char *re, ByteProg *prog)
{
    char *code = prog->insts;
    int pc = prog->bytelen;
    int start = pc;
    int term = pc;
    int alt_label = 0;

    for (; *re && *re != ')'; re++) {
        switch (*re) {
        case '\\':
            // Escape: take the next character literally. A backslash that is
            // the last character of the pattern has nothing to escape; emit
            // it as a literal and do not advance onto the NUL, otherwise the
            // loop increment would step past the end of the string.
            if (re[1] != '\0') {
                re++;
            }
            // fallthrough
        default:
            term = pc;
            EMIT(pc++, Char);
            EMIT(pc++, *re);
            prog->len++;
            break;
        case '.':
            term = pc;
            EMIT(pc++, Any);
            prog->len++;
            break;
        case '(': {
            // Keep this group's own number: nested groups increment
            // prog->sub during the recursive call, so prog->sub can no longer
            // be used to derive the closing Save slot afterwards.
            int sub = ++prog->sub;

            term = pc;

            EMIT(pc++, Save);
            EMIT(pc++, 2 * sub);
            prog->len++;

            prog->bytelen = pc;
            re = _compilecode(re + 1, prog);
            pc = prog->bytelen;

            EMIT(pc++, Save);
            EMIT(pc++, 2 * sub + 1);
            prog->len++;

            if (*re == '\0') {
                // Unterminated group: the pattern is exhausted, so leave the
                // loop now rather than letting re++ run past the NUL.
                goto done;
            }
            break;
        }
        case '?':
            // Split placed in front of the term; its branch skips the term.
            insert_code(code, term, 2, &pc);
            EMIT(term, Split);
            EMIT(term + 1, REL(term, pc));
            prog->len++;
            break;
        case '*':
            // Split/RSplit in front of the term, Jmp back to it after the term.
            insert_code(code, term, 2, &pc);
            EMIT(pc, Jmp);
            EMIT(pc + 1, REL(pc, term));
            pc += 2;
            if (re[1] == '?') {
                // "*?" (non-greedy)
                EMIT(term, RSplit);
                re++;
            } else {
                EMIT(term, Split);
            }
            EMIT(term + 1, REL(term, pc));
            prog->len += 2;
            break;
        case '+':
            // Branch back to the term is appended after it.
            if (re[1] == '?') {
                // "+?" (non-greedy)
                EMIT(pc, Split);
                re++;
            } else {
                EMIT(pc, RSplit);
            }
            EMIT(pc + 1, REL(pc, term));
            pc += 2;
            prog->len++;
            break;
        case '|':
            // Resolve the Jmp left pending by a previous '|' at this level.
            // alt_label is the operand offset (opcode + 1), hence the "+ 1".
            if (alt_label) {
                EMIT(alt_label, REL(alt_label, pc) + 1);
            }
            // Split at the start of the level, Jmp (operand patched later)
            // at the end of the alternative compiled so far.
            insert_code(code, start, 2, &pc);
            EMIT(pc++, Jmp);
            alt_label = pc++;
            EMIT(start, Split);
            EMIT(start + 1, REL(start, pc));
            prog->len += 2;
            break;
        case '^':
            EMIT(pc++, Bol);
            prog->len++;
            break;
        case '$':
            EMIT(pc++, Eol);
            prog->len++;
            break;
        }
    }

done:
    // Patch the last pending alternation Jmp to land at the end of the level.
    if (alt_label) {
        EMIT(alt_label, REL(alt_label, pc) + 1);
    }
    prog->bytelen = pc;
    return re;
}
154+
155+
// Compile pattern `re` into `prog`, overwriting any previous contents.
// Layout of the generated program:
//   1. a 5-byte prefix that implements non-anchored operation ("search");
//      for anchored operation ("match") this prefix is simply skipped
//   2. Save 0
//   3. the code for the pattern itself
//   4. Save 1
//   5. Match
// prog->insts must already have room for the whole program.
// Always returns 0.
int re1_5_compilecode(ByteProg *prog, const char *re)
{
    char *out = prog->insts;
    int n = 0;

    // TODO: Implement search in much more efficient manner
    out[n++] = RSplit;
    out[n++] = 3;
    out[n++] = Any;
    out[n++] = Jmp;
    out[n++] = -5;

    out[n++] = Save;
    out[n++] = 0;

    // Three prefix instructions plus the opening Save.
    prog->len = 4;
    prog->sub = 0;
    // _compilecode appends at prog->bytelen, so publish the position first.
    prog->bytelen = n;

    _compilecode(re, prog);

    n = prog->bytelen;
    out[n++] = Save;
    out[n++] = 1;
    out[n++] = Match;
    prog->bytelen = n;

    // Closing Save and Match.
    prog->len += 2;

    return 0;
}
186+
187+
void
188+
cleanmarks(ByteProg *prog)
189+
{
190+
char *pc = prog->insts;
191+
char *end = pc + prog->bytelen;
192+
while (pc < end) {
193+
*pc &= 0x7f;
194+
switch (*pc) {
195+
case Jmp:
196+
case Split:
197+
case RSplit:
198+
case Save:
199+
case Char:
200+
pc++;
201+
}
202+
pc++;
203+
}
204+
}
205+
206+
#if 0
// Stand-alone debugging driver (compiled out): compiles the pattern given as
// the first command-line argument and dumps the resulting bytecode.
#include <stdlib.h>

int main(int argc, char *argv[])
{
    if (argc < 2) {
        return 1;
    }

    // NOTE(review): assumes ByteProg ends with its `insts` byte array, so the
    // bytecode can be allocated inline after the header - confirm against
    // regexp.h before enabling this driver.
    ByteProg *code = malloc(sizeof(*code) + re1_5_sizecode(argv[1]));
    if (code == NULL) {
        return 1;
    }

    re1_5_compilecode(code, argv[1]);
    re1_5_dumpcode(code);

    free(code);
    return 0;
}
#endif

extmod/re1.5/dumpcode.c

Lines changed: 50 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,50 @@
1+
// Copyright 2014 Paul Sokolovsky.
2+
// Use of this source code is governed by a BSD-style
3+
// license that can be found in the LICENSE file.
4+
5+
#include "regexp.h"
6+
7+
// Print a human-readable listing of the program in `prog` to stdout, one
// instruction per line prefixed with its byte offset, followed by a summary
// line with the byte and instruction counts.
// For branch instructions the absolute target offset is printed first and the
// raw signed operand in parentheses.
void re1_5_dumpcode(ByteProg *prog)
{
    int pc = 0;
    char *code = prog->insts;
    while (pc < prog->bytelen) {
        printf("%2d: ", pc);
        int op = code[pc++];
        switch (op) {
        default:
            assert(0);
            // re1_5_fatal("printprog");
            // Only reached when assertions are compiled out: report the byte
            // instead of falling into the Split case and misreading the
            // following byte as a branch operand.
            printf("unknown opcode %d\n", op);
            break;
        case Split:
        case RSplit:
        case Jmp: {
            const char *name = op == Split ? "split" : op == RSplit ? "rsplit" : "jmp";
            int off = (signed char)code[pc];
            // Target is relative to the byte following the operand.
            printf("%s %d (%d)\n", name, pc + off + 1, off);
            pc++;
            break;
        }
        case Char:
            printf("char %c\n", code[pc++]);
            break;
        case Any:
            printf("any\n");
            break;
        case Match:
            printf("match\n");
            break;
        case Save:
            printf("save %d\n", (unsigned char)code[pc++]);
            break;
        case Bol:
            printf("assert bol\n");
            break;
        case Eol:
            printf("assert eol\n");
            break;
        }
    }
    printf("Bytes: %d, insts: %d\n", prog->bytelen, prog->len);
}

extmod/re1.5/recursiveloop.c

Lines changed: 71 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,71 @@
1+
// Copyright 2007-2009 Russ Cox. All Rights Reserved.
2+
// Use of this source code is governed by a BSD-style
3+
// license that can be found in the LICENSE file.
4+
5+
#include "regexp.h"
6+
7+
static int
8+
recursiveloop(char *pc, const char *sp, Subject *input, const char **subp, int nsubp)
9+
{
10+
const char *old;
11+
int off;
12+
13+
for(;;) {
14+
if(inst_is_consumer(*pc)) {
15+
// If we need to match a character, but there's none left, it's fail
16+
if(sp >= input->end)
17+
return 0;
18+
}
19+
switch(*pc++) {
20+
case Char:
21+
if(*sp != *pc++)
22+
return 0;
23+
case Any:
24+
sp++;
25+
continue;
26+
case Match:
27+
return 1;
28+
case Jmp:
29+
off = (signed char)*pc++;
30+
pc = pc + off;
31+
continue;
32+
case Split:
33+
off = (signed char)*pc++;
34+
if(recursiveloop(pc, sp, input, subp, nsubp))
35+
return 1;
36+
pc = pc + off;
37+
continue;
38+
case RSplit:
39+
off = (signed char)*pc++;
40+
if(recursiveloop(pc + off, sp, input, subp, nsubp))
41+
return 1;
42+
continue;
43+
case Save:
44+
off = (unsigned char)*pc++;
45+
if(off >= nsubp) {
46+
continue;
47+
}
48+
old = subp[off];
49+
subp[off] = sp;
50+
if(recursiveloop(pc, sp, input, subp, nsubp))
51+
return 1;
52+
subp[off] = old;
53+
return 0;
54+
case Bol:
55+
if(sp != input->begin)
56+
return 0;
57+
continue;
58+
case Eol:
59+
if(sp != input->end)
60+
return 0;
61+
continue;
62+
}
63+
re1_5_fatal("recursiveloop");
64+
}
65+
}
66+
67+
int
68+
re1_5_recursiveloopprog(ByteProg *prog, Subject *input, const char **subp, int nsubp, int is_anchored)
69+
{
70+
return recursiveloop(HANDLE_ANCHORED(prog->insts, is_anchored), input->begin, input, subp, nsubp);
71+
}

0 commit comments

Comments
 (0)