From 7035731624b2290077801e039d2ec18ad211697e Mon Sep 17 00:00:00 2001 From: tic Date: Wed, 8 Nov 2017 17:08:12 +0300 Subject: [PATCH] #367 --- Makefile | 20 +- README.md | 4 +- src/ext/lpeg/HISTORY | 96 +++ src/ext/lpeg/lpcap.c | 537 +++++++++++++ src/ext/lpeg/lpcap.h | 56 ++ src/ext/lpeg/lpcode.c | 1014 +++++++++++++++++++++++++ src/ext/lpeg/lpcode.h | 40 + src/ext/lpeg/lpeg-128.gif | Bin 0 -> 4923 bytes src/ext/lpeg/lpeg.html | 1445 +++++++++++++++++++++++++++++++++++ src/ext/lpeg/lpprint.c | 244 ++++++ src/ext/lpeg/lpprint.h | 36 + src/ext/lpeg/lptree.c | 1303 ++++++++++++++++++++++++++++++++ src/ext/lpeg/lptree.h | 82 ++ src/ext/lpeg/lptypes.h | 149 ++++ src/ext/lpeg/lpvm.c | 364 +++++++++ src/ext/lpeg/lpvm.h | 58 ++ src/ext/lpeg/makefile | 55 ++ src/ext/lpeg/re.html | 498 ++++++++++++ src/ext/lpeg/re.lua | 259 +++++++ src/ext/lpeg/test.lua | 1503 +++++++++++++++++++++++++++++++++++++ 20 files changed, 7755 insertions(+), 8 deletions(-) create mode 100644 src/ext/lpeg/HISTORY create mode 100644 src/ext/lpeg/lpcap.c create mode 100644 src/ext/lpeg/lpcap.h create mode 100644 src/ext/lpeg/lpcode.c create mode 100644 src/ext/lpeg/lpcode.h create mode 100644 src/ext/lpeg/lpeg-128.gif create mode 100644 src/ext/lpeg/lpeg.html create mode 100644 src/ext/lpeg/lpprint.c create mode 100644 src/ext/lpeg/lpprint.h create mode 100644 src/ext/lpeg/lptree.c create mode 100644 src/ext/lpeg/lptree.h create mode 100644 src/ext/lpeg/lptypes.h create mode 100644 src/ext/lpeg/lpvm.c create mode 100644 src/ext/lpeg/lpvm.h create mode 100644 src/ext/lpeg/makefile create mode 100644 src/ext/lpeg/re.html create mode 100644 src/ext/lpeg/re.lua create mode 100755 src/ext/lpeg/test.lua diff --git a/Makefile b/Makefile index 51baaa9..052656e 100644 --- a/Makefile +++ b/Makefile @@ -26,12 +26,15 @@ MINGW_LINKER_FLAGS= \ LINUX_INCLUDES= \ `pkg-config --cflags gtk+-3.0` +LINUX_LIBS= \ + `pkg-config --libs gtk+-3.0` + LINUX64_LIBS= \ - `pkg-config --libs gtk+-3.0` \ + $(LINUX_LIBS) \ -Llib/linux LINUX32_LIBS= \ - `pkg-config --libs gtk+-3.0` \ + $(LINUX_LIBS) \ -Llib/linux32 LINUX_ARM_LIBS= \ @@ -40,7 +43,7 @@ LINUX_ARM_LIBS= \ LINUX_LINKER_FLAGS= \ -D_GNU_SOURCE \ -lSDL2 \ - -llua \ + -llua5.3 \ -ldl \ -lm \ -lpthread \ @@ -114,6 +117,8 @@ SOURCES=\ SOURCES_EXT= \ src/html.c +LPEG_SRC= src/ext/lpeg/*.c + DEMO_ASSETS= \ bin/assets/fire.tic.dat \ bin/assets/p3d.tic.dat \ @@ -272,15 +277,18 @@ mingw: $(DEMO_ASSETS) $(TIC80_DLL) $(TIC_O) bin/html.o bin/res.o run: mingw $(MINGW_OUTPUT) -linux64: +linux64-flto: $(CC) $(LINUX_INCLUDES) $(SOURCES) $(TIC80_SRC) $(SOURCES_EXT) $(OPT) $(INCLUDES) $(LINUX64_LIBS) $(LINUX_LINKER_FLAGS) -flto -o bin/tic -linux32: +linux32-flto: $(CC) $(LINUX_INCLUDES) $(SOURCES) $(TIC80_SRC) $(SOURCES_EXT) $(OPT) $(INCLUDES) $(LINUX32_LIBS) $(LINUX_LINKER_FLAGS) -flto -o bin/tic -arm: +arm-flto: $(CC) $(OPT_ARM) $(SOURCES) $(TIC80_SRC) $(OPT) $(INCLUDES) $(LINUX_ARM_LIBS) $(LINUX_LINKER_FLAGS) -flto -o bin/tic +linux: + $(CC) $(LINUX_INCLUDES) $(SOURCES) $(LPEG_SRC) $(SOURCES_EXT) $(TIC80_SRC) $(OPT) $(INCLUDES) $(LINUX_LIBS) $(LINUX_LINKER_FLAGS) -o bin/tic + macosx: $(CC) $(SOURCES) $(TIC80_SRC) $(SOURCES_EXT) src/ext/file_dialog.m $(OPT) $(MACOSX_OPT) $(INCLUDES) $(MACOSX_LIBS) -o bin/tic diff --git a/README.md b/README.md index 6be9e4b..5eb9502 100644 --- a/README.md +++ b/README.md @@ -31,10 +31,10 @@ made by [@matheuslessarodrigues](https://github.com/matheuslessarodrigues) ## Linux run the following commands in the Terminal ``` -sudo apt-get install git build-essential libgtk-3-dev +sudo apt-get install git build-essential libgtk-3-dev libsdl2-dev lua5.3-dev libgif-dev lua-lpeg-dev zlib1g-dev git clone https://github.com/nesbox/TIC-80 cd TIC-80 -make linux32 (or linux64/arm depending on your system) +make linux ``` ## iOS / tvOS diff --git a/src/ext/lpeg/HISTORY b/src/ext/lpeg/HISTORY new file mode 100644 index 0000000..0c10edd --- /dev/null +++ b/src/ext/lpeg/HISTORY @@ -0,0 +1,96 @@ +HISTORY for LPeg 1.0 + +* Changes from version 0.12 to 1.0 + --------------------------------- + + group "names" can be any Lua value + + some bugs fixed + + other small improvements + +* Changes from version 0.11 to 0.12 + --------------------------------- + + no "unsigned short" limit for pattern sizes + + mathtime captures considered nullable + + some bugs fixed + +* Changes from version 0.10 to 0.11 + ------------------------------- + + complete reimplementation of the code generator + + new syntax for table captures + + new functions in module 're' + + other small improvements + +* Changes from version 0.9 to 0.10 + ------------------------------- + + backtrack stack has configurable size + + better error messages + + Notation for non-terminals in 're' back to A instead o + + experimental look-behind pattern + + support for external extensions + + works with Lua 5.2 + + consumes less C stack + + - "and" predicates do not keep captures + +* Changes from version 0.8 to 0.9 + ------------------------------- + + The accumulator capture was replaced by a fold capture; + programs that used the old 'lpeg.Ca' will need small changes. + + Some support for character classes from old C locales. + + A new named-group capture. + +* Changes from version 0.7 to 0.8 + ------------------------------- + + New "match-time" capture. + + New "argument capture" that allows passing arguments into the pattern. + + Better documentation for 're'. + + Several small improvements for 're'. + + The 're' module has an incompatibility with previous versions: + now, any use of a non-terminal must be enclosed in angle brackets + (like ). + +* Changes from version 0.6 to 0.7 + ------------------------------- + + Several improvements in module 're': + - better documentation; + - support for most captures (all but accumulator); + - limited repetitions p{n,m}. + + Small improvements in efficiency. + + Several small bugs corrected (special thanks to Hans Hagen + and Taco Hoekwater). + +* Changes from version 0.5 to 0.6 + ------------------------------- + + Support for non-numeric indices in grammars. + + Some bug fixes (thanks to the luatex team). + + Some new optimizations; (thanks to Mike Pall). + + A new page layout (thanks to Andre Carregal). + + Minimal documentation for module 're'. + +* Changes from version 0.4 to 0.5 + ------------------------------- + + Several optimizations. + + lpeg.P now accepts booleans. + + Some new examples. + + A proper license. + + Several small improvements. + +* Changes from version 0.3 to 0.4 + ------------------------------- + + Static check for loops in repetitions and grammars. + + Removed label option in captures. + + The implementation of captures uses less memory. + +* Changes from version 0.2 to 0.3 + ------------------------------- + + User-defined patterns in Lua. + + Several new captures. + +* Changes from version 0.1 to 0.2 + ------------------------------- + + Several small corrections. + + Handles embedded zeros like any other character. + + Capture "name" can be any Lua value. + + Unlimited number of captures. + + Match gets an optional initial position. + +(end of HISTORY) diff --git a/src/ext/lpeg/lpcap.c b/src/ext/lpeg/lpcap.c new file mode 100644 index 0000000..c9085de --- /dev/null +++ b/src/ext/lpeg/lpcap.c @@ -0,0 +1,537 @@ +/* +** $Id: lpcap.c,v 1.6 2015/06/15 16:09:57 roberto Exp $ +** Copyright 2007, Lua.org & PUC-Rio (see 'lpeg.html' for license) +*/ + +#include "lua.h" +#include "lauxlib.h" + +#include "lpcap.h" +#include "lptypes.h" + + +#define captype(cap) ((cap)->kind) + +#define isclosecap(cap) (captype(cap) == Cclose) + +#define closeaddr(c) ((c)->s + (c)->siz - 1) + +#define isfullcap(cap) ((cap)->siz != 0) + +#define getfromktable(cs,v) lua_rawgeti((cs)->L, ktableidx((cs)->ptop), v) + +#define pushluaval(cs) getfromktable(cs, (cs)->cap->idx) + + + +/* +** Put at the cache for Lua values the value indexed by 'v' in ktable +** of the running pattern (if it is not there yet); returns its index. +*/ +static int updatecache (CapState *cs, int v) { + int idx = cs->ptop + 1; /* stack index of cache for Lua values */ + if (v != cs->valuecached) { /* not there? */ + getfromktable(cs, v); /* get value from 'ktable' */ + lua_replace(cs->L, idx); /* put it at reserved stack position */ + cs->valuecached = v; /* keep track of what is there */ + } + return idx; +} + + +static int pushcapture (CapState *cs); + + +/* +** Goes back in a list of captures looking for an open capture +** corresponding to a close +*/ +static Capture *findopen (Capture *cap) { + int n = 0; /* number of closes waiting an open */ + for (;;) { + cap--; + if (isclosecap(cap)) n++; /* one more open to skip */ + else if (!isfullcap(cap)) + if (n-- == 0) return cap; + } +} + + +/* +** Go to the next capture +*/ +static void nextcap (CapState *cs) { + Capture *cap = cs->cap; + if (!isfullcap(cap)) { /* not a single capture? */ + int n = 0; /* number of opens waiting a close */ + for (;;) { /* look for corresponding close */ + cap++; + if (isclosecap(cap)) { + if (n-- == 0) break; + } + else if (!isfullcap(cap)) n++; + } + } + cs->cap = cap + 1; /* + 1 to skip last close (or entire single capture) */ +} + + +/* +** Push on the Lua stack all values generated by nested captures inside +** the current capture. Returns number of values pushed. 'addextra' +** makes it push the entire match after all captured values. The +** entire match is pushed also if there are no other nested values, +** so the function never returns zero. +*/ +static int pushnestedvalues (CapState *cs, int addextra) { + Capture *co = cs->cap; + if (isfullcap(cs->cap++)) { /* no nested captures? */ + lua_pushlstring(cs->L, co->s, co->siz - 1); /* push whole match */ + return 1; /* that is it */ + } + else { + int n = 0; + while (!isclosecap(cs->cap)) /* repeat for all nested patterns */ + n += pushcapture(cs); + if (addextra || n == 0) { /* need extra? */ + lua_pushlstring(cs->L, co->s, cs->cap->s - co->s); /* push whole match */ + n++; + } + cs->cap++; /* skip close entry */ + return n; + } +} + + +/* +** Push only the first value generated by nested captures +*/ +static void pushonenestedvalue (CapState *cs) { + int n = pushnestedvalues(cs, 0); + if (n > 1) + lua_pop(cs->L, n - 1); /* pop extra values */ +} + + +/* +** Try to find a named group capture with the name given at the top of +** the stack; goes backward from 'cap'. +*/ +static Capture *findback (CapState *cs, Capture *cap) { + lua_State *L = cs->L; + while (cap-- > cs->ocap) { /* repeat until end of list */ + if (isclosecap(cap)) + cap = findopen(cap); /* skip nested captures */ + else if (!isfullcap(cap)) + continue; /* opening an enclosing capture: skip and get previous */ + if (captype(cap) == Cgroup) { + getfromktable(cs, cap->idx); /* get group name */ + if (lp_equal(L, -2, -1)) { /* right group? */ + lua_pop(L, 2); /* remove reference name and group name */ + return cap; + } + else lua_pop(L, 1); /* remove group name */ + } + } + luaL_error(L, "back reference '%s' not found", lua_tostring(L, -1)); + return NULL; /* to avoid warnings */ +} + + +/* +** Back-reference capture. Return number of values pushed. +*/ +static int backrefcap (CapState *cs) { + int n; + Capture *curr = cs->cap; + pushluaval(cs); /* reference name */ + cs->cap = findback(cs, curr); /* find corresponding group */ + n = pushnestedvalues(cs, 0); /* push group's values */ + cs->cap = curr + 1; + return n; +} + + +/* +** Table capture: creates a new table and populates it with nested +** captures. +*/ +static int tablecap (CapState *cs) { + lua_State *L = cs->L; + int n = 0; + lua_newtable(L); + if (isfullcap(cs->cap++)) + return 1; /* table is empty */ + while (!isclosecap(cs->cap)) { + if (captype(cs->cap) == Cgroup && cs->cap->idx != 0) { /* named group? */ + pushluaval(cs); /* push group name */ + pushonenestedvalue(cs); + lua_settable(L, -3); + } + else { /* not a named group */ + int i; + int k = pushcapture(cs); + for (i = k; i > 0; i--) /* store all values into table */ + lua_rawseti(L, -(i + 1), n + i); + n += k; + } + } + cs->cap++; /* skip close entry */ + return 1; /* number of values pushed (only the table) */ +} + + +/* +** Table-query capture +*/ +static int querycap (CapState *cs) { + int idx = cs->cap->idx; + pushonenestedvalue(cs); /* get nested capture */ + lua_gettable(cs->L, updatecache(cs, idx)); /* query cap. value at table */ + if (!lua_isnil(cs->L, -1)) + return 1; + else { /* no value */ + lua_pop(cs->L, 1); /* remove nil */ + return 0; + } +} + + +/* +** Fold capture +*/ +static int foldcap (CapState *cs) { + int n; + lua_State *L = cs->L; + int idx = cs->cap->idx; + if (isfullcap(cs->cap++) || /* no nested captures? */ + isclosecap(cs->cap) || /* no nested captures (large subject)? */ + (n = pushcapture(cs)) == 0) /* nested captures with no values? */ + return luaL_error(L, "no initial value for fold capture"); + if (n > 1) + lua_pop(L, n - 1); /* leave only one result for accumulator */ + while (!isclosecap(cs->cap)) { + lua_pushvalue(L, updatecache(cs, idx)); /* get folding function */ + lua_insert(L, -2); /* put it before accumulator */ + n = pushcapture(cs); /* get next capture's values */ + lua_call(L, n + 1, 1); /* call folding function */ + } + cs->cap++; /* skip close entry */ + return 1; /* only accumulator left on the stack */ +} + + +/* +** Function capture +*/ +static int functioncap (CapState *cs) { + int n; + int top = lua_gettop(cs->L); + pushluaval(cs); /* push function */ + n = pushnestedvalues(cs, 0); /* push nested captures */ + lua_call(cs->L, n, LUA_MULTRET); /* call function */ + return lua_gettop(cs->L) - top; /* return function's results */ +} + + +/* +** Select capture +*/ +static int numcap (CapState *cs) { + int idx = cs->cap->idx; /* value to select */ + if (idx == 0) { /* no values? */ + nextcap(cs); /* skip entire capture */ + return 0; /* no value produced */ + } + else { + int n = pushnestedvalues(cs, 0); + if (n < idx) /* invalid index? */ + return luaL_error(cs->L, "no capture '%d'", idx); + else { + lua_pushvalue(cs->L, -(n - idx + 1)); /* get selected capture */ + lua_replace(cs->L, -(n + 1)); /* put it in place of 1st capture */ + lua_pop(cs->L, n - 1); /* remove other captures */ + return 1; + } + } +} + + +/* +** Return the stack index of the first runtime capture in the given +** list of captures (or zero if no runtime captures) +*/ +int finddyncap (Capture *cap, Capture *last) { + for (; cap < last; cap++) { + if (cap->kind == Cruntime) + return cap->idx; /* stack position of first capture */ + } + return 0; /* no dynamic captures in this segment */ +} + + +/* +** Calls a runtime capture. Returns number of captures removed by +** the call, including the initial Cgroup. (Captures to be added are +** on the Lua stack.) +*/ +int runtimecap (CapState *cs, Capture *close, const char *s, int *rem) { + int n, id; + lua_State *L = cs->L; + int otop = lua_gettop(L); + Capture *open = findopen(close); + assert(captype(open) == Cgroup); + id = finddyncap(open, close); /* get first dynamic capture argument */ + close->kind = Cclose; /* closes the group */ + close->s = s; + cs->cap = open; cs->valuecached = 0; /* prepare capture state */ + luaL_checkstack(L, 4, "too many runtime captures"); + pushluaval(cs); /* push function to be called */ + lua_pushvalue(L, SUBJIDX); /* push original subject */ + lua_pushinteger(L, s - cs->s + 1); /* push current position */ + n = pushnestedvalues(cs, 0); /* push nested captures */ + lua_call(L, n + 2, LUA_MULTRET); /* call dynamic function */ + if (id > 0) { /* are there old dynamic captures to be removed? */ + int i; + for (i = id; i <= otop; i++) + lua_remove(L, id); /* remove old dynamic captures */ + *rem = otop - id + 1; /* total number of dynamic captures removed */ + } + else + *rem = 0; /* no dynamic captures removed */ + return close - open; /* number of captures of all kinds removed */ +} + + +/* +** Auxiliary structure for substitution and string captures: keep +** information about nested captures for future use, avoiding to push +** string results into Lua +*/ +typedef struct StrAux { + int isstring; /* whether capture is a string */ + union { + Capture *cp; /* if not a string, respective capture */ + struct { /* if it is a string... */ + const char *s; /* ... starts here */ + const char *e; /* ... ends here */ + } s; + } u; +} StrAux; + +#define MAXSTRCAPS 10 + +/* +** Collect values from current capture into array 'cps'. Current +** capture must be Cstring (first call) or Csimple (recursive calls). +** (In first call, fills %0 with whole match for Cstring.) +** Returns number of elements in the array that were filled. +*/ +static int getstrcaps (CapState *cs, StrAux *cps, int n) { + int k = n++; + cps[k].isstring = 1; /* get string value */ + cps[k].u.s.s = cs->cap->s; /* starts here */ + if (!isfullcap(cs->cap++)) { /* nested captures? */ + while (!isclosecap(cs->cap)) { /* traverse them */ + if (n >= MAXSTRCAPS) /* too many captures? */ + nextcap(cs); /* skip extra captures (will not need them) */ + else if (captype(cs->cap) == Csimple) /* string? */ + n = getstrcaps(cs, cps, n); /* put info. into array */ + else { + cps[n].isstring = 0; /* not a string */ + cps[n].u.cp = cs->cap; /* keep original capture */ + nextcap(cs); + n++; + } + } + cs->cap++; /* skip close */ + } + cps[k].u.s.e = closeaddr(cs->cap - 1); /* ends here */ + return n; +} + + +/* +** add next capture value (which should be a string) to buffer 'b' +*/ +static int addonestring (luaL_Buffer *b, CapState *cs, const char *what); + + +/* +** String capture: add result to buffer 'b' (instead of pushing +** it into the stack) +*/ +static void stringcap (luaL_Buffer *b, CapState *cs) { + StrAux cps[MAXSTRCAPS]; + int n; + size_t len, i; + const char *fmt; /* format string */ + fmt = lua_tolstring(cs->L, updatecache(cs, cs->cap->idx), &len); + n = getstrcaps(cs, cps, 0) - 1; /* collect nested captures */ + for (i = 0; i < len; i++) { /* traverse them */ + if (fmt[i] != '%') /* not an escape? */ + luaL_addchar(b, fmt[i]); /* add it to buffer */ + else if (fmt[++i] < '0' || fmt[i] > '9') /* not followed by a digit? */ + luaL_addchar(b, fmt[i]); /* add to buffer */ + else { + int l = fmt[i] - '0'; /* capture index */ + if (l > n) + luaL_error(cs->L, "invalid capture index (%d)", l); + else if (cps[l].isstring) + luaL_addlstring(b, cps[l].u.s.s, cps[l].u.s.e - cps[l].u.s.s); + else { + Capture *curr = cs->cap; + cs->cap = cps[l].u.cp; /* go back to evaluate that nested capture */ + if (!addonestring(b, cs, "capture")) + luaL_error(cs->L, "no values in capture index %d", l); + cs->cap = curr; /* continue from where it stopped */ + } + } + } +} + + +/* +** Substitution capture: add result to buffer 'b' +*/ +static void substcap (luaL_Buffer *b, CapState *cs) { + const char *curr = cs->cap->s; + if (isfullcap(cs->cap)) /* no nested captures? */ + luaL_addlstring(b, curr, cs->cap->siz - 1); /* keep original text */ + else { + cs->cap++; /* skip open entry */ + while (!isclosecap(cs->cap)) { /* traverse nested captures */ + const char *next = cs->cap->s; + luaL_addlstring(b, curr, next - curr); /* add text up to capture */ + if (addonestring(b, cs, "replacement")) + curr = closeaddr(cs->cap - 1); /* continue after match */ + else /* no capture value */ + curr = next; /* keep original text in final result */ + } + luaL_addlstring(b, curr, cs->cap->s - curr); /* add last piece of text */ + } + cs->cap++; /* go to next capture */ +} + + +/* +** Evaluates a capture and adds its first value to buffer 'b'; returns +** whether there was a value +*/ +static int addonestring (luaL_Buffer *b, CapState *cs, const char *what) { + switch (captype(cs->cap)) { + case Cstring: + stringcap(b, cs); /* add capture directly to buffer */ + return 1; + case Csubst: + substcap(b, cs); /* add capture directly to buffer */ + return 1; + default: { + lua_State *L = cs->L; + int n = pushcapture(cs); + if (n > 0) { + if (n > 1) lua_pop(L, n - 1); /* only one result */ + if (!lua_isstring(L, -1)) + luaL_error(L, "invalid %s value (a %s)", what, luaL_typename(L, -1)); + luaL_addvalue(b); + } + return n; + } + } +} + + +/* +** Push all values of the current capture into the stack; returns +** number of values pushed +*/ +static int pushcapture (CapState *cs) { + lua_State *L = cs->L; + luaL_checkstack(L, 4, "too many captures"); + switch (captype(cs->cap)) { + case Cposition: { + lua_pushinteger(L, cs->cap->s - cs->s + 1); + cs->cap++; + return 1; + } + case Cconst: { + pushluaval(cs); + cs->cap++; + return 1; + } + case Carg: { + int arg = (cs->cap++)->idx; + if (arg + FIXEDARGS > cs->ptop) + return luaL_error(L, "reference to absent extra argument #%d", arg); + lua_pushvalue(L, arg + FIXEDARGS); + return 1; + } + case Csimple: { + int k = pushnestedvalues(cs, 1); + lua_insert(L, -k); /* make whole match be first result */ + return k; + } + case Cruntime: { + lua_pushvalue(L, (cs->cap++)->idx); /* value is in the stack */ + return 1; + } + case Cstring: { + luaL_Buffer b; + luaL_buffinit(L, &b); + stringcap(&b, cs); + luaL_pushresult(&b); + return 1; + } + case Csubst: { + luaL_Buffer b; + luaL_buffinit(L, &b); + substcap(&b, cs); + luaL_pushresult(&b); + return 1; + } + case Cgroup: { + if (cs->cap->idx == 0) /* anonymous group? */ + return pushnestedvalues(cs, 0); /* add all nested values */ + else { /* named group: add no values */ + nextcap(cs); /* skip capture */ + return 0; + } + } + case Cbackref: return backrefcap(cs); + case Ctable: return tablecap(cs); + case Cfunction: return functioncap(cs); + case Cnum: return numcap(cs); + case Cquery: return querycap(cs); + case Cfold: return foldcap(cs); + default: assert(0); return 0; + } +} + + +/* +** Prepare a CapState structure and traverse the entire list of +** captures in the stack pushing its results. 's' is the subject +** string, 'r' is the final position of the match, and 'ptop' +** the index in the stack where some useful values were pushed. +** Returns the number of results pushed. (If the list produces no +** results, push the final position of the match.) +*/ +int getcaptures (lua_State *L, const char *s, const char *r, int ptop) { + Capture *capture = (Capture *)lua_touserdata(L, caplistidx(ptop)); + int n = 0; + if (!isclosecap(capture)) { /* is there any capture? */ + CapState cs; + cs.ocap = cs.cap = capture; cs.L = L; + cs.s = s; cs.valuecached = 0; cs.ptop = ptop; + do { /* collect their values */ + n += pushcapture(&cs); + } while (!isclosecap(cs.cap)); + } + if (n == 0) { /* no capture values? */ + lua_pushinteger(L, r - s + 1); /* return only end position */ + n = 1; + } + return n; +} + + diff --git a/src/ext/lpeg/lpcap.h b/src/ext/lpeg/lpcap.h new file mode 100644 index 0000000..6133df2 --- /dev/null +++ b/src/ext/lpeg/lpcap.h @@ -0,0 +1,56 @@ +/* +** $Id: lpcap.h,v 1.3 2016/09/13 17:45:58 roberto Exp $ +*/ + +#if !defined(lpcap_h) +#define lpcap_h + + +#include "lptypes.h" + + +/* kinds of captures */ +typedef enum CapKind { + Cclose, /* not used in trees */ + Cposition, + Cconst, /* ktable[key] is Lua constant */ + Cbackref, /* ktable[key] is "name" of group to get capture */ + Carg, /* 'key' is arg's number */ + Csimple, /* next node is pattern */ + Ctable, /* next node is pattern */ + Cfunction, /* ktable[key] is function; next node is pattern */ + Cquery, /* ktable[key] is table; next node is pattern */ + Cstring, /* ktable[key] is string; next node is pattern */ + Cnum, /* numbered capture; 'key' is number of value to return */ + Csubst, /* substitution capture; next node is pattern */ + Cfold, /* ktable[key] is function; next node is pattern */ + Cruntime, /* not used in trees (is uses another type for tree) */ + Cgroup /* ktable[key] is group's "name" */ +} CapKind; + + +typedef struct Capture { + const char *s; /* subject position */ + unsigned short idx; /* extra info (group name, arg index, etc.) */ + byte kind; /* kind of capture */ + byte siz; /* size of full capture + 1 (0 = not a full capture) */ +} Capture; + + +typedef struct CapState { + Capture *cap; /* current capture */ + Capture *ocap; /* (original) capture list */ + lua_State *L; + int ptop; /* index of last argument to 'match' */ + const char *s; /* original string */ + int valuecached; /* value stored in cache slot */ +} CapState; + + +int runtimecap (CapState *cs, Capture *close, const char *s, int *rem); +int getcaptures (lua_State *L, const char *s, const char *r, int ptop); +int finddyncap (Capture *cap, Capture *last); + +#endif + + diff --git a/src/ext/lpeg/lpcode.c b/src/ext/lpeg/lpcode.c new file mode 100644 index 0000000..2722d71 --- /dev/null +++ b/src/ext/lpeg/lpcode.c @@ -0,0 +1,1014 @@ +/* +** $Id: lpcode.c,v 1.24 2016/09/15 17:46:13 roberto Exp $ +** Copyright 2007, Lua.org & PUC-Rio (see 'lpeg.html' for license) +*/ + +#include + + +#include "lua.h" +#include "lauxlib.h" + +#include "lptypes.h" +#include "lpcode.h" + + +/* signals a "no-instruction */ +#define NOINST -1 + + + +static const Charset fullset_ = + {{0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF}}; + +static const Charset *fullset = &fullset_; + +/* +** {====================================================== +** Analysis and some optimizations +** ======================================================= +*/ + +/* +** Check whether a charset is empty (returns IFail), singleton (IChar), +** full (IAny), or none of those (ISet). When singleton, '*c' returns +** which character it is. (When generic set, the set was the input, +** so there is no need to return it.) +*/ +static Opcode charsettype (const byte *cs, int *c) { + int count = 0; /* number of characters in the set */ + int i; + int candidate = -1; /* candidate position for the singleton char */ + for (i = 0; i < CHARSETSIZE; i++) { /* for each byte */ + int b = cs[i]; + if (b == 0) { /* is byte empty? */ + if (count > 1) /* was set neither empty nor singleton? */ + return ISet; /* neither full nor empty nor singleton */ + /* else set is still empty or singleton */ + } + else if (b == 0xFF) { /* is byte full? */ + if (count < (i * BITSPERCHAR)) /* was set not full? */ + return ISet; /* neither full nor empty nor singleton */ + else count += BITSPERCHAR; /* set is still full */ + } + else if ((b & (b - 1)) == 0) { /* has byte only one bit? */ + if (count > 0) /* was set not empty? */ + return ISet; /* neither full nor empty nor singleton */ + else { /* set has only one char till now; track it */ + count++; + candidate = i; + } + } + else return ISet; /* byte is neither empty, full, nor singleton */ + } + switch (count) { + case 0: return IFail; /* empty set */ + case 1: { /* singleton; find character bit inside byte */ + int b = cs[candidate]; + *c = candidate * BITSPERCHAR; + if ((b & 0xF0) != 0) { *c += 4; b >>= 4; } + if ((b & 0x0C) != 0) { *c += 2; b >>= 2; } + if ((b & 0x02) != 0) { *c += 1; } + return IChar; + } + default: { + assert(count == CHARSETSIZE * BITSPERCHAR); /* full set */ + return IAny; + } + } +} + + +/* +** A few basic operations on Charsets +*/ +static void cs_complement (Charset *cs) { + loopset(i, cs->cs[i] = ~cs->cs[i]); +} + +static int cs_equal (const byte *cs1, const byte *cs2) { + loopset(i, if (cs1[i] != cs2[i]) return 0); + return 1; +} + +static int cs_disjoint (const Charset *cs1, const Charset *cs2) { + loopset(i, if ((cs1->cs[i] & cs2->cs[i]) != 0) return 0;) + return 1; +} + + +/* +** If 'tree' is a 'char' pattern (TSet, TChar, TAny), convert it into a +** charset and return 1; else return 0. +*/ +int tocharset (TTree *tree, Charset *cs) { + switch (tree->tag) { + case TSet: { /* copy set */ + loopset(i, cs->cs[i] = treebuffer(tree)[i]); + return 1; + } + case TChar: { /* only one char */ + assert(0 <= tree->u.n && tree->u.n <= UCHAR_MAX); + loopset(i, cs->cs[i] = 0); /* erase all chars */ + setchar(cs->cs, tree->u.n); /* add that one */ + return 1; + } + case TAny: { + loopset(i, cs->cs[i] = 0xFF); /* add all characters to the set */ + return 1; + } + default: return 0; + } +} + + +/* +** Visit a TCall node taking care to stop recursion. If node not yet +** visited, return 'f(sib2(tree))', otherwise return 'def' (default +** value) +*/ +static int callrecursive (TTree *tree, int f (TTree *t), int def) { + int key = tree->key; + assert(tree->tag == TCall); + assert(sib2(tree)->tag == TRule); + if (key == 0) /* node already visited? */ + return def; /* return default value */ + else { /* first visit */ + int result; + tree->key = 0; /* mark call as already visited */ + result = f(sib2(tree)); /* go to called rule */ + tree->key = key; /* restore tree */ + return result; + } +} + + +/* +** Check whether a pattern tree has captures +*/ +int hascaptures (TTree *tree) { + tailcall: + switch (tree->tag) { + case TCapture: case TRunTime: + return 1; + case TCall: + return callrecursive(tree, hascaptures, 0); + case TRule: /* do not follow siblings */ + tree = sib1(tree); goto tailcall; + case TOpenCall: assert(0); + default: { + switch (numsiblings[tree->tag]) { + case 1: /* return hascaptures(sib1(tree)); */ + tree = sib1(tree); goto tailcall; + case 2: + if (hascaptures(sib1(tree))) + return 1; + /* else return hascaptures(sib2(tree)); */ + tree = sib2(tree); goto tailcall; + default: assert(numsiblings[tree->tag] == 0); return 0; + } + } + } +} + + +/* +** Checks how a pattern behaves regarding the empty string, +** in one of two different ways: +** A pattern is *nullable* if it can match without consuming any character; +** A pattern is *nofail* if it never fails for any string +** (including the empty string). +** The difference is only for predicates and run-time captures; +** for other patterns, the two properties are equivalent. +** (With predicates, &'a' is nullable but not nofail. Of course, +** nofail => nullable.) +** These functions are all convervative in the following way: +** p is nullable => nullable(p) +** nofail(p) => p cannot fail +** The function assumes that TOpenCall is not nullable; +** this will be checked again when the grammar is fixed. +** Run-time captures can do whatever they want, so the result +** is conservative. +*/ +int checkaux (TTree *tree, int pred) { + tailcall: + switch (tree->tag) { + case TChar: case TSet: case TAny: + case TFalse: case TOpenCall: + return 0; /* not nullable */ + case TRep: case TTrue: + return 1; /* no fail */ + case TNot: case TBehind: /* can match empty, but can fail */ + if (pred == PEnofail) return 0; + else return 1; /* PEnullable */ + case TAnd: /* can match empty; fail iff body does */ + if (pred == PEnullable) return 1; + /* else return checkaux(sib1(tree), pred); */ + tree = sib1(tree); goto tailcall; + case TRunTime: /* can fail; match empty iff body does */ + if (pred == PEnofail) return 0; + /* else return checkaux(sib1(tree), pred); */ + tree = sib1(tree); goto tailcall; + case TSeq: + if (!checkaux(sib1(tree), pred)) return 0; + /* else return checkaux(sib2(tree), pred); */ + tree = sib2(tree); goto tailcall; + case TChoice: + if (checkaux(sib2(tree), pred)) return 1; + /* else return checkaux(sib1(tree), pred); */ + tree = sib1(tree); goto tailcall; + case TCapture: case TGrammar: case TRule: + /* return checkaux(sib1(tree), pred); */ + tree = sib1(tree); goto tailcall; + case TCall: /* return checkaux(sib2(tree), pred); */ + tree = sib2(tree); goto tailcall; + default: assert(0); return 0; + } +} + + +/* +** number of characters to match a pattern (or -1 if variable) +*/ +int fixedlen (TTree *tree) { + int len = 0; /* to accumulate in tail calls */ + tailcall: + switch (tree->tag) { + case TChar: case TSet: case TAny: + return len + 1; + case TFalse: case TTrue: case TNot: case TAnd: case TBehind: + return len; + case TRep: case TRunTime: case TOpenCall: + return -1; + case TCapture: case TRule: case TGrammar: + /* return fixedlen(sib1(tree)); */ + tree = sib1(tree); goto tailcall; + case TCall: { + int n1 = callrecursive(tree, fixedlen, -1); + if (n1 < 0) + return -1; + else + return len + n1; + } + case TSeq: { + int n1 = fixedlen(sib1(tree)); + if (n1 < 0) + return -1; + /* else return fixedlen(sib2(tree)) + len; */ + len += n1; tree = sib2(tree); goto tailcall; + } + case TChoice: { + int n1 = fixedlen(sib1(tree)); + int n2 = fixedlen(sib2(tree)); + if (n1 != n2 || n1 < 0) + return -1; + else + return len + n1; + } + default: assert(0); return 0; + }; +} + + +/* +** Computes the 'first set' of a pattern. +** The result is a conservative aproximation: +** match p ax -> x (for some x) ==> a belongs to first(p) +** or +** a not in first(p) ==> match p ax -> fail (for all x) +** +** The set 'follow' is the first set of what follows the +** pattern (full set if nothing follows it). +** +** The function returns 0 when this resulting set can be used for +** test instructions that avoid the pattern altogether. +** A non-zero return can happen for two reasons: +** 1) match p '' -> '' ==> return has bit 1 set +** (tests cannot be used because they would always fail for an empty input); +** 2) there is a match-time capture ==> return has bit 2 set +** (optimizations should not bypass match-time captures). +*/ +static int getfirst (TTree *tree, const Charset *follow, Charset *firstset) { + tailcall: + switch (tree->tag) { + case TChar: case TSet: case TAny: { + tocharset(tree, firstset); + return 0; + } + case TTrue: { + loopset(i, firstset->cs[i] = follow->cs[i]); + return 1; /* accepts the empty string */ + } + case TFalse: { + loopset(i, firstset->cs[i] = 0); + return 0; + } + case TChoice: { + Charset csaux; + int e1 = getfirst(sib1(tree), follow, firstset); + int e2 = getfirst(sib2(tree), follow, &csaux); + loopset(i, firstset->cs[i] |= csaux.cs[i]); + return e1 | e2; + } + case TSeq: { + if (!nullable(sib1(tree))) { + /* when p1 is not nullable, p2 has nothing to contribute; + return getfirst(sib1(tree), fullset, firstset); */ + tree = sib1(tree); follow = fullset; goto tailcall; + } + else { /* FIRST(p1 p2, fl) = FIRST(p1, FIRST(p2, fl)) */ + Charset csaux; + int e2 = getfirst(sib2(tree), follow, &csaux); + int e1 = getfirst(sib1(tree), &csaux, firstset); + if (e1 == 0) return 0; /* 'e1' ensures that first can be used */ + else if ((e1 | e2) & 2) /* one of the children has a matchtime? */ + return 2; /* pattern has a matchtime capture */ + else return e2; /* else depends on 'e2' */ + } + } + case TRep: { + getfirst(sib1(tree), follow, firstset); + loopset(i, firstset->cs[i] |= follow->cs[i]); + return 1; /* accept the empty string */ + } + case TCapture: case TGrammar: case TRule: { + /* return getfirst(sib1(tree), follow, firstset); */ + tree = sib1(tree); goto tailcall; + } + case TRunTime: { /* function invalidates any follow info. */ + int e = getfirst(sib1(tree), fullset, firstset); + if (e) return 2; /* function is not "protected"? */ + else return 0; /* pattern inside capture ensures first can be used */ + } + case TCall: { + /* return getfirst(sib2(tree), follow, firstset); */ + tree = sib2(tree); goto tailcall; + } + case TAnd: { + int e = getfirst(sib1(tree), follow, firstset); + loopset(i, firstset->cs[i] &= follow->cs[i]); + return e; + } + case TNot: { + if (tocharset(sib1(tree), firstset)) { + cs_complement(firstset); + return 1; + } + /* else go through */ + } + case TBehind: { /* instruction gives no new information */ + /* call 'getfirst' only to check for math-time captures */ + int e = getfirst(sib1(tree), follow, firstset); + loopset(i, firstset->cs[i] = follow->cs[i]); /* uses follow */ + return e | 1; /* always can accept the empty string */ + } + default: assert(0); return 0; + } +} + + +/* +** If 'headfail(tree)' true, then 'tree' can fail only depending on the +** next character of the subject. +*/ +static int headfail (TTree *tree) { + tailcall: + switch (tree->tag) { + case TChar: case TSet: case TAny: case TFalse: + return 1; + case TTrue: case TRep: case TRunTime: case TNot: + case TBehind: + return 0; + case TCapture: case TGrammar: case TRule: case TAnd: + tree = sib1(tree); goto tailcall; /* return headfail(sib1(tree)); */ + case TCall: + tree = sib2(tree); goto tailcall; /* return headfail(sib2(tree)); */ + case TSeq: + if (!nofail(sib2(tree))) return 0; + /* else return headfail(sib1(tree)); */ + tree = sib1(tree); goto tailcall; + case TChoice: + if (!headfail(sib1(tree))) return 0; + /* else return headfail(sib2(tree)); */ + tree = sib2(tree); goto tailcall; + default: assert(0); return 0; + } +} + + +/* +** Check whether the code generation for the given tree can benefit +** from a follow set (to avoid computing the follow set when it is +** not needed) +*/ +static int needfollow (TTree *tree) { + tailcall: + switch (tree->tag) { + case TChar: case TSet: case TAny: + case TFalse: case TTrue: case TAnd: case TNot: + case TRunTime: case TGrammar: case TCall: case TBehind: + return 0; + case TChoice: case TRep: + return 1; + case TCapture: + tree = sib1(tree); goto tailcall; + case TSeq: + tree = sib2(tree); goto tailcall; + default: assert(0); return 0; + } +} + +/* }====================================================== */ + + + +/* +** {====================================================== +** Code generation +** ======================================================= +*/ + + +/* +** size of an instruction +*/ +int sizei (const Instruction *i) { + switch((Opcode)i->i.code) { + case ISet: case ISpan: return CHARSETINSTSIZE; + case ITestSet: return CHARSETINSTSIZE + 1; + case ITestChar: case ITestAny: case IChoice: case IJmp: case ICall: + case IOpenCall: case ICommit: case IPartialCommit: case IBackCommit: + return 2; + default: return 1; + } +} + + +/* +** state for the compiler +*/ +typedef struct CompileState { + Pattern *p; /* pattern being compiled */ + int ncode; /* next position in p->code to be filled */ + lua_State *L; +} CompileState; + + +/* +** code generation is recursive; 'opt' indicates that the code is being +** generated as the last thing inside an optional pattern (so, if that +** code is optional too, it can reuse the 'IChoice' already in place for +** the outer pattern). 'tt' points to a previous test protecting this +** code (or NOINST). 'fl' is the follow set of the pattern. +*/ +static void codegen (CompileState *compst, TTree *tree, int opt, int tt, + const Charset *fl); + + +void realloccode (lua_State *L, Pattern *p, int nsize) { + void *ud; + lua_Alloc f = lua_getallocf(L, &ud); + void *newblock = f(ud, p->code, p->codesize * sizeof(Instruction), + nsize * sizeof(Instruction)); + if (newblock == NULL && nsize > 0) + luaL_error(L, "not enough memory"); + p->code = (Instruction *)newblock; + p->codesize = nsize; +} + + +static int nextinstruction (CompileState *compst) { + int size = compst->p->codesize; + if (compst->ncode >= size) + realloccode(compst->L, compst->p, size * 2); + return compst->ncode++; +} + + +#define getinstr(cs,i) ((cs)->p->code[i]) + + +static int addinstruction (CompileState *compst, Opcode op, int aux) { + int i = nextinstruction(compst); + getinstr(compst, i).i.code = op; + getinstr(compst, i).i.aux = aux; + return i; +} + + +/* +** Add an instruction followed by space for an offset (to be set later) +*/ +static int addoffsetinst (CompileState *compst, Opcode op) { + int i = addinstruction(compst, op, 0); /* instruction */ + addinstruction(compst, (Opcode)0, 0); /* open space for offset */ + assert(op == ITestSet || sizei(&getinstr(compst, i)) == 2); + return i; +} + + +/* +** Set the offset of an instruction +*/ +static void setoffset (CompileState *compst, int instruction, int offset) { + getinstr(compst, instruction + 1).offset = offset; +} + + +/* +** Add a capture instruction: +** 'op' is the capture instruction; 'cap' the capture kind; +** 'key' the key into ktable; 'aux' is the optional capture offset +** +*/ +static int addinstcap (CompileState *compst, Opcode op, int cap, int key, + int aux) { + int i = addinstruction(compst, op, joinkindoff(cap, aux)); + getinstr(compst, i).i.key = key; + return i; +} + + +#define gethere(compst) ((compst)->ncode) + +#define target(code,i) ((i) + code[i + 1].offset) + + +/* +** Patch 'instruction' to jump to 'target' +*/ +static void jumptothere (CompileState *compst, int instruction, int target) { + if (instruction >= 0) + setoffset(compst, instruction, target - instruction); +} + + +/* +** Patch 'instruction' to jump to current position +*/ +static void jumptohere (CompileState *compst, int instruction) { + jumptothere(compst, instruction, gethere(compst)); +} + + +/* +** Code an IChar instruction, or IAny if there is an equivalent +** test dominating it +*/ +static void codechar (CompileState *compst, int c, int tt) { + if (tt >= 0 && getinstr(compst, tt).i.code == ITestChar && + getinstr(compst, tt).i.aux == c) + addinstruction(compst, IAny, 0); + else + addinstruction(compst, IChar, c); +} + + +/* +** Add a charset posfix to an instruction +*/ +static void addcharset (CompileState *compst, const byte *cs) { + int p = gethere(compst); + int i; + for (i = 0; i < (int)CHARSETINSTSIZE - 1; i++) + nextinstruction(compst); /* space for buffer */ + /* fill buffer with charset */ + loopset(j, getinstr(compst, p).buff[j] = cs[j]); +} + + +/* +** code a char set, optimizing unit sets for IChar, "complete" +** sets for IAny, and empty sets for IFail; also use an IAny +** when instruction is dominated by an equivalent test. +*/ +static void codecharset (CompileState *compst, const byte *cs, int tt) { + int c = 0; /* (=) to avoid warnings */ + Opcode op = charsettype(cs, &c); + switch (op) { + case IChar: codechar(compst, c, tt); break; + case ISet: { /* non-trivial set? */ + if (tt >= 0 && getinstr(compst, tt).i.code == ITestSet && + cs_equal(cs, getinstr(compst, tt + 2).buff)) + addinstruction(compst, IAny, 0); + else { + addinstruction(compst, ISet, 0); + addcharset(compst, cs); + } + break; + } + default: addinstruction(compst, op, c); break; + } +} + + +/* +** code a test set, optimizing unit sets for ITestChar, "complete" +** sets for ITestAny, and empty sets for IJmp (always fails). +** 'e' is true iff test should accept the empty string. (Test +** instructions in the current VM never accept the empty string.) +*/ +static int codetestset (CompileState *compst, Charset *cs, int e) { + if (e) return NOINST; /* no test */ + else { + int c = 0; + Opcode op = charsettype(cs->cs, &c); + switch (op) { + case IFail: return addoffsetinst(compst, IJmp); /* always jump */ + case IAny: return addoffsetinst(compst, ITestAny); + case IChar: { + int i = addoffsetinst(compst, ITestChar); + getinstr(compst, i).i.aux = c; + return i; + } + case ISet: { + int i = addoffsetinst(compst, ITestSet); + addcharset(compst, cs->cs); + return i; + } + default: assert(0); return 0; + } + } +} + + +/* +** Find the final destination of a sequence of jumps +*/ +static int finaltarget (Instruction *code, int i) { + while (code[i].i.code == IJmp) + i = target(code, i); + return i; +} + + +/* +** final label (after traversing any jumps) +*/ +static int finallabel (Instruction *code, int i) { + return finaltarget(code, target(code, i)); +} + + +/* +** == behind n;

(where n = fixedlen(p)) +*/ +static void codebehind (CompileState *compst, TTree *tree) { + if (tree->u.n > 0) + addinstruction(compst, IBehind, tree->u.n); + codegen(compst, sib1(tree), 0, NOINST, fullset); +} + + +/* +** Choice; optimizations: +** - when p1 is headfail or +** when first(p1) and first(p2) are disjoint, than +** a character not in first(p1) cannot go to p1, and a character +** in first(p1) cannot go to p2 (at it is not in first(p2)). +** (The optimization is not valid if p1 accepts the empty string, +** as then there is no character at all...) +** - when p2 is empty and opt is true; a IPartialCommit can reuse +** the Choice already active in the stack. +*/ +static void codechoice (CompileState *compst, TTree *p1, TTree *p2, int opt, + const Charset *fl) { + int emptyp2 = (p2->tag == TTrue); + Charset cs1, cs2; + int e1 = getfirst(p1, fullset, &cs1); + if (headfail(p1) || + (!e1 && (getfirst(p2, fl, &cs2), cs_disjoint(&cs1, &cs2)))) { + /* == test (fail(p1)) -> L1 ; p1 ; jmp L2; L1: p2; L2: */ + int test = codetestset(compst, &cs1, 0); + int jmp = NOINST; + codegen(compst, p1, 0, test, fl); + if (!emptyp2) + jmp = addoffsetinst(compst, IJmp); + jumptohere(compst, test); + codegen(compst, p2, opt, NOINST, fl); + jumptohere(compst, jmp); + } + else if (opt && emptyp2) { + /* p1? == IPartialCommit; p1 */ + jumptohere(compst, addoffsetinst(compst, IPartialCommit)); + codegen(compst, p1, 1, NOINST, fullset); + } + else { + /* == + test(first(p1)) -> L1; choice L1; ; commit L2; L1: ; L2: */ + int pcommit; + int test = codetestset(compst, &cs1, e1); + int pchoice = addoffsetinst(compst, IChoice); + codegen(compst, p1, emptyp2, test, fullset); + pcommit = addoffsetinst(compst, ICommit); + jumptohere(compst, pchoice); + jumptohere(compst, test); + codegen(compst, p2, opt, NOINST, fl); + jumptohere(compst, pcommit); + } +} + + +/* +** And predicate +** optimization: fixedlen(p) = n ==> <&p> ==

; behind n +** (valid only when 'p' has no captures) +*/ +static void codeand (CompileState *compst, TTree *tree, int tt) { + int n = fixedlen(tree); + if (n >= 0 && n <= MAXBEHIND && !hascaptures(tree)) { + codegen(compst, tree, 0, tt, fullset); + if (n > 0) + addinstruction(compst, IBehind, n); + } + else { /* default: Choice L1; p1; BackCommit L2; L1: Fail; L2: */ + int pcommit; + int pchoice = addoffsetinst(compst, IChoice); + codegen(compst, tree, 0, tt, fullset); + pcommit = addoffsetinst(compst, IBackCommit); + jumptohere(compst, pchoice); + addinstruction(compst, IFail, 0); + jumptohere(compst, pcommit); + } +} + + +/* +** Captures: if pattern has fixed (and not too big) length, and it +** has no nested captures, use a single IFullCapture instruction +** after the match; otherwise, enclose the pattern with OpenCapture - +** CloseCapture. +*/ +static void codecapture (CompileState *compst, TTree *tree, int tt, + const Charset *fl) { + int len = fixedlen(sib1(tree)); + if (len >= 0 && len <= MAXOFF && !hascaptures(sib1(tree))) { + codegen(compst, sib1(tree), 0, tt, fl); + addinstcap(compst, IFullCapture, tree->cap, tree->key, len); + } + else { + addinstcap(compst, IOpenCapture, tree->cap, tree->key, 0); + codegen(compst, sib1(tree), 0, tt, fl); + addinstcap(compst, ICloseCapture, Cclose, 0, 0); + } +} + + +static void coderuntime (CompileState *compst, TTree *tree, int tt) { + addinstcap(compst, IOpenCapture, Cgroup, tree->key, 0); + codegen(compst, sib1(tree), 0, tt, fullset); + addinstcap(compst, ICloseRunTime, Cclose, 0, 0); +} + + +/* +** Repetion; optimizations: +** When pattern is a charset, can use special instruction ISpan. +** When pattern is head fail, or if it starts with characters that +** are disjoint from what follows the repetions, a simple test +** is enough (a fail inside the repetition would backtrack to fail +** again in the following pattern, so there is no need for a choice). +** When 'opt' is true, the repetion can reuse the Choice already +** active in the stack. +*/ +static void coderep (CompileState *compst, TTree *tree, int opt, + const Charset *fl) { + Charset st; + if (tocharset(tree, &st)) { + addinstruction(compst, ISpan, 0); + addcharset(compst, st.cs); + } + else { + int e1 = getfirst(tree, fullset, &st); + if (headfail(tree) || (!e1 && cs_disjoint(&st, fl))) { + /* L1: test (fail(p1)) -> L2;

; jmp L1; L2: */ + int jmp; + int test = codetestset(compst, &st, 0); + codegen(compst, tree, 0, test, fullset); + jmp = addoffsetinst(compst, IJmp); + jumptohere(compst, test); + jumptothere(compst, jmp, test); + } + else { + /* test(fail(p1)) -> L2; choice L2; L1:

; partialcommit L1; L2: */ + /* or (if 'opt'): partialcommit L1; L1:

; partialcommit L1; */ + int commit, l2; + int test = codetestset(compst, &st, e1); + int pchoice = NOINST; + if (opt) + jumptohere(compst, addoffsetinst(compst, IPartialCommit)); + else + pchoice = addoffsetinst(compst, IChoice); + l2 = gethere(compst); + codegen(compst, tree, 0, NOINST, fullset); + commit = addoffsetinst(compst, IPartialCommit); + jumptothere(compst, commit, l2); + jumptohere(compst, pchoice); + jumptohere(compst, test); + } + } +} + + +/* +** Not predicate; optimizations: +** In any case, if first test fails, 'not' succeeds, so it can jump to +** the end. If pattern is headfail, that is all (it cannot fail +** in other parts); this case includes 'not' of simple sets. Otherwise, +** use the default code (a choice plus a failtwice). +*/ +static void codenot (CompileState *compst, TTree *tree) { + Charset st; + int e = getfirst(tree, fullset, &st); + int test = codetestset(compst, &st, e); + if (headfail(tree)) /* test (fail(p1)) -> L1; fail; L1: */ + addinstruction(compst, IFail, 0); + else { + /* test(fail(p))-> L1; choice L1;

; failtwice; L1: */ + int pchoice = addoffsetinst(compst, IChoice); + codegen(compst, tree, 0, NOINST, fullset); + addinstruction(compst, IFailTwice, 0); + jumptohere(compst, pchoice); + } + jumptohere(compst, test); +} + + +/* +** change open calls to calls, using list 'positions' to find +** correct offsets; also optimize tail calls +*/ +static void correctcalls (CompileState *compst, int *positions, + int from, int to) { + int i; + Instruction *code = compst->p->code; + for (i = from; i < to; i += sizei(&code[i])) { + if (code[i].i.code == IOpenCall) { + int n = code[i].i.key; /* rule number */ + int rule = positions[n]; /* rule position */ + assert(rule == from || code[rule - 1].i.code == IRet); + if (code[finaltarget(code, i + 2)].i.code == IRet) /* call; ret ? */ + code[i].i.code = IJmp; /* tail call */ + else + code[i].i.code = ICall; + jumptothere(compst, i, rule); /* call jumps to respective rule */ + } + } + assert(i == to); +} + + +/* +** Code for a grammar: +** call L1; jmp L2; L1: rule 1; ret; rule 2; ret; ...; L2: +*/ +static void codegrammar (CompileState *compst, TTree *grammar) { + int positions[MAXRULES]; + int rulenumber = 0; + TTree *rule; + int firstcall = addoffsetinst(compst, ICall); /* call initial rule */ + int jumptoend = addoffsetinst(compst, IJmp); /* jump to the end */ + int start = gethere(compst); /* here starts the initial rule */ + jumptohere(compst, firstcall); + for (rule = sib1(grammar); rule->tag == TRule; rule = sib2(rule)) { + positions[rulenumber++] = gethere(compst); /* save rule position */ + codegen(compst, sib1(rule), 0, NOINST, fullset); /* code rule */ + addinstruction(compst, IRet, 0); + } + assert(rule->tag == TTrue); + jumptohere(compst, jumptoend); + correctcalls(compst, positions, start, gethere(compst)); +} + + +static void codecall (CompileState *compst, TTree *call) { + int c = addoffsetinst(compst, IOpenCall); /* to be corrected later */ + getinstr(compst, c).i.key = sib2(call)->cap; /* rule number */ + assert(sib2(call)->tag == TRule); +} + + +/* +** Code first child of a sequence +** (second child is called in-place to allow tail call) +** Return 'tt' for second child +*/ +static int codeseq1 (CompileState *compst, TTree *p1, TTree *p2, + int tt, const Charset *fl) { + if (needfollow(p1)) { + Charset fl1; + getfirst(p2, fl, &fl1); /* p1 follow is p2 first */ + codegen(compst, p1, 0, tt, &fl1); + } + else /* use 'fullset' as follow */ + codegen(compst, p1, 0, tt, fullset); + if (fixedlen(p1) != 0) /* can 'p1' consume anything? */ + return NOINST; /* invalidate test */ + else return tt; /* else 'tt' still protects sib2 */ +} + + +/* +** Main code-generation function: dispatch to auxiliar functions +** according to kind of tree. ('needfollow' should return true +** only for consructions that use 'fl'.) +*/ +static void codegen (CompileState *compst, TTree *tree, int opt, int tt, + const Charset *fl) { + tailcall: + switch (tree->tag) { + case TChar: codechar(compst, tree->u.n, tt); break; + case TAny: addinstruction(compst, IAny, 0); break; + case TSet: codecharset(compst, treebuffer(tree), tt); break; + case TTrue: break; + case TFalse: addinstruction(compst, IFail, 0); break; + case TChoice: codechoice(compst, sib1(tree), sib2(tree), opt, fl); break; + case TRep: coderep(compst, sib1(tree), opt, fl); break; + case TBehind: codebehind(compst, tree); break; + case TNot: codenot(compst, sib1(tree)); break; + case TAnd: codeand(compst, sib1(tree), tt); break; + case TCapture: codecapture(compst, tree, tt, fl); break; + case TRunTime: coderuntime(compst, tree, tt); break; + case TGrammar: codegrammar(compst, tree); break; + case TCall: codecall(compst, tree); break; + case TSeq: { + tt = codeseq1(compst, sib1(tree), sib2(tree), tt, fl); /* code 'p1' */ + /* codegen(compst, p2, opt, tt, fl); */ + tree = sib2(tree); goto tailcall; + } + default: assert(0); + } +} + + +/* +** Optimize jumps and other jump-like instructions. +** * Update labels of instructions with labels to their final +** destinations (e.g., choice L1; ... L1: jmp L2: becomes +** choice L2) +** * Jumps to other instructions that do jumps become those +** instructions (e.g., jump to return becomes a return; jump +** to commit becomes a commit) +*/ +static void peephole (CompileState *compst) { + Instruction *code = compst->p->code; + int i; + for (i = 0; i < compst->ncode; i += sizei(&code[i])) { + redo: + switch (code[i].i.code) { + case IChoice: case ICall: case ICommit: case IPartialCommit: + case IBackCommit: case ITestChar: case ITestSet: + case ITestAny: { /* instructions with labels */ + jumptothere(compst, i, finallabel(code, i)); /* optimize label */ + break; + } + case IJmp: { + int ft = finaltarget(code, i); + switch (code[ft].i.code) { /* jumping to what? */ + case IRet: case IFail: case IFailTwice: + case IEnd: { /* instructions with unconditional implicit jumps */ + code[i] = code[ft]; /* jump becomes that instruction */ + code[i + 1].i.code = IAny; /* 'no-op' for target position */ + break; + } + case ICommit: case IPartialCommit: + case IBackCommit: { /* inst. with unconditional explicit jumps */ + int fft = finallabel(code, ft); + code[i] = code[ft]; /* jump becomes that instruction... */ + jumptothere(compst, i, fft); /* but must correct its offset */ + goto redo; /* reoptimize its label */ + } + default: { + jumptothere(compst, i, ft); /* optimize label */ + break; + } + } + break; + } + default: break; + } + } + assert(code[i - 1].i.code == IEnd); +} + + +/* +** Compile a pattern +*/ +Instruction *compile (lua_State *L, Pattern *p) { + CompileState compst; + compst.p = p; compst.ncode = 0; compst.L = L; + realloccode(L, p, 2); /* minimum initial size */ + codegen(&compst, p->tree, 0, NOINST, fullset); + addinstruction(&compst, IEnd, 0); + realloccode(L, p, compst.ncode); /* set final size */ + peephole(&compst); + return p->code; +} + + +/* }====================================================== */ + diff --git a/src/ext/lpeg/lpcode.h b/src/ext/lpeg/lpcode.h new file mode 100644 index 0000000..2a5861e --- /dev/null +++ b/src/ext/lpeg/lpcode.h @@ -0,0 +1,40 @@ +/* +** $Id: lpcode.h,v 1.8 2016/09/15 17:46:13 roberto Exp $ +*/ + +#if !defined(lpcode_h) +#define lpcode_h + +#include "lua.h" + +#include "lptypes.h" +#include "lptree.h" +#include "lpvm.h" + +int tocharset (TTree *tree, Charset *cs); +int checkaux (TTree *tree, int pred); +int fixedlen (TTree *tree); +int hascaptures (TTree *tree); +int lp_gc (lua_State *L); +Instruction *compile (lua_State *L, Pattern *p); +void realloccode (lua_State *L, Pattern *p, int nsize); +int sizei (const Instruction *i); + + +#define PEnullable 0 +#define PEnofail 1 + +/* +** nofail(t) implies that 't' cannot fail with any input +*/ +#define nofail(t) checkaux(t, PEnofail) + +/* +** (not nullable(t)) implies 't' cannot match without consuming +** something +*/ +#define nullable(t) checkaux(t, PEnullable) + + + +#endif diff --git a/src/ext/lpeg/lpeg-128.gif b/src/ext/lpeg/lpeg-128.gif new file mode 100644 index 0000000000000000000000000000000000000000..bbf5e78b88e71044bd341a2476dcec756c252ae4 GIT binary patch literal 4923 zcmV-B6U6LCNk%w1VSoUD0O$Vz004pj0D}Plga8JE00)Es2!#L%h6@FN4hDe`2Z9p_ zgBS~j4iktF6p0fRixn1(7#EEV9grItj}#!2B_EC`A(0;@k|8FOBPWz3D3vEFmoh4o zEH0HaE0iuTmM}1wGBTPiH=Q&zn>I9^I5?R*I-5N^oj*OFK|Y~7LZduGq((%cN=Kwi zNTp0krbkVwPfn#$P^VK-s8Cm}SXil9S*luEt6N*FU0kkTUa(+av0-4cV`8*qWwB*u zvSw$qXK1u&X|-!>xNL2?b8fYCZ?|u9xN>&8et>;?c)EXqet?31e0sZqgMfW|yn=** zetf-yg@S~JgMfa&g@=TJf53)_g@S;>eS^Y?iiduL#EFZDij0Ybg};S{!G?#zjgO3u zkd26m#E+4Vk&}>%jK`9ck&TVXl$Mi{j>MIhl$My4l#s@knULz#>wW(&B@2g=gP|A&Ckos%;V0`%+1Z^ z&(Y4%($CP(=h4&9(a`A9)X~$`($doD)YjA0*VNS1>(Dk-Y+}P^d z-P+yR>)qen-`nlp;N9Qh-r?Qv+~Dxw;@{oj^5Nv*gVX`^Xcp8>g?(2>-FpI>g??I?eFXD?fCBS?e6dS^6&KU^6&HU_3`uY z^7Qfb^Y`=h^7i!j_4oAn_4@ht`}g_w`uF_!`uF?z{rUU&{QCa-{Q39&|NH&=`2PR- z|Ns2|`~Ls@A^sIZa%Ew3Wn>_CX>@2HRA^-&M@dak04x9i004jhfB*mp{s8|897wRB z!Gj1BDqP60p~Hs|BTAe|v7*I`7&B_z$g!ixk03*e97(dI$&)Bks$9vkrAvX8`q*is z=H9?{l}7E#R?lNQWVLv4P~jVh?hctKNEB_v)Q`AA0{He1AFMltl|F;_};VY9{( zl?>#^V#;+GAcx6`MS^o5VWY+-eH4NZB-lh!2s*%^Q%D=Wpb`p8b@@WsJ{k14+ee}#bO3Jby=YfFj5SY4>pSIg$+7yAlZ_3|IG4ZYXE{+9&0^>gij@c?1K+BVN7L+ zBE8Uo%|6)(0!cRW6;xLjK86S4czYfI0f`*plLsohjDgNS_Sl1qC+O^h7nVis14DTp z_UB)a2aE$!UFv{w2Rn||WsjqsJvIKEv42Q>pl%atELe@uN%~I9w1O zV#cKwqF5Br5C<|5?UMqf)N;9JfW`!>inerBLQX{d@Zv-T1n>!#kN^gl*Z>n8VQoY3 z(2(b=$RW((0nJ{E&n0%)GnFrb*aOi!dZ>EwmKPuA-(diH5QjPo?Sn_Ot)`4{2&4u| zi5>X#vc@|Pm2*V{Hk> zUea4{!V6#lG+X(=U@xX82l*yQ_Mq|()&=on3I1y8a!FpoM(dfd;ENeIK_h^&Gs~rB ze+Z%xxoV7eLF3GDu*IvosxqH_3Q1lJ+7QIgKJX+#E#Y5=#eg~BqQnUWDt8{D=Uq;V z^2y3UfJs5~SVO|AMgOh9Et8kT&lkZ@Og7=1v+8-hPU!OwKSbrD&ZN!j>DnnC<>!z* zG)ODE?Sb1ae)*}z&<;WKbYq0zw1-6kG&yU8&KTf0qz(!|(!R3e%v)Z)l0!8SIgKdz zAl)pm2fqVY01xT-NFUTtyn%Ql1cS4g-;P#54oZsx9Dr8riUWWSgl-_DQ%5fFKm!tx zfCDf{!YSZkrq)%Y7fYDXLewFG+4=8p{#C;r5dUL@Q)B}kNUR4m@IZpgsfz)P9XKO zA#YG24~Fam2W9i0hvI+ zxvPeBB zAhiK7fs~m51VyG|reVGVb`)i)4;g1Z07-MK`%smD0KkAzBE*$Xh(}27fd_t)5lAb= zC^LOTii^Mlc7p^!92R1aC+uQV`1oDm{#LVS<>`m4S?pN58{!Sg z3Yf$bC|+;1>5vMx59`A$vQ)O9%0#-0JDTX}~|6we0k1su%wk%kT$H+31jtRN*_Gd{y0P{w};~v8-U2rC7tb7gCRr(~?620NOrIO7IL?U>1y))@n!)idpEHy-C1^JMKXa zE4pJFr&z(+hC72{ZsH9b*@q9@`@D5u+{v`$06C_3G!Cr*i|QQ@19qX0hr`1izpV0j zPP|P@TxeV;(fuIL1lvk9=ffYQ>oWzl|X^V`!%xrtkwFG@+?~+yfjw zHuiELw@pP90wUt@CPM1_JgNae9izK?pQ=oPeC*>MdItauY=R9R(O@8qfB<5`Qh+1~ zgda0Ya?!frJ+@>5I7Q9`)Dr@aLlii6c85oBQ*TBQ)Fm4nUd+AgF;hJ;f+)zCy?_S( z;doLsSY?EmoMwuQ3oA@+AGKJr=LTT)=u`Uue8)#VHhzG6@1ymtIR!uTfraKOMhMfh zIIR*mZ2u(ObWz8~bogYPA~dClKvcHtz?2Q;`qwWr6>2f4&w-sqtZ1;(;GR0eUnM_^=7# zM10|MJB4F+CN+Q*2LOEc58E($#$kU9CPc2m03<*IUocSh;C(z89!;kZ*$@j4;TgrV z55_k@E(m(;CW9{rg*7-p^_PQThbc6G2ea@Ek`x>J@MQyb01$Q%_%I4)0C)Zip%1vA zgi8oCcn5(Qb^sy}g6MEQFn|yD01f>E1m3U@Y?MF((GH~WU0)c6%AsF(B?y;yEuE1M zEJ$^q7kc8uJTw@BWn_Q&w;V~}4+HTGeqaPJkO9P@0|N17pR^bjuu*K01=P_!zeIP4 z0W|IqRo*sPlQ<8R7(_J?g3xd=9$*>BpaL00jPO7Tx1=0jr4ZPFQxl;G`gRbblYyQ0 zc{$i?Bv>{h5`wi<9JZ%}uJLN;z*VV1DL7D!9P}S_h7j@q49!MLJEIW$;0`8pe0DdD zU6(unFp2qK4*e7s10V|ASRT@}Pw@tVv_&!H;d~VoaoZFOO%PHCq5g*RxEk#TSemmK zBa%Q|g)|f(Nv#(i(!>a~6SPm)1`ScBfzkz>&gLs$`YK?#|FHKjxpQPp|th7G4sm!v=nr*M~extB?J z0Njv+>M(I)!2p(k4~?f@hY?$|!U%gp0&jJHwImiv(0d0l4U`FDEa`aG5S1xocbsK< z^?*d82@gdF59y$l0B{5XQ4ilR3vmz!mN0bsa02qj4B~+R&rnMBunm@g2aEs>Q$Y`u z_795F1yLCw3=j@NrUc0#5~WmMmm?%@*bw_b1ph$@cr*`4{zMexzybnLD+o0J9FQ5K zlY=>k572M|3g8Vn7@8Bg7`(R-qre4-C=uPjHw~x)#liyo*`NODpApa{H}DGUG!Alr zSS+vtN~bUYumESEL-?QzMer2^-~#OV59$Cd0n(EYH3t4A64g_d6?J?tM{xb202?3z z6o3E#ggeCs02AN=9l!uDDi#Uw0UW@YV!;Rvi5I)@1UYpP!=MJ2Nf7lQ4$}s6GdYi{ z<%0f4b%#+LsODL$@uDtB0|_!rOH!Y@p$<=gPj)gr`OpnJ*=;1cY7nSWYUdiomRN(u z04c~8T*?-ZUwjM}J67H^OmHO8{QqTiMR`O^^SKnEsRUcs;gzMv3`G&<>E zVI4_6bT~CA1*rc41L`1K@@G?*UU>__s+(~MR1l}8G7$d24&Cs8(JHh4>Q{8t1@GV}i_}V` z&;-B$Q!CL74RIEy_YKAwi5InW3>X$&@C_1y4TdT*EOCEJun7@?5AZ+@Ohjtt%33r- z0(fw&)HAnpiVQD z;$RFcG*N(8HXeru&`=Ne@VT=^s-{vQ$50WT7@A154YR-qX6HyFSpawx2y#yf+&3m>5 zA-ff!3QZ6RZDzhq@dj$(3>ty6M2HT);9~NN7w7;8OP~mLB)>h|zjhJ~9vZ#2V1@>~ zQ1(!*v*`|<=<;JBG7!h4Gs!GHu(kO!Hdc@Q%Vv4j_w z013X}F-EME>aYut@CLcS8Iw>5ia-gT;62^&23w#6N)QQmAu2e0#n9EWO@Ro4kOz0b t87`~}*f6ke{1|$%8+P%%7keDYf;`BCT*!ue$cUWCioD2-9K8Vn06SzG6$t + + + LPeg - Parsing Expression Grammars For Lua + + + + + + + +

+ +
+ +
LPeg
+
+ Parsing Expression Grammars For Lua, version 1.0 +
+
+ +
+ + + +
+ + +

Introduction

+ +

+LPeg is a new pattern-matching library for Lua, +based on + +Parsing Expression Grammars (PEGs). +This text is a reference manual for the library. +For a more formal treatment of LPeg, +as well as some discussion about its implementation, +see + +A Text Pattern-Matching Tool based on Parsing Expression Grammars. +(You may also be interested in my +talk about LPeg +given at the III Lua Workshop.) +

+ +

+Following the Snobol tradition, +LPeg defines patterns as first-class objects. +That is, patterns are regular Lua values +(represented by userdata). +The library offers several functions to create +and compose patterns. +With the use of metamethods, +several of these functions are provided as infix or prefix +operators. +On the one hand, +the result is usually much more verbose than the typical +encoding of patterns using the so called +regular expressions +(which typically are not regular expressions in the formal sense). +On the other hand, +first-class patterns allow much better documentation +(as it is easy to comment the code, +to break complex definitions in smaller parts, etc.) +and are extensible, +as we can define new functions to create and compose patterns. +

+ +

+For a quick glance of the library, +the following table summarizes its basic operations +for creating patterns: +

+ + + + + + + + + + + + + + + + + + + + + + + + + + +
OperatorDescription
lpeg.P(string)Matches string literally
lpeg.P(n)Matches exactly n characters
lpeg.S(string)Matches any character in string (Set)
lpeg.R("xy")Matches any character between x and y (Range)
patt^nMatches at least n repetitions of patt
patt^-nMatches at most n repetitions of patt
patt1 * patt2Matches patt1 followed by patt2
patt1 + patt2Matches patt1 or patt2 + (ordered choice)
patt1 - patt2Matches patt1 if patt2 does not match
-pattEquivalent to ("" - patt)
#pattMatches patt but consumes no input
lpeg.B(patt)Matches patt behind the current position, + consuming no input
+ +

As a very simple example, +lpeg.R("09")^1 creates a pattern that +matches a non-empty sequence of digits. +As a not so simple example, +-lpeg.P(1) +(which can be written as lpeg.P(-1), +or simply -1 for operations expecting a pattern) +matches an empty string only if it cannot match a single character; +so, it succeeds only at the end of the subject. +

+ +

+LPeg also offers the re module, +which implements patterns following a regular-expression style +(e.g., [09]+). +(This module is 260 lines of Lua code, +and of course it uses LPeg to parse regular expressions and +translate them to regular LPeg patterns.) +

+ + +

Functions

+ + +

lpeg.match (pattern, subject [, init])

+

+The matching function. +It attempts to match the given pattern against the subject string. +If the match succeeds, +returns the index in the subject of the first character after the match, +or the captured values +(if the pattern captured any value). +

+ +

+An optional numeric argument init makes the match +start at that position in the subject string. +As usual in Lua libraries, +a negative value counts from the end. +

+ +

+Unlike typical pattern-matching functions, +match works only in anchored mode; +that is, it tries to match the pattern with a prefix of +the given subject string (at position init), +not with an arbitrary substring of the subject. +So, if we want to find a pattern anywhere in a string, +we must either write a loop in Lua or write a pattern that +matches anywhere. +This second approach is easy and quite efficient; +see examples. +

+ +

lpeg.type (value)

+

+If the given value is a pattern, +returns the string "pattern". +Otherwise returns nil. +

+ +

lpeg.version ()

+

+Returns a string with the running version of LPeg. +

+ +

lpeg.setmaxstack (max)

+

+Sets a limit for the size of the backtrack stack used by LPeg to +track calls and choices. +(The default limit is 400.) +Most well-written patterns need little backtrack levels and +therefore you seldom need to change this limit; +before changing it you should try to rewrite your +pattern to avoid the need for extra space. +Nevertheless, a few useful patterns may overflow. +Also, with recursive grammars, +subjects with deep recursion may also need larger limits. +

+ + +

Basic Constructions

+ +

+The following operations build patterns. +All operations that expect a pattern as an argument +may receive also strings, tables, numbers, booleans, or functions, +which are translated to patterns according to +the rules of function lpeg.P. +

+ + + +

lpeg.P (value)

+

+Converts the given value into a proper pattern, +according to the following rules: +

+
    + +
  • +If the argument is a pattern, +it is returned unmodified. +

  • + +
  • +If the argument is a string, +it is translated to a pattern that matches the string literally. +

  • + +
  • +If the argument is a non-negative number n, +the result is a pattern that matches exactly n characters. +

  • + +
  • +If the argument is a negative number -n, +the result is a pattern that +succeeds only if the input string has less than n characters left: +lpeg.P(-n) +is equivalent to -lpeg.P(n) +(see the unary minus operation). +

  • + +
  • +If the argument is a boolean, +the result is a pattern that always succeeds or always fails +(according to the boolean value), +without consuming any input. +

  • + +
  • +If the argument is a table, +it is interpreted as a grammar +(see Grammars). +

  • + +
  • +If the argument is a function, +returns a pattern equivalent to a +match-time capture over the empty string. +

  • + +
+ + +

lpeg.B(patt)

+

+Returns a pattern that +matches only if the input string at the current position +is preceded by patt. +Pattern patt must match only strings +with some fixed length, +and it cannot contain captures. +

+ +

+Like the and predicate, +this pattern never consumes any input, +independently of success or failure. +

+ + +

lpeg.R ({range})

+

+Returns a pattern that matches any single character +belonging to one of the given ranges. +Each range is a string xy of length 2, +representing all characters with code +between the codes of x and y +(both inclusive). +

+ +

+As an example, the pattern +lpeg.R("09") matches any digit, +and lpeg.R("az", "AZ") matches any ASCII letter. +

+ + +

lpeg.S (string)

+

+Returns a pattern that matches any single character that +appears in the given string. +(The S stands for Set.) +

+ +

+As an example, the pattern +lpeg.S("+-*/") matches any arithmetic operator. +

+ +

+Note that, if s is a character +(that is, a string of length 1), +then lpeg.P(s) is equivalent to lpeg.S(s) +which is equivalent to lpeg.R(s..s). +Note also that both lpeg.S("") and lpeg.R() +are patterns that always fail. +

+ + +

lpeg.V (v)

+

+This operation creates a non-terminal (a variable) +for a grammar. +The created non-terminal refers to the rule indexed by v +in the enclosing grammar. +(See Grammars for details.) +

+ + +

lpeg.locale ([table])

+

+Returns a table with patterns for matching some character classes +according to the current locale. +The table has fields named +alnum, +alpha, +cntrl, +digit, +graph, +lower, +print, +punct, +space, +upper, and +xdigit, +each one containing a correspondent pattern. +Each pattern matches any single character that belongs to its class. +

+ +

+If called with an argument table, +then it creates those fields inside the given table and +returns that table. +

+ + +

#patt

+

+Returns a pattern that +matches only if the input string matches patt, +but without consuming any input, +independently of success or failure. +(This pattern is called an and predicate +and it is equivalent to +&patt in the original PEG notation.) +

+ + +

+This pattern never produces any capture. +

+ + +

-patt

+

+Returns a pattern that +matches only if the input string does not match patt. +It does not consume any input, +independently of success or failure. +(This pattern is equivalent to +!patt in the original PEG notation.) +

+ +

+As an example, the pattern +-lpeg.P(1) matches only the end of string. +

+ +

+This pattern never produces any captures, +because either patt fails +or -patt fails. +(A failing pattern never produces captures.) +

+ + +

patt1 + patt2

+

+Returns a pattern equivalent to an ordered choice +of patt1 and patt2. +(This is denoted by patt1 / patt2 in the original PEG notation, +not to be confused with the / operation in LPeg.) +It matches either patt1 or patt2, +with no backtracking once one of them succeeds. +The identity element for this operation is the pattern +lpeg.P(false), +which always fails. +

+ +

+If both patt1 and patt2 are +character sets, +this operation is equivalent to set union. +

+
+lower = lpeg.R("az")
+upper = lpeg.R("AZ")
+letter = lower + upper
+
+ + +

patt1 - patt2

+

+Returns a pattern equivalent to !patt2 patt1. +This pattern asserts that the input does not match +patt2 and then matches patt1. +

+ +

+When successful, +this pattern produces all captures from patt1. +It never produces any capture from patt2 +(as either patt2 fails or +patt1 - patt2 fails). +

+ +

+If both patt1 and patt2 are +character sets, +this operation is equivalent to set difference. +Note that -patt is equivalent to "" - patt +(or 0 - patt). +If patt is a character set, +1 - patt is its complement. +

+ + +

patt1 * patt2

+

+Returns a pattern that matches patt1 +and then matches patt2, +starting where patt1 finished. +The identity element for this operation is the +pattern lpeg.P(true), +which always succeeds. +

+ +

+(LPeg uses the * operator +[instead of the more obvious ..] +both because it has +the right priority and because in formal languages it is +common to use a dot for denoting concatenation.) +

+ + +

patt^n

+

+If n is nonnegative, +this pattern is +equivalent to pattn patt*: +It matches n or more occurrences of patt. +

+ +

+Otherwise, when n is negative, +this pattern is equivalent to (patt?)-n: +It matches at most |n| +occurrences of patt. +

+ +

+In particular, patt^0 is equivalent to patt*, +patt^1 is equivalent to patt+, +and patt^-1 is equivalent to patt? +in the original PEG notation. +

+ +

+In all cases, +the resulting pattern is greedy with no backtracking +(also called a possessive repetition). +That is, it matches only the longest possible sequence +of matches for patt. +

+ + + +

Grammars

+ +

+With the use of Lua variables, +it is possible to define patterns incrementally, +with each new pattern using previously defined ones. +However, this technique does not allow the definition of +recursive patterns. +For recursive patterns, +we need real grammars. +

+ +

+LPeg represents grammars with tables, +where each entry is a rule. +

+ +

+The call lpeg.V(v) +creates a pattern that represents the nonterminal +(or variable) with index v in a grammar. +Because the grammar still does not exist when +this function is evaluated, +the result is an open reference to the respective rule. +

+ +

+A table is fixed when it is converted to a pattern +(either by calling lpeg.P or by using it wherein a +pattern is expected). +Then every open reference created by lpeg.V(v) +is corrected to refer to the rule indexed by v in the table. +

+ +

+When a table is fixed, +the result is a pattern that matches its initial rule. +The entry with index 1 in the table defines its initial rule. +If that entry is a string, +it is assumed to be the name of the initial rule. +Otherwise, LPeg assumes that the entry 1 itself is the initial rule. +

+ +

+As an example, +the following grammar matches strings of a's and b's that +have the same number of a's and b's: +

+
+equalcount = lpeg.P{
+  "S";   -- initial rule name
+  S = "a" * lpeg.V"B" + "b" * lpeg.V"A" + "",
+  A = "a" * lpeg.V"S" + "b" * lpeg.V"A" * lpeg.V"A",
+  B = "b" * lpeg.V"S" + "a" * lpeg.V"B" * lpeg.V"B",
+} * -1
+
+

+It is equivalent to the following grammar in standard PEG notation: +

+
+  S <- 'a' B / 'b' A / ''
+  A <- 'a' S / 'b' A A
+  B <- 'b' S / 'a' B B
+
+ + +

Captures

+ +

+A capture is a pattern that produces values +(the so called semantic information) +according to what it matches. +LPeg offers several kinds of captures, +which produces values based on matches and combine these values to +produce new values. +Each capture may produce zero or more values. +

+ +

+The following table summarizes the basic captures: +

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
OperationWhat it Produces
lpeg.C(patt)the match for patt plus all captures + made by patt
lpeg.Carg(n)the value of the nth extra argument to + lpeg.match (matches the empty string)
lpeg.Cb(name)the values produced by the previous + group capture named name + (matches the empty string)
lpeg.Cc(values)the given values (matches the empty string)
lpeg.Cf(patt, func)a folding of the captures from patt
lpeg.Cg(patt [, name])the values produced by patt, + optionally tagged with name
lpeg.Cp()the current position (matches the empty string)
lpeg.Cs(patt)the match for patt + with the values from nested captures replacing their matches
lpeg.Ct(patt)a table with all captures from patt
patt / stringstring, with some marks replaced by captures + of patt
patt / numberthe n-th value captured by patt, +or no value when number is zero.
patt / tabletable[c], where c is the (first) + capture of patt
patt / functionthe returns of function applied to the captures + of patt
lpeg.Cmt(patt, function)the returns of function applied to the captures + of patt; the application is done at match time
+ +

+A capture pattern produces its values only when it succeeds. +For instance, +the pattern lpeg.C(lpeg.P"a"^-1) +produces the empty string when there is no "a" +(because the pattern "a"? succeeds), +while the pattern lpeg.C("a")^-1 +does not produce any value when there is no "a" +(because the pattern "a" fails). +A pattern inside a loop or inside a recursive structure +produces values for each match. +

+ +

+Usually, +LPeg does not specify when (and if) it evaluates its captures. +(As an example, +consider the pattern lpeg.P"a" / func / 0. +Because the "division" by 0 instructs LPeg to throw away the +results from the pattern, +LPeg may or may not call func.) +Therefore, captures should avoid side effects. +Moreover, +most captures cannot affect the way a pattern matches a subject. +The only exception to this rule is the +so-called match-time capture. +When a match-time capture matches, +it forces the immediate evaluation of all its nested captures +and then calls its corresponding function, +which defines whether the match succeeds and also +what values are produced. +

+ +

lpeg.C (patt)

+

+Creates a simple capture, +which captures the substring of the subject that matches patt. +The captured value is a string. +If patt has other captures, +their values are returned after this one. +

+ + +

lpeg.Carg (n)

+

+Creates an argument capture. +This pattern matches the empty string and +produces the value given as the nth extra +argument given in the call to lpeg.match. +

+ + +

lpeg.Cb (name)

+

+Creates a back capture. +This pattern matches the empty string and +produces the values produced by the most recent +group capture named name +(where name can be any Lua value). +

+ +

+Most recent means the last +complete +outermost +group capture with the given name. +A Complete capture means that the entire pattern +corresponding to the capture has matched. +An Outermost capture means that the capture is not inside +another complete capture. +

+ +

+In the same way that LPeg does not specify when it evaluates captures, +it does not specify whether it reuses +values previously produced by the group +or re-evaluates them. +

+ +

lpeg.Cc ([value, ...])

+

+Creates a constant capture. +This pattern matches the empty string and +produces all given values as its captured values. +

+ + +

lpeg.Cf (patt, func)

+

+Creates a fold capture. +If patt produces a list of captures +C1 C2 ... Cn, +this capture will produce the value +func(...func(func(C1, C2), C3)..., + Cn), +that is, it will fold +(or accumulate, or reduce) +the captures from patt using function func. +

+ +

+This capture assumes that patt should produce +at least one capture with at least one value (of any type), +which becomes the initial value of an accumulator. +(If you need a specific initial value, +you may prefix a constant capture to patt.) +For each subsequent capture, +LPeg calls func +with this accumulator as the first argument and all values produced +by the capture as extra arguments; +the first result from this call +becomes the new value for the accumulator. +The final value of the accumulator becomes the captured value. +

+ +

+As an example, +the following pattern matches a list of numbers separated +by commas and returns their addition: +

+
+-- matches a numeral and captures its numerical value
+number = lpeg.R"09"^1 / tonumber
+
+-- matches a list of numbers, capturing their values
+list = number * ("," * number)^0
+
+-- auxiliary function to add two numbers
+function add (acc, newvalue) return acc + newvalue end
+
+-- folds the list of numbers adding them
+sum = lpeg.Cf(list, add)
+
+-- example of use
+print(sum:match("10,30,43"))   --> 83
+
+ + +

lpeg.Cg (patt [, name])

+

+Creates a group capture. +It groups all values returned by patt +into a single capture. +The group may be anonymous (if no name is given) +or named with the given name +(which can be any non-nil Lua value). +

+ +

+An anonymous group serves to join values from several captures into +a single capture. +A named group has a different behavior. +In most situations, a named group returns no values at all. +Its values are only relevant for a following +back capture or when used +inside a table capture. +

+ + +

lpeg.Cp ()

+

+Creates a position capture. +It matches the empty string and +captures the position in the subject where the match occurs. +The captured value is a number. +

+ + +

lpeg.Cs (patt)

+

+Creates a substitution capture, +which captures the substring of the subject that matches patt, +with substitutions. +For any capture inside patt with a value, +the substring that matched the capture is replaced by the capture value +(which should be a string). +The final captured value is the string resulting from +all replacements. +

+ + +

lpeg.Ct (patt)

+

+Creates a table capture. +This capture returns a table with all values from all anonymous captures +made by patt inside this table in successive integer keys, +starting at 1. +Moreover, +for each named capture group created by patt, +the first value of the group is put into the table +with the group name as its key. +The captured value is only the table. +

+ + +

patt / string

+

+Creates a string capture. +It creates a capture string based on string. +The captured value is a copy of string, +except that the character % works as an escape character: +any sequence in string of the form %n, +with n between 1 and 9, +stands for the match of the n-th capture in patt. +The sequence %0 stands for the whole match. +The sequence %% stands for a single %. +

+ + +

patt / number

+

+Creates a numbered capture. +For a non-zero number, +the captured value is the n-th value +captured by patt. +When number is zero, +there are no captured values. +

+ + +

patt / table

+

+Creates a query capture. +It indexes the given table using as key the first value captured by +patt, +or the whole match if patt produced no value. +The value at that index is the final value of the capture. +If the table does not have that key, +there is no captured value. +

+ + +

patt / function

+

+Creates a function capture. +It calls the given function passing all captures made by +patt as arguments, +or the whole match if patt made no capture. +The values returned by the function +are the final values of the capture. +In particular, +if function returns no value, +there is no captured value. +

+ + +

lpeg.Cmt(patt, function)

+

+Creates a match-time capture. +Unlike all other captures, +this one is evaluated immediately when a match occurs +(even if it is part of a larger pattern that fails later). +It forces the immediate evaluation of all its nested captures +and then calls function. +

+ +

+The given function gets as arguments the entire subject, +the current position (after the match of patt), +plus any capture values produced by patt. +

+ +

+The first value returned by function +defines how the match happens. +If the call returns a number, +the match succeeds +and the returned number becomes the new current position. +(Assuming a subject s and current position i, +the returned number must be in the range [i, len(s) + 1].) +If the call returns true, +the match succeeds without consuming any input. +(So, to return true is equivalent to return i.) +If the call returns false, nil, or no value, +the match fails. +

+ +

+Any extra values returned by the function become the +values produced by the capture. +

+ + + + +

Some Examples

+ +

Using a Pattern

+

+This example shows a very simple but complete program +that builds and uses a pattern: +

+
+local lpeg = require "lpeg"
+
+-- matches a word followed by end-of-string
+p = lpeg.R"az"^1 * -1
+
+print(p:match("hello"))        --> 6
+print(lpeg.match(p, "hello"))  --> 6
+print(p:match("1 hello"))      --> nil
+
+

+The pattern is simply a sequence of one or more lower-case letters +followed by the end of string (-1). +The program calls match both as a method +and as a function. +In both sucessful cases, +the match returns +the index of the first character after the match, +which is the string length plus one. +

+ + +

Name-value lists

+

+This example parses a list of name-value pairs and returns a table +with those pairs: +

+
+lpeg.locale(lpeg)   -- adds locale entries into 'lpeg' table
+
+local space = lpeg.space^0
+local name = lpeg.C(lpeg.alpha^1) * space
+local sep = lpeg.S(",;") * space
+local pair = lpeg.Cg(name * "=" * space * name) * sep^-1
+local list = lpeg.Cf(lpeg.Ct("") * pair^0, rawset)
+t = list:match("a=b, c = hi; next = pi")  --> { a = "b", c = "hi", next = "pi" }
+
+

+Each pair has the format name = name followed by +an optional separator (a comma or a semicolon). +The pair pattern encloses the pair in a group pattern, +so that the names become the values of a single capture. +The list pattern then folds these captures. +It starts with an empty table, +created by a table capture matching an empty string; +then for each capture (a pair of names) it applies rawset +over the accumulator (the table) and the capture values (the pair of names). +rawset returns the table itself, +so the accumulator is always the table. +

+ +

Splitting a string

+

+The following code builds a pattern that +splits a string using a given pattern +sep as a separator: +

+
+function split (s, sep)
+  sep = lpeg.P(sep)
+  local elem = lpeg.C((1 - sep)^0)
+  local p = elem * (sep * elem)^0
+  return lpeg.match(p, s)
+end
+
+

+First the function ensures that sep is a proper pattern. +The pattern elem is a repetition of zero of more +arbitrary characters as long as there is not a match against +the separator. +It also captures its match. +The pattern p matches a list of elements separated +by sep. +

+ +

+If the split results in too many values, +it may overflow the maximum number of values +that can be returned by a Lua function. +In this case, +we can collect these values in a table: +

+
+function split (s, sep)
+  sep = lpeg.P(sep)
+  local elem = lpeg.C((1 - sep)^0)
+  local p = lpeg.Ct(elem * (sep * elem)^0)   -- make a table capture
+  return lpeg.match(p, s)
+end
+
+ + +

Searching for a pattern

+

+The primitive match works only in anchored mode. +If we want to find a pattern anywhere in a string, +we must write a pattern that matches anywhere. +

+ +

+Because patterns are composable, +we can write a function that, +given any arbitrary pattern p, +returns a new pattern that searches for p +anywhere in a string. +There are several ways to do the search. +One way is like this: +

+
+function anywhere (p)
+  return lpeg.P{ p + 1 * lpeg.V(1) }
+end
+
+

+This grammar has a straight reading: +it matches p or skips one character and tries again. +

+ +

+If we want to know where the pattern is in the string +(instead of knowing only that it is there somewhere), +we can add position captures to the pattern: +

+
+local I = lpeg.Cp()
+function anywhere (p)
+  return lpeg.P{ I * p * I + 1 * lpeg.V(1) }
+end
+
+print(anywhere("world"):match("hello world!"))   -> 7   12
+
+ +

+Another option for the search is like this: +

+
+local I = lpeg.Cp()
+function anywhere (p)
+  return (1 - lpeg.P(p))^0 * I * p * I
+end
+
+

+Again the pattern has a straight reading: +it skips as many characters as possible while not matching p, +and then matches p (plus appropriate captures). +

+ +

+If we want to look for a pattern only at word boundaries, +we can use the following transformer: +

+ +
+local t = lpeg.locale()
+
+function atwordboundary (p)
+  return lpeg.P{
+    [1] = p + t.alpha^0 * (1 - t.alpha)^1 * lpeg.V(1)
+  }
+end
+
+ + +

Balanced parentheses

+

+The following pattern matches only strings with balanced parentheses: +

+
+b = lpeg.P{ "(" * ((1 - lpeg.S"()") + lpeg.V(1))^0 * ")" }
+
+

+Reading the first (and only) rule of the given grammar, +we have that a balanced string is +an open parenthesis, +followed by zero or more repetitions of either +a non-parenthesis character or +a balanced string (lpeg.V(1)), +followed by a closing parenthesis. +

+ + +

Global substitution

+

+The next example does a job somewhat similar to string.gsub. +It receives a pattern and a replacement value, +and substitutes the replacement value for all occurrences of the pattern +in a given string: +

+
+function gsub (s, patt, repl)
+  patt = lpeg.P(patt)
+  patt = lpeg.Cs((patt / repl + 1)^0)
+  return lpeg.match(patt, s)
+end
+
+

+As in string.gsub, +the replacement value can be a string, +a function, or a table. +

+ + +

Comma-Separated Values (CSV)

+

+This example breaks a string into comma-separated values, +returning all fields: +

+
+local field = '"' * lpeg.Cs(((lpeg.P(1) - '"') + lpeg.P'""' / '"')^0) * '"' +
+                    lpeg.C((1 - lpeg.S',\n"')^0)
+
+local record = field * (',' * field)^0 * (lpeg.P'\n' + -1)
+
+function csv (s)
+  return lpeg.match(record, s)
+end
+
+

+A field is either a quoted field +(which may contain any character except an individual quote, +which may be written as two quotes that are replaced by one) +or an unquoted field +(which cannot contain commas, newlines, or quotes). +A record is a list of fields separated by commas, +ending with a newline or the string end (-1). +

+ +

+As it is, +the previous pattern returns each field as a separated result. +If we add a table capture in the definition of record, +the pattern will return instead a single table +containing all fields: +

+
+local record = lpeg.Ct(field * (',' * field)^0) * (lpeg.P'\n' + -1)
+
+ + +

UTF-8 and Latin 1

+

+It is not difficult to use LPeg to convert a string from +UTF-8 encoding to Latin 1 (ISO 8859-1): +

+ +
+-- convert a two-byte UTF-8 sequence to a Latin 1 character
+local function f2 (s)
+  local c1, c2 = string.byte(s, 1, 2)
+  return string.char(c1 * 64 + c2 - 12416)
+end
+
+local utf8 = lpeg.R("\0\127")
+           + lpeg.R("\194\195") * lpeg.R("\128\191") / f2
+
+local decode_pattern = lpeg.Cs(utf8^0) * -1
+
+

+In this code, +the definition of UTF-8 is already restricted to the +Latin 1 range (from 0 to 255). +Any encoding outside this range (as well as any invalid encoding) +will not match that pattern. +

+ +

+As the definition of decode_pattern demands that +the pattern matches the whole input (because of the -1 at its end), +any invalid string will simply fail to match, +without any useful information about the problem. +We can improve this situation redefining decode_pattern +as follows: +

+
+local function er (_, i) error("invalid encoding at position " .. i) end
+
+local decode_pattern = lpeg.Cs(utf8^0) * (-1 + lpeg.P(er))
+
+

+Now, if the pattern utf8^0 stops +before the end of the string, +an appropriate error function is called. +

+ + +

UTF-8 and Unicode

+

+We can extend the previous patterns to handle all Unicode code points. +Of course, +we cannot translate them to Latin 1 or any other one-byte encoding. +Instead, our translation results in a array with the code points +represented as numbers. +The full code is here: +

+
+-- decode a two-byte UTF-8 sequence
+local function f2 (s)
+  local c1, c2 = string.byte(s, 1, 2)
+  return c1 * 64 + c2 - 12416
+end
+
+-- decode a three-byte UTF-8 sequence
+local function f3 (s)
+  local c1, c2, c3 = string.byte(s, 1, 3)
+  return (c1 * 64 + c2) * 64 + c3 - 925824
+end
+
+-- decode a four-byte UTF-8 sequence
+local function f4 (s)
+  local c1, c2, c3, c4 = string.byte(s, 1, 4)
+  return ((c1 * 64 + c2) * 64 + c3) * 64 + c4 - 63447168
+end
+
+local cont = lpeg.R("\128\191")   -- continuation byte
+
+local utf8 = lpeg.R("\0\127") / string.byte
+           + lpeg.R("\194\223") * cont / f2
+           + lpeg.R("\224\239") * cont * cont / f3
+           + lpeg.R("\240\244") * cont * cont * cont / f4
+
+local decode_pattern = lpeg.Ct(utf8^0) * -1
+
+ + +

Lua's long strings

+

+A long string in Lua starts with the pattern [=*[ +and ends at the first occurrence of ]=*] with +exactly the same number of equal signs. +If the opening brackets are followed by a newline, +this newline is discarded +(that is, it is not part of the string). +

+ +

+To match a long string in Lua, +the pattern must capture the first repetition of equal signs and then, +whenever it finds a candidate for closing the string, +check whether it has the same number of equal signs. +

+ +
+equals = lpeg.P"="^0
+open = "[" * lpeg.Cg(equals, "init") * "[" * lpeg.P"\n"^-1
+close = "]" * lpeg.C(equals) * "]"
+closeeq = lpeg.Cmt(close * lpeg.Cb("init"), function (s, i, a, b) return a == b end)
+string = open * lpeg.C((lpeg.P(1) - closeeq)^0) * close / 1
+
+ +

+The open pattern matches [=*[, +capturing the repetitions of equal signs in a group named init; +it also discharges an optional newline, if present. +The close pattern matches ]=*], +also capturing the repetitions of equal signs. +The closeeq pattern first matches close; +then it uses a back capture to recover the capture made +by the previous open, +which is named init; +finally it uses a match-time capture to check +whether both captures are equal. +The string pattern starts with an open, +then it goes as far as possible until matching closeeq, +and then matches the final close. +The final numbered capture simply discards +the capture made by close. +

+ + +

Arithmetic expressions

+

+This example is a complete parser and evaluator for simple +arithmetic expressions. +We write it in two styles. +The first approach first builds a syntax tree and then +traverses this tree to compute the expression value: +

+
+-- Lexical Elements
+local Space = lpeg.S(" \n\t")^0
+local Number = lpeg.C(lpeg.P"-"^-1 * lpeg.R("09")^1) * Space
+local TermOp = lpeg.C(lpeg.S("+-")) * Space
+local FactorOp = lpeg.C(lpeg.S("*/")) * Space
+local Open = "(" * Space
+local Close = ")" * Space
+
+-- Grammar
+local Exp, Term, Factor = lpeg.V"Exp", lpeg.V"Term", lpeg.V"Factor"
+G = lpeg.P{ Exp,
+  Exp = lpeg.Ct(Term * (TermOp * Term)^0);
+  Term = lpeg.Ct(Factor * (FactorOp * Factor)^0);
+  Factor = Number + Open * Exp * Close;
+}
+
+G = Space * G * -1
+
+-- Evaluator
+function eval (x)
+  if type(x) == "string" then
+    return tonumber(x)
+  else
+    local op1 = eval(x[1])
+    for i = 2, #x, 2 do
+      local op = x[i]
+      local op2 = eval(x[i + 1])
+      if (op == "+") then op1 = op1 + op2
+      elseif (op == "-") then op1 = op1 - op2
+      elseif (op == "*") then op1 = op1 * op2
+      elseif (op == "/") then op1 = op1 / op2
+      end
+    end
+    return op1
+  end
+end
+
+-- Parser/Evaluator
+function evalExp (s)
+  local t = lpeg.match(G, s)
+  if not t then error("syntax error", 2) end
+  return eval(t)
+end
+
+-- small example
+print(evalExp"3 + 5*9 / (1+1) - 12")   --> 13.5
+
+ +

+The second style computes the expression value on the fly, +without building the syntax tree. +The following grammar takes this approach. +(It assumes the same lexical elements as before.) +

+
+-- Auxiliary function
+function eval (v1, op, v2)
+  if (op == "+") then return v1 + v2
+  elseif (op == "-") then return v1 - v2
+  elseif (op == "*") then return v1 * v2
+  elseif (op == "/") then return v1 / v2
+  end
+end
+
+-- Grammar
+local V = lpeg.V
+G = lpeg.P{ "Exp",
+  Exp = lpeg.Cf(V"Term" * lpeg.Cg(TermOp * V"Term")^0, eval);
+  Term = lpeg.Cf(V"Factor" * lpeg.Cg(FactorOp * V"Factor")^0, eval);
+  Factor = Number / tonumber + Open * V"Exp" * Close;
+}
+
+-- small example
+print(lpeg.match(G, "3 + 5*9 / (1+1) - 12"))   --> 13.5
+
+

+Note the use of the fold (accumulator) capture. +To compute the value of an expression, +the accumulator starts with the value of the first term, +and then applies eval over +the accumulator, the operator, +and the new term for each repetition. +

+ + + +

Download

+ +

LPeg +source code.

+ + +

License

+ +

+Copyright © 2007-2017 Lua.org, PUC-Rio. +

+

+Permission is hereby granted, free of charge, +to any person obtaining a copy of this software and +associated documentation files (the "Software"), +to deal in the Software without restriction, +including without limitation the rights to use, +copy, modify, merge, publish, distribute, sublicense, +and/or sell copies of the Software, +and to permit persons to whom the Software is +furnished to do so, +subject to the following conditions: +

+ +

+The above copyright notice and this permission notice +shall be included in all copies or substantial portions of the Software. +

+ +

+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +EXPRESS OR IMPLIED, +INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, +DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +

+ +
+ +
+ +
+

+$Id: lpeg.html,v 1.77 2017/01/13 13:40:05 roberto Exp $ +

+
+ +
+ + + diff --git a/src/ext/lpeg/lpprint.c b/src/ext/lpeg/lpprint.c new file mode 100644 index 0000000..f7be408 --- /dev/null +++ b/src/ext/lpeg/lpprint.c @@ -0,0 +1,244 @@ +/* +** $Id: lpprint.c,v 1.10 2016/09/13 16:06:03 roberto Exp $ +** Copyright 2007, Lua.org & PUC-Rio (see 'lpeg.html' for license) +*/ + +#include +#include +#include + + +#include "lptypes.h" +#include "lpprint.h" +#include "lpcode.h" + + +#if defined(LPEG_DEBUG) + +/* +** {====================================================== +** Printing patterns (for debugging) +** ======================================================= +*/ + + +void printcharset (const byte *st) { + int i; + printf("["); + for (i = 0; i <= UCHAR_MAX; i++) { + int first = i; + while (testchar(st, i) && i <= UCHAR_MAX) i++; + if (i - 1 == first) /* unary range? */ + printf("(%02x)", first); + else if (i - 1 > first) /* non-empty range? */ + printf("(%02x-%02x)", first, i - 1); + } + printf("]"); +} + + +static const char *capkind (int kind) { + const char *const modes[] = { + "close", "position", "constant", "backref", + "argument", "simple", "table", "function", + "query", "string", "num", "substitution", "fold", + "runtime", "group"}; + return modes[kind]; +} + + +static void printjmp (const Instruction *op, const Instruction *p) { + printf("-> %d", (int)(p + (p + 1)->offset - op)); +} + + +void printinst (const Instruction *op, const Instruction *p) { + const char *const names[] = { + "any", "char", "set", + "testany", "testchar", "testset", + "span", "behind", + "ret", "end", + "choice", "jmp", "call", "open_call", + "commit", "partial_commit", "back_commit", "failtwice", "fail", "giveup", + "fullcapture", "opencapture", "closecapture", "closeruntime" + }; + printf("%02ld: %s ", (long)(p - op), names[p->i.code]); + switch ((Opcode)p->i.code) { + case IChar: { + printf("'%c'", p->i.aux); + break; + } + case ITestChar: { + printf("'%c'", p->i.aux); printjmp(op, p); + break; + } + case IFullCapture: { + printf("%s (size = %d) (idx = %d)", + capkind(getkind(p)), getoff(p), p->i.key); + break; + } + case IOpenCapture: { + printf("%s (idx = %d)", capkind(getkind(p)), p->i.key); + break; + } + case ISet: { + printcharset((p+1)->buff); + break; + } + case ITestSet: { + printcharset((p+2)->buff); printjmp(op, p); + break; + } + case ISpan: { + printcharset((p+1)->buff); + break; + } + case IOpenCall: { + printf("-> %d", (p + 1)->offset); + break; + } + case IBehind: { + printf("%d", p->i.aux); + break; + } + case IJmp: case ICall: case ICommit: case IChoice: + case IPartialCommit: case IBackCommit: case ITestAny: { + printjmp(op, p); + break; + } + default: break; + } + printf("\n"); +} + + +void printpatt (Instruction *p, int n) { + Instruction *op = p; + while (p < op + n) { + printinst(op, p); + p += sizei(p); + } +} + + +#if defined(LPEG_DEBUG) +static void printcap (Capture *cap) { + printf("%s (idx: %d - size: %d) -> %p\n", + capkind(cap->kind), cap->idx, cap->siz, cap->s); +} + + +void printcaplist (Capture *cap, Capture *limit) { + printf(">======\n"); + for (; cap->s && (limit == NULL || cap < limit); cap++) + printcap(cap); + printf("=======\n"); +} +#endif + +/* }====================================================== */ + + +/* +** {====================================================== +** Printing trees (for debugging) +** ======================================================= +*/ + +static const char *tagnames[] = { + "char", "set", "any", + "true", "false", + "rep", + "seq", "choice", + "not", "and", + "call", "opencall", "rule", "grammar", + "behind", + "capture", "run-time" +}; + + +void printtree (TTree *tree, int ident) { + int i; + for (i = 0; i < ident; i++) printf(" "); + printf("%s", tagnames[tree->tag]); + switch (tree->tag) { + case TChar: { + int c = tree->u.n; + if (isprint(c)) + printf(" '%c'\n", c); + else + printf(" (%02X)\n", c); + break; + } + case TSet: { + printcharset(treebuffer(tree)); + printf("\n"); + break; + } + case TOpenCall: case TCall: { + assert(sib2(tree)->tag == TRule); + printf(" key: %d (rule: %d)\n", tree->key, sib2(tree)->cap); + break; + } + case TBehind: { + printf(" %d\n", tree->u.n); + printtree(sib1(tree), ident + 2); + break; + } + case TCapture: { + printf(" kind: '%s' key: %d\n", capkind(tree->cap), tree->key); + printtree(sib1(tree), ident + 2); + break; + } + case TRule: { + printf(" n: %d key: %d\n", tree->cap, tree->key); + printtree(sib1(tree), ident + 2); + break; /* do not print next rule as a sibling */ + } + case TGrammar: { + TTree *rule = sib1(tree); + printf(" %d\n", tree->u.n); /* number of rules */ + for (i = 0; i < tree->u.n; i++) { + printtree(rule, ident + 2); + rule = sib2(rule); + } + assert(rule->tag == TTrue); /* sentinel */ + break; + } + default: { + int sibs = numsiblings[tree->tag]; + printf("\n"); + if (sibs >= 1) { + printtree(sib1(tree), ident + 2); + if (sibs >= 2) + printtree(sib2(tree), ident + 2); + } + break; + } + } +} + + +void printktable (lua_State *L, int idx) { + int n, i; + lua_getuservalue(L, idx); + if (lua_isnil(L, -1)) /* no ktable? */ + return; + n = lua_rawlen(L, -1); + printf("["); + for (i = 1; i <= n; i++) { + printf("%d = ", i); + lua_rawgeti(L, -1, i); + if (lua_isstring(L, -1)) + printf("%s ", lua_tostring(L, -1)); + else + printf("%s ", lua_typename(L, lua_type(L, -1))); + lua_pop(L, 1); + } + printf("]\n"); + /* leave ktable at the stack */ +} + +/* }====================================================== */ + +#endif diff --git a/src/ext/lpeg/lpprint.h b/src/ext/lpeg/lpprint.h new file mode 100644 index 0000000..6329760 --- /dev/null +++ b/src/ext/lpeg/lpprint.h @@ -0,0 +1,36 @@ +/* +** $Id: lpprint.h,v 1.2 2015/06/12 18:18:08 roberto Exp $ +*/ + + +#if !defined(lpprint_h) +#define lpprint_h + + +#include "lptree.h" +#include "lpvm.h" + + +#if defined(LPEG_DEBUG) + +void printpatt (Instruction *p, int n); +void printtree (TTree *tree, int ident); +void printktable (lua_State *L, int idx); +void printcharset (const byte *st); +void printcaplist (Capture *cap, Capture *limit); +void printinst (const Instruction *op, const Instruction *p); + +#else + +#define printktable(L,idx) \ + luaL_error(L, "function only implemented in debug mode") +#define printtree(tree,i) \ + luaL_error(L, "function only implemented in debug mode") +#define printpatt(p,n) \ + luaL_error(L, "function only implemented in debug mode") + +#endif + + +#endif + diff --git a/src/ext/lpeg/lptree.c b/src/ext/lpeg/lptree.c new file mode 100644 index 0000000..bda61b9 --- /dev/null +++ b/src/ext/lpeg/lptree.c @@ -0,0 +1,1303 @@ +/* +** $Id: lptree.c,v 1.22 2016/09/13 18:10:22 roberto Exp $ +** Copyright 2013, Lua.org & PUC-Rio (see 'lpeg.html' for license) +*/ + +#include +#include +#include + + +#include "lua.h" +#include "lauxlib.h" + +#include "lptypes.h" +#include "lpcap.h" +#include "lpcode.h" +#include "lpprint.h" +#include "lptree.h" + + +/* number of siblings for each tree */ +const byte numsiblings[] = { + 0, 0, 0, /* char, set, any */ + 0, 0, /* true, false */ + 1, /* rep */ + 2, 2, /* seq, choice */ + 1, 1, /* not, and */ + 0, 0, 2, 1, /* call, opencall, rule, grammar */ + 1, /* behind */ + 1, 1 /* capture, runtime capture */ +}; + + +static TTree *newgrammar (lua_State *L, int arg); + + +/* +** returns a reasonable name for value at index 'idx' on the stack +*/ +static const char *val2str (lua_State *L, int idx) { + const char *k = lua_tostring(L, idx); + if (k != NULL) + return lua_pushfstring(L, "%s", k); + else + return lua_pushfstring(L, "(a %s)", luaL_typename(L, idx)); +} + + +/* +** Fix a TOpenCall into a TCall node, using table 'postable' to +** translate a key to its rule address in the tree. Raises an +** error if key does not exist. +*/ +static void fixonecall (lua_State *L, int postable, TTree *g, TTree *t) { + int n; + lua_rawgeti(L, -1, t->key); /* get rule's name */ + lua_gettable(L, postable); /* query name in position table */ + n = lua_tonumber(L, -1); /* get (absolute) position */ + lua_pop(L, 1); /* remove position */ + if (n == 0) { /* no position? */ + lua_rawgeti(L, -1, t->key); /* get rule's name again */ + luaL_error(L, "rule '%s' undefined in given grammar", val2str(L, -1)); + } + t->tag = TCall; + t->u.ps = n - (t - g); /* position relative to node */ + assert(sib2(t)->tag == TRule); + sib2(t)->key = t->key; /* fix rule's key */ +} + + +/* +** Transform left associative constructions into right +** associative ones, for sequence and choice; that is: +** (t11 + t12) + t2 => t11 + (t12 + t2) +** (t11 * t12) * t2 => t11 * (t12 * t2) +** (that is, Op (Op t11 t12) t2 => Op t11 (Op t12 t2)) +*/ +static void correctassociativity (TTree *tree) { + TTree *t1 = sib1(tree); + assert(tree->tag == TChoice || tree->tag == TSeq); + while (t1->tag == tree->tag) { + int n1size = tree->u.ps - 1; /* t1 == Op t11 t12 */ + int n11size = t1->u.ps - 1; + int n12size = n1size - n11size - 1; + memmove(sib1(tree), sib1(t1), n11size * sizeof(TTree)); /* move t11 */ + tree->u.ps = n11size + 1; + sib2(tree)->tag = tree->tag; + sib2(tree)->u.ps = n12size + 1; + } +} + + +/* +** Make final adjustments in a tree. Fix open calls in tree 't', +** making them refer to their respective rules or raising appropriate +** errors (if not inside a grammar). Correct associativity of associative +** constructions (making them right associative). Assume that tree's +** ktable is at the top of the stack (for error messages). +*/ +static void finalfix (lua_State *L, int postable, TTree *g, TTree *t) { + tailcall: + switch (t->tag) { + case TGrammar: /* subgrammars were already fixed */ + return; + case TOpenCall: { + if (g != NULL) /* inside a grammar? */ + fixonecall(L, postable, g, t); + else { /* open call outside grammar */ + lua_rawgeti(L, -1, t->key); + luaL_error(L, "rule '%s' used outside a grammar", val2str(L, -1)); + } + break; + } + case TSeq: case TChoice: + correctassociativity(t); + break; + } + switch (numsiblings[t->tag]) { + case 1: /* finalfix(L, postable, g, sib1(t)); */ + t = sib1(t); goto tailcall; + case 2: + finalfix(L, postable, g, sib1(t)); + t = sib2(t); goto tailcall; /* finalfix(L, postable, g, sib2(t)); */ + default: assert(numsiblings[t->tag] == 0); break; + } +} + + + +/* +** {=================================================================== +** KTable manipulation +** +** - The ktable of a pattern 'p' can be shared by other patterns that +** contain 'p' and no other constants. Because of this sharing, we +** should not add elements to a 'ktable' unless it was freshly created +** for the new pattern. +** +** - The maximum index in a ktable is USHRT_MAX, because trees and +** patterns use unsigned shorts to store those indices. +** ==================================================================== +*/ + +/* +** Create a new 'ktable' to the pattern at the top of the stack. +*/ +static void newktable (lua_State *L, int n) { + lua_createtable(L, n, 0); /* create a fresh table */ + lua_setuservalue(L, -2); /* set it as 'ktable' for pattern */ +} + + +/* +** Add element 'idx' to 'ktable' of pattern at the top of the stack; +** Return index of new element. +** If new element is nil, does not add it to table (as it would be +** useless) and returns 0, as ktable[0] is always nil. +*/ +static int addtoktable (lua_State *L, int idx) { + if (lua_isnil(L, idx)) /* nil value? */ + return 0; + else { + int n; + lua_getuservalue(L, -1); /* get ktable from pattern */ + n = lua_rawlen(L, -1); + if (n >= USHRT_MAX) + luaL_error(L, "too many Lua values in pattern"); + lua_pushvalue(L, idx); /* element to be added */ + lua_rawseti(L, -2, ++n); + lua_pop(L, 1); /* remove 'ktable' */ + return n; + } +} + + +/* +** Return the number of elements in the ktable at 'idx'. +** In Lua 5.2/5.3, default "environment" for patterns is nil, not +** a table. Treat it as an empty table. In Lua 5.1, assumes that +** the environment has no numeric indices (len == 0) +*/ +static int ktablelen (lua_State *L, int idx) { + if (!lua_istable(L, idx)) return 0; + else return lua_rawlen(L, idx); +} + + +/* +** Concatentate the contents of table 'idx1' into table 'idx2'. +** (Assume that both indices are negative.) +** Return the original length of table 'idx2' (or 0, if no +** element was added, as there is no need to correct any index). +*/ +static int concattable (lua_State *L, int idx1, int idx2) { + int i; + int n1 = ktablelen(L, idx1); + int n2 = ktablelen(L, idx2); + if (n1 + n2 > USHRT_MAX) + luaL_error(L, "too many Lua values in pattern"); + if (n1 == 0) return 0; /* nothing to correct */ + for (i = 1; i <= n1; i++) { + lua_rawgeti(L, idx1, i); + lua_rawseti(L, idx2 - 1, n2 + i); /* correct 'idx2' */ + } + return n2; +} + + +/* +** When joining 'ktables', constants from one of the subpatterns must +** be renumbered; 'correctkeys' corrects their indices (adding 'n' +** to each of them) +*/ +static void correctkeys (TTree *tree, int n) { + if (n == 0) return; /* no correction? */ + tailcall: + switch (tree->tag) { + case TOpenCall: case TCall: case TRunTime: case TRule: { + if (tree->key > 0) + tree->key += n; + break; + } + case TCapture: { + if (tree->key > 0 && tree->cap != Carg && tree->cap != Cnum) + tree->key += n; + break; + } + default: break; + } + switch (numsiblings[tree->tag]) { + case 1: /* correctkeys(sib1(tree), n); */ + tree = sib1(tree); goto tailcall; + case 2: + correctkeys(sib1(tree), n); + tree = sib2(tree); goto tailcall; /* correctkeys(sib2(tree), n); */ + default: assert(numsiblings[tree->tag] == 0); break; + } +} + + +/* +** Join the ktables from p1 and p2 the ktable for the new pattern at the +** top of the stack, reusing them when possible. +*/ +static void joinktables (lua_State *L, int p1, TTree *t2, int p2) { + int n1, n2; + lua_getuservalue(L, p1); /* get ktables */ + lua_getuservalue(L, p2); + n1 = ktablelen(L, -2); + n2 = ktablelen(L, -1); + if (n1 == 0 && n2 == 0) /* are both tables empty? */ + lua_pop(L, 2); /* nothing to be done; pop tables */ + else if (n2 == 0 || lp_equal(L, -2, -1)) { /* 2nd table empty or equal? */ + lua_pop(L, 1); /* pop 2nd table */ + lua_setuservalue(L, -2); /* set 1st ktable into new pattern */ + } + else if (n1 == 0) { /* first table is empty? */ + lua_setuservalue(L, -3); /* set 2nd table into new pattern */ + lua_pop(L, 1); /* pop 1st table */ + } + else { + lua_createtable(L, n1 + n2, 0); /* create ktable for new pattern */ + /* stack: new p; ktable p1; ktable p2; new ktable */ + concattable(L, -3, -1); /* from p1 into new ktable */ + concattable(L, -2, -1); /* from p2 into new ktable */ + lua_setuservalue(L, -4); /* new ktable becomes 'p' environment */ + lua_pop(L, 2); /* pop other ktables */ + correctkeys(t2, n1); /* correction for indices from p2 */ + } +} + + +/* +** copy 'ktable' of element 'idx' to new tree (on top of stack) +*/ +static void copyktable (lua_State *L, int idx) { + lua_getuservalue(L, idx); + lua_setuservalue(L, -2); +} + + +/* +** merge 'ktable' from 'stree' at stack index 'idx' into 'ktable' +** from tree at the top of the stack, and correct corresponding +** tree. +*/ +static void mergektable (lua_State *L, int idx, TTree *stree) { + int n; + lua_getuservalue(L, -1); /* get ktables */ + lua_getuservalue(L, idx); + n = concattable(L, -1, -2); + lua_pop(L, 2); /* remove both ktables */ + correctkeys(stree, n); +} + + +/* +** Create a new 'ktable' to the pattern at the top of the stack, adding +** all elements from pattern 'p' (if not 0) plus element 'idx' to it. +** Return index of new element. +*/ +static int addtonewktable (lua_State *L, int p, int idx) { + newktable(L, 1); + if (p) + mergektable(L, p, NULL); + return addtoktable(L, idx); +} + +/* }====================================================== */ + + +/* +** {====================================================== +** Tree generation +** ======================================================= +*/ + +/* +** In 5.2, could use 'luaL_testudata'... +*/ +static int testpattern (lua_State *L, int idx) { + if (lua_touserdata(L, idx)) { /* value is a userdata? */ + if (lua_getmetatable(L, idx)) { /* does it have a metatable? */ + luaL_getmetatable(L, PATTERN_T); + if (lua_rawequal(L, -1, -2)) { /* does it have the correct mt? */ + lua_pop(L, 2); /* remove both metatables */ + return 1; + } + } + } + return 0; +} + + +static Pattern *getpattern (lua_State *L, int idx) { + return (Pattern *)luaL_checkudata(L, idx, PATTERN_T); +} + + +static int getsize (lua_State *L, int idx) { + return (lua_rawlen(L, idx) - sizeof(Pattern)) / sizeof(TTree) + 1; +} + + +static TTree *gettree (lua_State *L, int idx, int *len) { + Pattern *p = getpattern(L, idx); + if (len) + *len = getsize(L, idx); + return p->tree; +} + + +/* +** create a pattern. Set its uservalue (the 'ktable') equal to its +** metatable. (It could be any empty sequence; the metatable is at +** hand here, so we use it.) +*/ +static TTree *newtree (lua_State *L, int len) { + size_t size = (len - 1) * sizeof(TTree) + sizeof(Pattern); + Pattern *p = (Pattern *)lua_newuserdata(L, size); + luaL_getmetatable(L, PATTERN_T); + lua_pushvalue(L, -1); + lua_setuservalue(L, -3); + lua_setmetatable(L, -2); + p->code = NULL; p->codesize = 0; + return p->tree; +} + + +static TTree *newleaf (lua_State *L, int tag) { + TTree *tree = newtree(L, 1); + tree->tag = tag; + return tree; +} + + +static TTree *newcharset (lua_State *L) { + TTree *tree = newtree(L, bytes2slots(CHARSETSIZE) + 1); + tree->tag = TSet; + loopset(i, treebuffer(tree)[i] = 0); + return tree; +} + + +/* +** add to tree a sequence where first sibling is 'sib' (with size +** 'sibsize'); returns position for second sibling +*/ +static TTree *seqaux (TTree *tree, TTree *sib, int sibsize) { + tree->tag = TSeq; tree->u.ps = sibsize + 1; + memcpy(sib1(tree), sib, sibsize * sizeof(TTree)); + return sib2(tree); +} + + +/* +** Build a sequence of 'n' nodes, each with tag 'tag' and 'u.n' got +** from the array 's' (or 0 if array is NULL). (TSeq is binary, so it +** must build a sequence of sequence of sequence...) +*/ +static void fillseq (TTree *tree, int tag, int n, const char *s) { + int i; + for (i = 0; i < n - 1; i++) { /* initial n-1 copies of Seq tag; Seq ... */ + tree->tag = TSeq; tree->u.ps = 2; + sib1(tree)->tag = tag; + sib1(tree)->u.n = s ? (byte)s[i] : 0; + tree = sib2(tree); + } + tree->tag = tag; /* last one does not need TSeq */ + tree->u.n = s ? (byte)s[i] : 0; +} + + +/* +** Numbers as patterns: +** 0 == true (always match); n == TAny repeated 'n' times; +** -n == not (TAny repeated 'n' times) +*/ +static TTree *numtree (lua_State *L, int n) { + if (n == 0) + return newleaf(L, TTrue); + else { + TTree *tree, *nd; + if (n > 0) + tree = nd = newtree(L, 2 * n - 1); + else { /* negative: code it as !(-n) */ + n = -n; + tree = newtree(L, 2 * n); + tree->tag = TNot; + nd = sib1(tree); + } + fillseq(nd, TAny, n, NULL); /* sequence of 'n' any's */ + return tree; + } +} + + +/* +** Convert value at index 'idx' to a pattern +*/ +static TTree *getpatt (lua_State *L, int idx, int *len) { + TTree *tree; + switch (lua_type(L, idx)) { + case LUA_TSTRING: { + size_t slen; + const char *s = lua_tolstring(L, idx, &slen); /* get string */ + if (slen == 0) /* empty? */ + tree = newleaf(L, TTrue); /* always match */ + else { + tree = newtree(L, 2 * (slen - 1) + 1); + fillseq(tree, TChar, slen, s); /* sequence of 'slen' chars */ + } + break; + } + case LUA_TNUMBER: { + int n = lua_tointeger(L, idx); + tree = numtree(L, n); + break; + } + case LUA_TBOOLEAN: { + tree = (lua_toboolean(L, idx) ? newleaf(L, TTrue) : newleaf(L, TFalse)); + break; + } + case LUA_TTABLE: { + tree = newgrammar(L, idx); + break; + } + case LUA_TFUNCTION: { + tree = newtree(L, 2); + tree->tag = TRunTime; + tree->key = addtonewktable(L, 0, idx); + sib1(tree)->tag = TTrue; + break; + } + default: { + return gettree(L, idx, len); + } + } + lua_replace(L, idx); /* put new tree into 'idx' slot */ + if (len) + *len = getsize(L, idx); + return tree; +} + + +/* +** create a new tree, whith a new root and one sibling. +** Sibling must be on the Lua stack, at index 1. +*/ +static TTree *newroot1sib (lua_State *L, int tag) { + int s1; + TTree *tree1 = getpatt(L, 1, &s1); + TTree *tree = newtree(L, 1 + s1); /* create new tree */ + tree->tag = tag; + memcpy(sib1(tree), tree1, s1 * sizeof(TTree)); + copyktable(L, 1); + return tree; +} + + +/* +** create a new tree, whith a new root and 2 siblings. +** Siblings must be on the Lua stack, first one at index 1. +*/ +static TTree *newroot2sib (lua_State *L, int tag) { + int s1, s2; + TTree *tree1 = getpatt(L, 1, &s1); + TTree *tree2 = getpatt(L, 2, &s2); + TTree *tree = newtree(L, 1 + s1 + s2); /* create new tree */ + tree->tag = tag; + tree->u.ps = 1 + s1; + memcpy(sib1(tree), tree1, s1 * sizeof(TTree)); + memcpy(sib2(tree), tree2, s2 * sizeof(TTree)); + joinktables(L, 1, sib2(tree), 2); + return tree; +} + + +static int lp_P (lua_State *L) { + luaL_checkany(L, 1); + getpatt(L, 1, NULL); + lua_settop(L, 1); + return 1; +} + + +/* +** sequence operator; optimizations: +** false x => false, x true => x, true x => x +** (cannot do x . false => false because x may have runtime captures) +*/ +static int lp_seq (lua_State *L) { + TTree *tree1 = getpatt(L, 1, NULL); + TTree *tree2 = getpatt(L, 2, NULL); + if (tree1->tag == TFalse || tree2->tag == TTrue) + lua_pushvalue(L, 1); /* false . x == false, x . true = x */ + else if (tree1->tag == TTrue) + lua_pushvalue(L, 2); /* true . x = x */ + else + newroot2sib(L, TSeq); + return 1; +} + + +/* +** choice operator; optimizations: +** charset / charset => charset +** true / x => true, x / false => x, false / x => x +** (x / true is not equivalent to true) +*/ +static int lp_choice (lua_State *L) { + Charset st1, st2; + TTree *t1 = getpatt(L, 1, NULL); + TTree *t2 = getpatt(L, 2, NULL); + if (tocharset(t1, &st1) && tocharset(t2, &st2)) { + TTree *t = newcharset(L); + loopset(i, treebuffer(t)[i] = st1.cs[i] | st2.cs[i]); + } + else if (nofail(t1) || t2->tag == TFalse) + lua_pushvalue(L, 1); /* true / x => true, x / false => x */ + else if (t1->tag == TFalse) + lua_pushvalue(L, 2); /* false / x => x */ + else + newroot2sib(L, TChoice); + return 1; +} + + +/* +** p^n +*/ +static int lp_star (lua_State *L) { + int size1; + int n = (int)luaL_checkinteger(L, 2); + TTree *tree1 = getpatt(L, 1, &size1); + if (n >= 0) { /* seq tree1 (seq tree1 ... (seq tree1 (rep tree1))) */ + TTree *tree = newtree(L, (n + 1) * (size1 + 1)); + if (nullable(tree1)) + luaL_error(L, "loop body may accept empty string"); + while (n--) /* repeat 'n' times */ + tree = seqaux(tree, tree1, size1); + tree->tag = TRep; + memcpy(sib1(tree), tree1, size1 * sizeof(TTree)); + } + else { /* choice (seq tree1 ... choice tree1 true ...) true */ + TTree *tree; + n = -n; + /* size = (choice + seq + tree1 + true) * n, but the last has no seq */ + tree = newtree(L, n * (size1 + 3) - 1); + for (; n > 1; n--) { /* repeat (n - 1) times */ + tree->tag = TChoice; tree->u.ps = n * (size1 + 3) - 2; + sib2(tree)->tag = TTrue; + tree = sib1(tree); + tree = seqaux(tree, tree1, size1); + } + tree->tag = TChoice; tree->u.ps = size1 + 1; + sib2(tree)->tag = TTrue; + memcpy(sib1(tree), tree1, size1 * sizeof(TTree)); + } + copyktable(L, 1); + return 1; +} + + +/* +** #p == &p +*/ +static int lp_and (lua_State *L) { + newroot1sib(L, TAnd); + return 1; +} + + +/* +** -p == !p +*/ +static int lp_not (lua_State *L) { + newroot1sib(L, TNot); + return 1; +} + + +/* +** [t1 - t2] == Seq (Not t2) t1 +** If t1 and t2 are charsets, make their difference. +*/ +static int lp_sub (lua_State *L) { + Charset st1, st2; + int s1, s2; + TTree *t1 = getpatt(L, 1, &s1); + TTree *t2 = getpatt(L, 2, &s2); + if (tocharset(t1, &st1) && tocharset(t2, &st2)) { + TTree *t = newcharset(L); + loopset(i, treebuffer(t)[i] = st1.cs[i] & ~st2.cs[i]); + } + else { + TTree *tree = newtree(L, 2 + s1 + s2); + tree->tag = TSeq; /* sequence of... */ + tree->u.ps = 2 + s2; + sib1(tree)->tag = TNot; /* ...not... */ + memcpy(sib1(sib1(tree)), t2, s2 * sizeof(TTree)); /* ...t2 */ + memcpy(sib2(tree), t1, s1 * sizeof(TTree)); /* ... and t1 */ + joinktables(L, 1, sib1(tree), 2); + } + return 1; +} + + +static int lp_set (lua_State *L) { + size_t l; + const char *s = luaL_checklstring(L, 1, &l); + TTree *tree = newcharset(L); + while (l--) { + setchar(treebuffer(tree), (byte)(*s)); + s++; + } + return 1; +} + + +static int lp_range (lua_State *L) { + int arg; + int top = lua_gettop(L); + TTree *tree = newcharset(L); + for (arg = 1; arg <= top; arg++) { + int c; + size_t l; + const char *r = luaL_checklstring(L, arg, &l); + luaL_argcheck(L, l == 2, arg, "range must have two characters"); + for (c = (byte)r[0]; c <= (byte)r[1]; c++) + setchar(treebuffer(tree), c); + } + return 1; +} + + +/* +** Look-behind predicate +*/ +static int lp_behind (lua_State *L) { + TTree *tree; + TTree *tree1 = getpatt(L, 1, NULL); + int n = fixedlen(tree1); + luaL_argcheck(L, n >= 0, 1, "pattern may not have fixed length"); + luaL_argcheck(L, !hascaptures(tree1), 1, "pattern have captures"); + luaL_argcheck(L, n <= MAXBEHIND, 1, "pattern too long to look behind"); + tree = newroot1sib(L, TBehind); + tree->u.n = n; + return 1; +} + + +/* +** Create a non-terminal +*/ +static int lp_V (lua_State *L) { + TTree *tree = newleaf(L, TOpenCall); + luaL_argcheck(L, !lua_isnoneornil(L, 1), 1, "non-nil value expected"); + tree->key = addtonewktable(L, 0, 1); + return 1; +} + + +/* +** Create a tree for a non-empty capture, with a body and +** optionally with an associated Lua value (at index 'labelidx' in the +** stack) +*/ +static int capture_aux (lua_State *L, int cap, int labelidx) { + TTree *tree = newroot1sib(L, TCapture); + tree->cap = cap; + tree->key = (labelidx == 0) ? 0 : addtonewktable(L, 1, labelidx); + return 1; +} + + +/* +** Fill a tree with an empty capture, using an empty (TTrue) sibling. +*/ +static TTree *auxemptycap (TTree *tree, int cap) { + tree->tag = TCapture; + tree->cap = cap; + sib1(tree)->tag = TTrue; + return tree; +} + + +/* +** Create a tree for an empty capture +*/ +static TTree *newemptycap (lua_State *L, int cap) { + return auxemptycap(newtree(L, 2), cap); +} + + +/* +** Create a tree for an empty capture with an associated Lua value +*/ +static TTree *newemptycapkey (lua_State *L, int cap, int idx) { + TTree *tree = auxemptycap(newtree(L, 2), cap); + tree->key = addtonewktable(L, 0, idx); + return tree; +} + + +/* +** Captures with syntax p / v +** (function capture, query capture, string capture, or number capture) +*/ +static int lp_divcapture (lua_State *L) { + switch (lua_type(L, 2)) { + case LUA_TFUNCTION: return capture_aux(L, Cfunction, 2); + case LUA_TTABLE: return capture_aux(L, Cquery, 2); + case LUA_TSTRING: return capture_aux(L, Cstring, 2); + case LUA_TNUMBER: { + int n = lua_tointeger(L, 2); + TTree *tree = newroot1sib(L, TCapture); + luaL_argcheck(L, 0 <= n && n <= SHRT_MAX, 1, "invalid number"); + tree->cap = Cnum; + tree->key = n; + return 1; + } + default: return luaL_argerror(L, 2, "invalid replacement value"); + } +} + + +static int lp_substcapture (lua_State *L) { + return capture_aux(L, Csubst, 0); +} + + +static int lp_tablecapture (lua_State *L) { + return capture_aux(L, Ctable, 0); +} + + +static int lp_groupcapture (lua_State *L) { + if (lua_isnoneornil(L, 2)) + return capture_aux(L, Cgroup, 0); + else + return capture_aux(L, Cgroup, 2); +} + + +static int lp_foldcapture (lua_State *L) { + luaL_checktype(L, 2, LUA_TFUNCTION); + return capture_aux(L, Cfold, 2); +} + + +static int lp_simplecapture (lua_State *L) { + return capture_aux(L, Csimple, 0); +} + + +static int lp_poscapture (lua_State *L) { + newemptycap(L, Cposition); + return 1; +} + + +static int lp_argcapture (lua_State *L) { + int n = (int)luaL_checkinteger(L, 1); + TTree *tree = newemptycap(L, Carg); + tree->key = n; + luaL_argcheck(L, 0 < n && n <= SHRT_MAX, 1, "invalid argument index"); + return 1; +} + + +static int lp_backref (lua_State *L) { + luaL_checkany(L, 1); + newemptycapkey(L, Cbackref, 1); + return 1; +} + + +/* +** Constant capture +*/ +static int lp_constcapture (lua_State *L) { + int i; + int n = lua_gettop(L); /* number of values */ + if (n == 0) /* no values? */ + newleaf(L, TTrue); /* no capture */ + else if (n == 1) + newemptycapkey(L, Cconst, 1); /* single constant capture */ + else { /* create a group capture with all values */ + TTree *tree = newtree(L, 1 + 3 * (n - 1) + 2); + newktable(L, n); /* create a 'ktable' for new tree */ + tree->tag = TCapture; + tree->cap = Cgroup; + tree->key = 0; + tree = sib1(tree); + for (i = 1; i <= n - 1; i++) { + tree->tag = TSeq; + tree->u.ps = 3; /* skip TCapture and its sibling */ + auxemptycap(sib1(tree), Cconst); + sib1(tree)->key = addtoktable(L, i); + tree = sib2(tree); + } + auxemptycap(tree, Cconst); + tree->key = addtoktable(L, i); + } + return 1; +} + + +static int lp_matchtime (lua_State *L) { + TTree *tree; + luaL_checktype(L, 2, LUA_TFUNCTION); + tree = newroot1sib(L, TRunTime); + tree->key = addtonewktable(L, 1, 2); + return 1; +} + +/* }====================================================== */ + + +/* +** {====================================================== +** Grammar - Tree generation +** ======================================================= +*/ + +/* +** push on the stack the index and the pattern for the +** initial rule of grammar at index 'arg' in the stack; +** also add that index into position table. +*/ +static void getfirstrule (lua_State *L, int arg, int postab) { + lua_rawgeti(L, arg, 1); /* access first element */ + if (lua_isstring(L, -1)) { /* is it the name of initial rule? */ + lua_pushvalue(L, -1); /* duplicate it to use as key */ + lua_gettable(L, arg); /* get associated rule */ + } + else { + lua_pushinteger(L, 1); /* key for initial rule */ + lua_insert(L, -2); /* put it before rule */ + } + if (!testpattern(L, -1)) { /* initial rule not a pattern? */ + if (lua_isnil(L, -1)) + luaL_error(L, "grammar has no initial rule"); + else + luaL_error(L, "initial rule '%s' is not a pattern", lua_tostring(L, -2)); + } + lua_pushvalue(L, -2); /* push key */ + lua_pushinteger(L, 1); /* push rule position (after TGrammar) */ + lua_settable(L, postab); /* insert pair at position table */ +} + +/* +** traverse grammar at index 'arg', pushing all its keys and patterns +** into the stack. Create a new table (before all pairs key-pattern) to +** collect all keys and their associated positions in the final tree +** (the "position table"). +** Return the number of rules and (in 'totalsize') the total size +** for the new tree. +*/ +static int collectrules (lua_State *L, int arg, int *totalsize) { + int n = 1; /* to count number of rules */ + int postab = lua_gettop(L) + 1; /* index of position table */ + int size; /* accumulator for total size */ + lua_newtable(L); /* create position table */ + getfirstrule(L, arg, postab); + size = 2 + getsize(L, postab + 2); /* TGrammar + TRule + rule */ + lua_pushnil(L); /* prepare to traverse grammar table */ + while (lua_next(L, arg) != 0) { + if (lua_tonumber(L, -2) == 1 || + lp_equal(L, -2, postab + 1)) { /* initial rule? */ + lua_pop(L, 1); /* remove value (keep key for lua_next) */ + continue; + } + if (!testpattern(L, -1)) /* value is not a pattern? */ + luaL_error(L, "rule '%s' is not a pattern", val2str(L, -2)); + luaL_checkstack(L, LUA_MINSTACK, "grammar has too many rules"); + lua_pushvalue(L, -2); /* push key (to insert into position table) */ + lua_pushinteger(L, size); + lua_settable(L, postab); + size += 1 + getsize(L, -1); /* update size */ + lua_pushvalue(L, -2); /* push key (for next lua_next) */ + n++; + } + *totalsize = size + 1; /* TTrue to finish list of rules */ + return n; +} + + +static void buildgrammar (lua_State *L, TTree *grammar, int frule, int n) { + int i; + TTree *nd = sib1(grammar); /* auxiliary pointer to traverse the tree */ + for (i = 0; i < n; i++) { /* add each rule into new tree */ + int ridx = frule + 2*i + 1; /* index of i-th rule */ + int rulesize; + TTree *rn = gettree(L, ridx, &rulesize); + nd->tag = TRule; + nd->key = 0; /* will be fixed when rule is used */ + nd->cap = i; /* rule number */ + nd->u.ps = rulesize + 1; /* point to next rule */ + memcpy(sib1(nd), rn, rulesize * sizeof(TTree)); /* copy rule */ + mergektable(L, ridx, sib1(nd)); /* merge its ktable into new one */ + nd = sib2(nd); /* move to next rule */ + } + nd->tag = TTrue; /* finish list of rules */ +} + + +/* +** Check whether a tree has potential infinite loops +*/ +static int checkloops (TTree *tree) { + tailcall: + if (tree->tag == TRep && nullable(sib1(tree))) + return 1; + else if (tree->tag == TGrammar) + return 0; /* sub-grammars already checked */ + else { + switch (numsiblings[tree->tag]) { + case 1: /* return checkloops(sib1(tree)); */ + tree = sib1(tree); goto tailcall; + case 2: + if (checkloops(sib1(tree))) return 1; + /* else return checkloops(sib2(tree)); */ + tree = sib2(tree); goto tailcall; + default: assert(numsiblings[tree->tag] == 0); return 0; + } + } +} + + +/* +** Give appropriate error message for 'verifyrule'. If a rule appears +** twice in 'passed', there is path from it back to itself without +** advancing the subject. +*/ +static int verifyerror (lua_State *L, int *passed, int npassed) { + int i, j; + for (i = npassed - 1; i >= 0; i--) { /* search for a repetition */ + for (j = i - 1; j >= 0; j--) { + if (passed[i] == passed[j]) { + lua_rawgeti(L, -1, passed[i]); /* get rule's key */ + return luaL_error(L, "rule '%s' may be left recursive", val2str(L, -1)); + } + } + } + return luaL_error(L, "too many left calls in grammar"); +} + + +/* +** Check whether a rule can be left recursive; raise an error in that +** case; otherwise return 1 iff pattern is nullable. +** The return value is used to check sequences, where the second pattern +** is only relevant if the first is nullable. +** Parameter 'nb' works as an accumulator, to allow tail calls in +** choices. ('nb' true makes function returns true.) +** Parameter 'passed' is a list of already visited rules, 'npassed' +** counts the elements in 'passed'. +** Assume ktable at the top of the stack. +*/ +static int verifyrule (lua_State *L, TTree *tree, int *passed, int npassed, + int nb) { + tailcall: + switch (tree->tag) { + case TChar: case TSet: case TAny: + case TFalse: + return nb; /* cannot pass from here */ + case TTrue: + case TBehind: /* look-behind cannot have calls */ + return 1; + case TNot: case TAnd: case TRep: + /* return verifyrule(L, sib1(tree), passed, npassed, 1); */ + tree = sib1(tree); nb = 1; goto tailcall; + case TCapture: case TRunTime: + /* return verifyrule(L, sib1(tree), passed, npassed, nb); */ + tree = sib1(tree); goto tailcall; + case TCall: + /* return verifyrule(L, sib2(tree), passed, npassed, nb); */ + tree = sib2(tree); goto tailcall; + case TSeq: /* only check 2nd child if first is nb */ + if (!verifyrule(L, sib1(tree), passed, npassed, 0)) + return nb; + /* else return verifyrule(L, sib2(tree), passed, npassed, nb); */ + tree = sib2(tree); goto tailcall; + case TChoice: /* must check both children */ + nb = verifyrule(L, sib1(tree), passed, npassed, nb); + /* return verifyrule(L, sib2(tree), passed, npassed, nb); */ + tree = sib2(tree); goto tailcall; + case TRule: + if (npassed >= MAXRULES) + return verifyerror(L, passed, npassed); + else { + passed[npassed++] = tree->key; + /* return verifyrule(L, sib1(tree), passed, npassed); */ + tree = sib1(tree); goto tailcall; + } + case TGrammar: + return nullable(tree); /* sub-grammar cannot be left recursive */ + default: assert(0); return 0; + } +} + + +static void verifygrammar (lua_State *L, TTree *grammar) { + int passed[MAXRULES]; + TTree *rule; + /* check left-recursive rules */ + for (rule = sib1(grammar); rule->tag == TRule; rule = sib2(rule)) { + if (rule->key == 0) continue; /* unused rule */ + verifyrule(L, sib1(rule), passed, 0, 0); + } + assert(rule->tag == TTrue); + /* check infinite loops inside rules */ + for (rule = sib1(grammar); rule->tag == TRule; rule = sib2(rule)) { + if (rule->key == 0) continue; /* unused rule */ + if (checkloops(sib1(rule))) { + lua_rawgeti(L, -1, rule->key); /* get rule's key */ + luaL_error(L, "empty loop in rule '%s'", val2str(L, -1)); + } + } + assert(rule->tag == TTrue); +} + + +/* +** Give a name for the initial rule if it is not referenced +*/ +static void initialrulename (lua_State *L, TTree *grammar, int frule) { + if (sib1(grammar)->key == 0) { /* initial rule is not referenced? */ + int n = lua_rawlen(L, -1) + 1; /* index for name */ + lua_pushvalue(L, frule); /* rule's name */ + lua_rawseti(L, -2, n); /* ktable was on the top of the stack */ + sib1(grammar)->key = n; + } +} + + +static TTree *newgrammar (lua_State *L, int arg) { + int treesize; + int frule = lua_gettop(L) + 2; /* position of first rule's key */ + int n = collectrules(L, arg, &treesize); + TTree *g = newtree(L, treesize); + luaL_argcheck(L, n <= MAXRULES, arg, "grammar has too many rules"); + g->tag = TGrammar; g->u.n = n; + lua_newtable(L); /* create 'ktable' */ + lua_setuservalue(L, -2); + buildgrammar(L, g, frule, n); + lua_getuservalue(L, -1); /* get 'ktable' for new tree */ + finalfix(L, frule - 1, g, sib1(g)); + initialrulename(L, g, frule); + verifygrammar(L, g); + lua_pop(L, 1); /* remove 'ktable' */ + lua_insert(L, -(n * 2 + 2)); /* move new table to proper position */ + lua_pop(L, n * 2 + 1); /* remove position table + rule pairs */ + return g; /* new table at the top of the stack */ +} + +/* }====================================================== */ + + +static Instruction *prepcompile (lua_State *L, Pattern *p, int idx) { + lua_getuservalue(L, idx); /* push 'ktable' (may be used by 'finalfix') */ + finalfix(L, 0, NULL, p->tree); + lua_pop(L, 1); /* remove 'ktable' */ + return compile(L, p); +} + + +static int lp_printtree (lua_State *L) { + TTree *tree = getpatt(L, 1, NULL); + int c = lua_toboolean(L, 2); + if (c) { + lua_getuservalue(L, 1); /* push 'ktable' (may be used by 'finalfix') */ + finalfix(L, 0, NULL, tree); + lua_pop(L, 1); /* remove 'ktable' */ + } + printktable(L, 1); + printtree(tree, 0); + return 0; +} + + +static int lp_printcode (lua_State *L) { + Pattern *p = getpattern(L, 1); + printktable(L, 1); + if (p->code == NULL) /* not compiled yet? */ + prepcompile(L, p, 1); + printpatt(p->code, p->codesize); + return 0; +} + + +/* +** Get the initial position for the match, interpreting negative +** values from the end of the subject +*/ +static size_t initposition (lua_State *L, size_t len) { + lua_Integer ii = luaL_optinteger(L, 3, 1); + if (ii > 0) { /* positive index? */ + if ((size_t)ii <= len) /* inside the string? */ + return (size_t)ii - 1; /* return it (corrected to 0-base) */ + else return len; /* crop at the end */ + } + else { /* negative index */ + if ((size_t)(-ii) <= len) /* inside the string? */ + return len - ((size_t)(-ii)); /* return position from the end */ + else return 0; /* crop at the beginning */ + } +} + + +/* +** Main match function +*/ +static int lp_match (lua_State *L) { + Capture capture[INITCAPSIZE]; + const char *r; + size_t l; + Pattern *p = (getpatt(L, 1, NULL), getpattern(L, 1)); + Instruction *code = (p->code != NULL) ? p->code : prepcompile(L, p, 1); + const char *s = luaL_checklstring(L, SUBJIDX, &l); + size_t i = initposition(L, l); + int ptop = lua_gettop(L); + lua_pushnil(L); /* initialize subscache */ + lua_pushlightuserdata(L, capture); /* initialize caplistidx */ + lua_getuservalue(L, 1); /* initialize penvidx */ + r = match(L, s, s + i, s + l, code, capture, ptop); + if (r == NULL) { + lua_pushnil(L); + return 1; + } + return getcaptures(L, s, r, ptop); +} + + + +/* +** {====================================================== +** Library creation and functions not related to matching +** ======================================================= +*/ + +/* maximum limit for stack size */ +#define MAXLIM (INT_MAX / 100) + +static int lp_setmax (lua_State *L) { + lua_Integer lim = luaL_checkinteger(L, 1); + luaL_argcheck(L, 0 < lim && lim <= MAXLIM, 1, "out of range"); + lua_settop(L, 1); + lua_setfield(L, LUA_REGISTRYINDEX, MAXSTACKIDX); + return 0; +} + + +static int lp_version (lua_State *L) { + lua_pushstring(L, VERSION); + return 1; +} + + +static int lp_type (lua_State *L) { + if (testpattern(L, 1)) + lua_pushliteral(L, "pattern"); + else + lua_pushnil(L); + return 1; +} + + +int lp_gc (lua_State *L) { + Pattern *p = getpattern(L, 1); + realloccode(L, p, 0); /* delete code block */ + return 0; +} + + +static void createcat (lua_State *L, const char *catname, int (catf) (int)) { + TTree *t = newcharset(L); + int i; + for (i = 0; i <= UCHAR_MAX; i++) + if (catf(i)) setchar(treebuffer(t), i); + lua_setfield(L, -2, catname); +} + + +static int lp_locale (lua_State *L) { + if (lua_isnoneornil(L, 1)) { + lua_settop(L, 0); + lua_createtable(L, 0, 12); + } + else { + luaL_checktype(L, 1, LUA_TTABLE); + lua_settop(L, 1); + } + createcat(L, "alnum", isalnum); + createcat(L, "alpha", isalpha); + createcat(L, "cntrl", iscntrl); + createcat(L, "digit", isdigit); + createcat(L, "graph", isgraph); + createcat(L, "lower", islower); + createcat(L, "print", isprint); + createcat(L, "punct", ispunct); + createcat(L, "space", isspace); + createcat(L, "upper", isupper); + createcat(L, "xdigit", isxdigit); + return 1; +} + + +static struct luaL_Reg pattreg[] = { + {"ptree", lp_printtree}, + {"pcode", lp_printcode}, + {"match", lp_match}, + {"B", lp_behind}, + {"V", lp_V}, + {"C", lp_simplecapture}, + {"Cc", lp_constcapture}, + {"Cmt", lp_matchtime}, + {"Cb", lp_backref}, + {"Carg", lp_argcapture}, + {"Cp", lp_poscapture}, + {"Cs", lp_substcapture}, + {"Ct", lp_tablecapture}, + {"Cf", lp_foldcapture}, + {"Cg", lp_groupcapture}, + {"P", lp_P}, + {"S", lp_set}, + {"R", lp_range}, + {"locale", lp_locale}, + {"version", lp_version}, + {"setmaxstack", lp_setmax}, + {"type", lp_type}, + {NULL, NULL} +}; + + +static struct luaL_Reg metareg[] = { + {"__mul", lp_seq}, + {"__add", lp_choice}, + {"__pow", lp_star}, + {"__gc", lp_gc}, + {"__len", lp_and}, + {"__div", lp_divcapture}, + {"__unm", lp_not}, + {"__sub", lp_sub}, + {NULL, NULL} +}; + + +int luaopen_lpeg (lua_State *L); +int luaopen_lpeg (lua_State *L) { + luaL_newmetatable(L, PATTERN_T); + lua_pushnumber(L, MAXBACK); /* initialize maximum backtracking */ + lua_setfield(L, LUA_REGISTRYINDEX, MAXSTACKIDX); + luaL_setfuncs(L, metareg, 0); + luaL_newlib(L, pattreg); + lua_pushvalue(L, -1); + lua_setfield(L, -3, "__index"); + return 1; +} + +/* }====================================================== */ diff --git a/src/ext/lpeg/lptree.h b/src/ext/lpeg/lptree.h new file mode 100644 index 0000000..34ee15c --- /dev/null +++ b/src/ext/lpeg/lptree.h @@ -0,0 +1,82 @@ +/* +** $Id: lptree.h,v 1.3 2016/09/13 18:07:51 roberto Exp $ +*/ + +#if !defined(lptree_h) +#define lptree_h + + +#include "lptypes.h" + + +/* +** types of trees +*/ +typedef enum TTag { + TChar = 0, /* 'n' = char */ + TSet, /* the set is stored in next CHARSETSIZE bytes */ + TAny, + TTrue, + TFalse, + TRep, /* 'sib1'* */ + TSeq, /* 'sib1' 'sib2' */ + TChoice, /* 'sib1' / 'sib2' */ + TNot, /* !'sib1' */ + TAnd, /* &'sib1' */ + TCall, /* ktable[key] is rule's key; 'sib2' is rule being called */ + TOpenCall, /* ktable[key] is rule's key */ + TRule, /* ktable[key] is rule's key (but key == 0 for unused rules); + 'sib1' is rule's pattern; + 'sib2' is next rule; 'cap' is rule's sequential number */ + TGrammar, /* 'sib1' is initial (and first) rule */ + TBehind, /* 'sib1' is pattern, 'n' is how much to go back */ + TCapture, /* captures: 'cap' is kind of capture (enum 'CapKind'); + ktable[key] is Lua value associated with capture; + 'sib1' is capture body */ + TRunTime /* run-time capture: 'key' is Lua function; + 'sib1' is capture body */ +} TTag; + + +/* +** Tree trees +** The first child of a tree (if there is one) is immediately after +** the tree. A reference to a second child (ps) is its position +** relative to the position of the tree itself. +*/ +typedef struct TTree { + byte tag; + byte cap; /* kind of capture (if it is a capture) */ + unsigned short key; /* key in ktable for Lua data (0 if no key) */ + union { + int ps; /* occasional second child */ + int n; /* occasional counter */ + } u; +} TTree; + + +/* +** A complete pattern has its tree plus, if already compiled, +** its corresponding code +*/ +typedef struct Pattern { + union Instruction *code; + int codesize; + TTree tree[1]; +} Pattern; + + +/* number of children for each tree */ +extern const byte numsiblings[]; + +/* access to children */ +#define sib1(t) ((t) + 1) +#define sib2(t) ((t) + (t)->u.ps) + + + + + + +#endif + diff --git a/src/ext/lpeg/lptypes.h b/src/ext/lpeg/lptypes.h new file mode 100644 index 0000000..8e78bc8 --- /dev/null +++ b/src/ext/lpeg/lptypes.h @@ -0,0 +1,149 @@ +/* +** $Id: lptypes.h,v 1.16 2017/01/13 13:33:17 roberto Exp $ +** LPeg - PEG pattern matching for Lua +** Copyright 2007-2017, Lua.org & PUC-Rio (see 'lpeg.html' for license) +** written by Roberto Ierusalimschy +*/ + +#if !defined(lptypes_h) +#define lptypes_h + + +#if !defined(LPEG_DEBUG) +#define NDEBUG +#endif + +#include +#include + +#include "lua.h" + + +#define VERSION "1.0.1" + + +#define PATTERN_T "lpeg-pattern" +#define MAXSTACKIDX "lpeg-maxstack" + + +/* +** compatibility with Lua 5.1 +*/ +#if (LUA_VERSION_NUM == 501) + +#define lp_equal lua_equal + +#define lua_getuservalue lua_getfenv +#define lua_setuservalue lua_setfenv + +#define lua_rawlen lua_objlen + +#define luaL_setfuncs(L,f,n) luaL_register(L,NULL,f) +#define luaL_newlib(L,f) luaL_register(L,"lpeg",f) + +#endif + + +#if !defined(lp_equal) +#define lp_equal(L,idx1,idx2) lua_compare(L,(idx1),(idx2),LUA_OPEQ) +#endif + + +/* default maximum size for call/backtrack stack */ +#if !defined(MAXBACK) +#define MAXBACK 400 +#endif + + +/* maximum number of rules in a grammar (limited by 'unsigned char') */ +#if !defined(MAXRULES) +#define MAXRULES 250 +#endif + + + +/* initial size for capture's list */ +#define INITCAPSIZE 32 + + +/* index, on Lua stack, for subject */ +#define SUBJIDX 2 + +/* number of fixed arguments to 'match' (before capture arguments) */ +#define FIXEDARGS 3 + +/* index, on Lua stack, for capture list */ +#define caplistidx(ptop) ((ptop) + 2) + +/* index, on Lua stack, for pattern's ktable */ +#define ktableidx(ptop) ((ptop) + 3) + +/* index, on Lua stack, for backtracking stack */ +#define stackidx(ptop) ((ptop) + 4) + + + +typedef unsigned char byte; + + +#define BITSPERCHAR 8 + +#define CHARSETSIZE ((UCHAR_MAX/BITSPERCHAR) + 1) + + + +typedef struct Charset { + byte cs[CHARSETSIZE]; +} Charset; + + + +#define loopset(v,b) { int v; for (v = 0; v < CHARSETSIZE; v++) {b;} } + +/* access to charset */ +#define treebuffer(t) ((byte *)((t) + 1)) + +/* number of slots needed for 'n' bytes */ +#define bytes2slots(n) (((n) - 1) / sizeof(TTree) + 1) + +/* set 'b' bit in charset 'cs' */ +#define setchar(cs,b) ((cs)[(b) >> 3] |= (1 << ((b) & 7))) + + +/* +** in capture instructions, 'kind' of capture and its offset are +** packed in field 'aux', 4 bits for each +*/ +#define getkind(op) ((op)->i.aux & 0xF) +#define getoff(op) (((op)->i.aux >> 4) & 0xF) +#define joinkindoff(k,o) ((k) | ((o) << 4)) + +#define MAXOFF 0xF +#define MAXAUX 0xFF + + +/* maximum number of bytes to look behind */ +#define MAXBEHIND MAXAUX + + +/* maximum size (in elements) for a pattern */ +#define MAXPATTSIZE (SHRT_MAX - 10) + + +/* size (in elements) for an instruction plus extra l bytes */ +#define instsize(l) (((l) + sizeof(Instruction) - 1)/sizeof(Instruction) + 1) + + +/* size (in elements) for a ISet instruction */ +#define CHARSETINSTSIZE instsize(CHARSETSIZE) + +/* size (in elements) for a IFunc instruction */ +#define funcinstsize(p) ((p)->i.aux + 2) + + + +#define testchar(st,c) (((int)(st)[((c) >> 3)] & (1 << ((c) & 7)))) + + +#endif + diff --git a/src/ext/lpeg/lpvm.c b/src/ext/lpeg/lpvm.c new file mode 100644 index 0000000..05a5f68 --- /dev/null +++ b/src/ext/lpeg/lpvm.c @@ -0,0 +1,364 @@ +/* +** $Id: lpvm.c,v 1.9 2016/06/03 20:11:18 roberto Exp $ +** Copyright 2007, Lua.org & PUC-Rio (see 'lpeg.html' for license) +*/ + +#include +#include + + +#include "lua.h" +#include "lauxlib.h" + +#include "lpcap.h" +#include "lptypes.h" +#include "lpvm.h" +#include "lpprint.h" + + +/* initial size for call/backtrack stack */ +#if !defined(INITBACK) +#define INITBACK MAXBACK +#endif + + +#define getoffset(p) (((p) + 1)->offset) + +static const Instruction giveup = {{IGiveup, 0, 0}}; + + +/* +** {====================================================== +** Virtual Machine +** ======================================================= +*/ + + +typedef struct Stack { + const char *s; /* saved position (or NULL for calls) */ + const Instruction *p; /* next instruction */ + int caplevel; +} Stack; + + +#define getstackbase(L, ptop) ((Stack *)lua_touserdata(L, stackidx(ptop))) + + +/* +** Make the size of the array of captures 'cap' twice as large as needed +** (which is 'captop'). ('n' is the number of new elements.) +*/ +static Capture *doublecap (lua_State *L, Capture *cap, int captop, + int n, int ptop) { + Capture *newc; + if (captop >= INT_MAX/((int)sizeof(Capture) * 2)) + luaL_error(L, "too many captures"); + newc = (Capture *)lua_newuserdata(L, captop * 2 * sizeof(Capture)); + memcpy(newc, cap, (captop - n) * sizeof(Capture)); + lua_replace(L, caplistidx(ptop)); + return newc; +} + + +/* +** Double the size of the stack +*/ +static Stack *doublestack (lua_State *L, Stack **stacklimit, int ptop) { + Stack *stack = getstackbase(L, ptop); + Stack *newstack; + int n = *stacklimit - stack; /* current stack size */ + int max, newn; + lua_getfield(L, LUA_REGISTRYINDEX, MAXSTACKIDX); + max = lua_tointeger(L, -1); /* maximum allowed size */ + lua_pop(L, 1); + if (n >= max) /* already at maximum size? */ + luaL_error(L, "backtrack stack overflow (current limit is %d)", max); + newn = 2 * n; /* new size */ + if (newn > max) newn = max; + newstack = (Stack *)lua_newuserdata(L, newn * sizeof(Stack)); + memcpy(newstack, stack, n * sizeof(Stack)); + lua_replace(L, stackidx(ptop)); + *stacklimit = newstack + newn; + return newstack + n; /* return next position */ +} + + +/* +** Interpret the result of a dynamic capture: false -> fail; +** true -> keep current position; number -> next position. +** Return new subject position. 'fr' is stack index where +** is the result; 'curr' is current subject position; 'limit' +** is subject's size. +*/ +static int resdyncaptures (lua_State *L, int fr, int curr, int limit) { + lua_Integer res; + if (!lua_toboolean(L, fr)) { /* false value? */ + lua_settop(L, fr - 1); /* remove results */ + return -1; /* and fail */ + } + else if (lua_isboolean(L, fr)) /* true? */ + res = curr; /* keep current position */ + else { + res = lua_tointeger(L, fr) - 1; /* new position */ + if (res < curr || res > limit) + luaL_error(L, "invalid position returned by match-time capture"); + } + lua_remove(L, fr); /* remove first result (offset) */ + return res; +} + + +/* +** Add capture values returned by a dynamic capture to the capture list +** 'base', nested inside a group capture. 'fd' indexes the first capture +** value, 'n' is the number of values (at least 1). +*/ +static void adddyncaptures (const char *s, Capture *base, int n, int fd) { + int i; + base[0].kind = Cgroup; /* create group capture */ + base[0].siz = 0; + base[0].idx = 0; /* make it an anonymous group */ + for (i = 1; i <= n; i++) { /* add runtime captures */ + base[i].kind = Cruntime; + base[i].siz = 1; /* mark it as closed */ + base[i].idx = fd + i - 1; /* stack index of capture value */ + base[i].s = s; + } + base[i].kind = Cclose; /* close group */ + base[i].siz = 1; + base[i].s = s; +} + + +/* +** Remove dynamic captures from the Lua stack (called in case of failure) +*/ +static int removedyncap (lua_State *L, Capture *capture, + int level, int last) { + int id = finddyncap(capture + level, capture + last); /* index of 1st cap. */ + int top = lua_gettop(L); + if (id == 0) return 0; /* no dynamic captures? */ + lua_settop(L, id - 1); /* remove captures */ + return top - id + 1; /* number of values removed */ +} + + +/* +** Opcode interpreter +*/ +const char *match (lua_State *L, const char *o, const char *s, const char *e, + Instruction *op, Capture *capture, int ptop) { + Stack stackbase[INITBACK]; + Stack *stacklimit = stackbase + INITBACK; + Stack *stack = stackbase; /* point to first empty slot in stack */ + int capsize = INITCAPSIZE; + int captop = 0; /* point to first empty slot in captures */ + int ndyncap = 0; /* number of dynamic captures (in Lua stack) */ + const Instruction *p = op; /* current instruction */ + stack->p = &giveup; stack->s = s; stack->caplevel = 0; stack++; + lua_pushlightuserdata(L, stackbase); + for (;;) { +#if defined(DEBUG) + printf("-------------------------------------\n"); + printcaplist(capture, capture + captop); + printf("s: |%s| stck:%d, dyncaps:%d, caps:%d ", + s, (int)(stack - getstackbase(L, ptop)), ndyncap, captop); + printinst(op, p); +#endif + assert(stackidx(ptop) + ndyncap == lua_gettop(L) && ndyncap <= captop); + switch ((Opcode)p->i.code) { + case IEnd: { + assert(stack == getstackbase(L, ptop) + 1); + capture[captop].kind = Cclose; + capture[captop].s = NULL; + return s; + } + case IGiveup: { + assert(stack == getstackbase(L, ptop)); + return NULL; + } + case IRet: { + assert(stack > getstackbase(L, ptop) && (stack - 1)->s == NULL); + p = (--stack)->p; + continue; + } + case IAny: { + if (s < e) { p++; s++; } + else goto fail; + continue; + } + case ITestAny: { + if (s < e) p += 2; + else p += getoffset(p); + continue; + } + case IChar: { + if ((byte)*s == p->i.aux && s < e) { p++; s++; } + else goto fail; + continue; + } + case ITestChar: { + if ((byte)*s == p->i.aux && s < e) p += 2; + else p += getoffset(p); + continue; + } + case ISet: { + int c = (byte)*s; + if (testchar((p+1)->buff, c) && s < e) + { p += CHARSETINSTSIZE; s++; } + else goto fail; + continue; + } + case ITestSet: { + int c = (byte)*s; + if (testchar((p + 2)->buff, c) && s < e) + p += 1 + CHARSETINSTSIZE; + else p += getoffset(p); + continue; + } + case IBehind: { + int n = p->i.aux; + if (n > s - o) goto fail; + s -= n; p++; + continue; + } + case ISpan: { + for (; s < e; s++) { + int c = (byte)*s; + if (!testchar((p+1)->buff, c)) break; + } + p += CHARSETINSTSIZE; + continue; + } + case IJmp: { + p += getoffset(p); + continue; + } + case IChoice: { + if (stack == stacklimit) + stack = doublestack(L, &stacklimit, ptop); + stack->p = p + getoffset(p); + stack->s = s; + stack->caplevel = captop; + stack++; + p += 2; + continue; + } + case ICall: { + if (stack == stacklimit) + stack = doublestack(L, &stacklimit, ptop); + stack->s = NULL; + stack->p = p + 2; /* save return address */ + stack++; + p += getoffset(p); + continue; + } + case ICommit: { + assert(stack > getstackbase(L, ptop) && (stack - 1)->s != NULL); + stack--; + p += getoffset(p); + continue; + } + case IPartialCommit: { + assert(stack > getstackbase(L, ptop) && (stack - 1)->s != NULL); + (stack - 1)->s = s; + (stack - 1)->caplevel = captop; + p += getoffset(p); + continue; + } + case IBackCommit: { + assert(stack > getstackbase(L, ptop) && (stack - 1)->s != NULL); + s = (--stack)->s; + captop = stack->caplevel; + p += getoffset(p); + continue; + } + case IFailTwice: + assert(stack > getstackbase(L, ptop)); + stack--; + /* go through */ + case IFail: + fail: { /* pattern failed: try to backtrack */ + do { /* remove pending calls */ + assert(stack > getstackbase(L, ptop)); + s = (--stack)->s; + } while (s == NULL); + if (ndyncap > 0) /* is there matchtime captures? */ + ndyncap -= removedyncap(L, capture, stack->caplevel, captop); + captop = stack->caplevel; + p = stack->p; +#if defined(DEBUG) + printf("**FAIL**\n"); +#endif + continue; + } + case ICloseRunTime: { + CapState cs; + int rem, res, n; + int fr = lua_gettop(L) + 1; /* stack index of first result */ + cs.s = o; cs.L = L; cs.ocap = capture; cs.ptop = ptop; + n = runtimecap(&cs, capture + captop, s, &rem); /* call function */ + captop -= n; /* remove nested captures */ + ndyncap -= rem; /* update number of dynamic captures */ + fr -= rem; /* 'rem' items were popped from Lua stack */ + res = resdyncaptures(L, fr, s - o, e - o); /* get result */ + if (res == -1) /* fail? */ + goto fail; + s = o + res; /* else update current position */ + n = lua_gettop(L) - fr + 1; /* number of new captures */ + ndyncap += n; /* update number of dynamic captures */ + if (n > 0) { /* any new capture? */ + if (fr + n >= SHRT_MAX) + luaL_error(L, "too many results in match-time capture"); + if ((captop += n + 2) >= capsize) { + capture = doublecap(L, capture, captop, n + 2, ptop); + capsize = 2 * captop; + } + /* add new captures to 'capture' list */ + adddyncaptures(s, capture + captop - n - 2, n, fr); + } + p++; + continue; + } + case ICloseCapture: { + const char *s1 = s; + assert(captop > 0); + /* if possible, turn capture into a full capture */ + if (capture[captop - 1].siz == 0 && + s1 - capture[captop - 1].s < UCHAR_MAX) { + capture[captop - 1].siz = s1 - capture[captop - 1].s + 1; + p++; + continue; + } + else { + capture[captop].siz = 1; /* mark entry as closed */ + capture[captop].s = s; + goto pushcapture; + } + } + case IOpenCapture: + capture[captop].siz = 0; /* mark entry as open */ + capture[captop].s = s; + goto pushcapture; + case IFullCapture: + capture[captop].siz = getoff(p) + 1; /* save capture size */ + capture[captop].s = s - getoff(p); + /* goto pushcapture; */ + pushcapture: { + capture[captop].idx = p->i.key; + capture[captop].kind = getkind(p); + if (++captop >= capsize) { + capture = doublecap(L, capture, captop, 0, ptop); + capsize = 2 * captop; + } + p++; + continue; + } + default: assert(0); return NULL; + } + } +} + +/* }====================================================== */ + + diff --git a/src/ext/lpeg/lpvm.h b/src/ext/lpeg/lpvm.h new file mode 100644 index 0000000..757b9e1 --- /dev/null +++ b/src/ext/lpeg/lpvm.h @@ -0,0 +1,58 @@ +/* +** $Id: lpvm.h,v 1.3 2014/02/21 13:06:41 roberto Exp $ +*/ + +#if !defined(lpvm_h) +#define lpvm_h + +#include "lpcap.h" + + +/* Virtual Machine's instructions */ +typedef enum Opcode { + IAny, /* if no char, fail */ + IChar, /* if char != aux, fail */ + ISet, /* if char not in buff, fail */ + ITestAny, /* in no char, jump to 'offset' */ + ITestChar, /* if char != aux, jump to 'offset' */ + ITestSet, /* if char not in buff, jump to 'offset' */ + ISpan, /* read a span of chars in buff */ + IBehind, /* walk back 'aux' characters (fail if not possible) */ + IRet, /* return from a rule */ + IEnd, /* end of pattern */ + IChoice, /* stack a choice; next fail will jump to 'offset' */ + IJmp, /* jump to 'offset' */ + ICall, /* call rule at 'offset' */ + IOpenCall, /* call rule number 'key' (must be closed to a ICall) */ + ICommit, /* pop choice and jump to 'offset' */ + IPartialCommit, /* update top choice to current position and jump */ + IBackCommit, /* "fails" but jump to its own 'offset' */ + IFailTwice, /* pop one choice and then fail */ + IFail, /* go back to saved state on choice and jump to saved offset */ + IGiveup, /* internal use */ + IFullCapture, /* complete capture of last 'off' chars */ + IOpenCapture, /* start a capture */ + ICloseCapture, + ICloseRunTime +} Opcode; + + + +typedef union Instruction { + struct Inst { + byte code; + byte aux; + short key; + } i; + int offset; + byte buff[1]; +} Instruction; + + +void printpatt (Instruction *p, int n); +const char *match (lua_State *L, const char *o, const char *s, const char *e, + Instruction *op, Capture *capture, int ptop); + + +#endif + diff --git a/src/ext/lpeg/makefile b/src/ext/lpeg/makefile new file mode 100644 index 0000000..7a8463e --- /dev/null +++ b/src/ext/lpeg/makefile @@ -0,0 +1,55 @@ +LIBNAME = lpeg +LUADIR = ../lua/ + +COPT = -O2 +# COPT = -DLPEG_DEBUG -g + +CWARNS = -Wall -Wextra -pedantic \ + -Waggregate-return \ + -Wcast-align \ + -Wcast-qual \ + -Wdisabled-optimization \ + -Wpointer-arith \ + -Wshadow \ + -Wsign-compare \ + -Wundef \ + -Wwrite-strings \ + -Wbad-function-cast \ + -Wdeclaration-after-statement \ + -Wmissing-prototypes \ + -Wnested-externs \ + -Wstrict-prototypes \ +# -Wunreachable-code \ + + +CFLAGS = $(CWARNS) $(COPT) -std=c99 -I$(LUADIR) -fPIC +CC = gcc + +FILES = lpvm.o lpcap.o lptree.o lpcode.o lpprint.o + +# For Linux +linux: + make lpeg.so "DLLFLAGS = -shared -fPIC" + +# For Mac OS +macosx: + make lpeg.so "DLLFLAGS = -bundle -undefined dynamic_lookup" + +lpeg.so: $(FILES) + env $(CC) $(DLLFLAGS) $(FILES) -o lpeg.so + +$(FILES): makefile + +test: test.lua re.lua lpeg.so + ./test.lua + +clean: + rm -f $(FILES) lpeg.so + + +lpcap.o: lpcap.c lpcap.h lptypes.h +lpcode.o: lpcode.c lptypes.h lpcode.h lptree.h lpvm.h lpcap.h +lpprint.o: lpprint.c lptypes.h lpprint.h lptree.h lpvm.h lpcap.h +lptree.o: lptree.c lptypes.h lpcap.h lpcode.h lptree.h lpvm.h lpprint.h +lpvm.o: lpvm.c lpcap.h lptypes.h lpvm.h lpprint.h lptree.h + diff --git a/src/ext/lpeg/re.html b/src/ext/lpeg/re.html new file mode 100644 index 0000000..32f0a45 --- /dev/null +++ b/src/ext/lpeg/re.html @@ -0,0 +1,498 @@ + + + + LPeg.re - Regex syntax for LPEG + + + + + + + +
+ +
+ +
LPeg.re
+
+ Regex syntax for LPEG +
+
+ +
+ + + +
+ +

The re Module

+ +

+The re module +(provided by file re.lua in the distribution) +supports a somewhat conventional regex syntax +for pattern usage within LPeg. +

+ +

+The next table summarizes re's syntax. +A p represents an arbitrary pattern; +num represents a number ([0-9]+); +name represents an identifier +([a-zA-Z][a-zA-Z0-9_]*). +Constructions are listed in order of decreasing precedence. + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
SyntaxDescription
( p ) grouping
'string' literal string
"string" literal string
[class] character class
. any character
%namepattern defs[name] or a pre-defined pattern
namenon terminal
<name>non terminal
{} position capture
{ p } simple capture
{: p :} anonymous group capture
{:name: p :} named group capture
{~ p ~} substitution capture
{| p |} table capture
=name back reference +
p ? optional match
p * zero or more repetitions
p + one or more repetitions
p^num exactly n repetitions
p^+numat least n repetitions
p^-numat most n repetitions
p -> 'string' string capture
p -> "string" string capture
p -> num numbered capture
p -> name function/query/string capture +equivalent to p / defs[name]
p => name match-time capture +equivalent to lpeg.Cmt(p, defs[name])
& p and predicate
! p not predicate
p1 p2 concatenation
p1 / p2 ordered choice
(name <- p)+ grammar
+

+Any space appearing in a syntax description can be +replaced by zero or more space characters and Lua-style comments +(-- until end of line). +

+ +

+Character classes define sets of characters. +An initial ^ complements the resulting set. +A range x-y includes in the set +all characters with codes between the codes of x and y. +A pre-defined class %name includes all +characters of that class. +A simple character includes itself in the set. +The only special characters inside a class are ^ +(special only if it is the first character); +] +(can be included in the set as the first character, +after the optional ^); +% (special only if followed by a letter); +and - +(can be included in the set as the first or the last character). +

+ +

+Currently the pre-defined classes are similar to those from the +Lua's string library +(%a for letters, +%A for non letters, etc.). +There is also a class %nl +containing only the newline character, +which is particularly handy for grammars written inside long strings, +as long strings do not interpret escape sequences like \n. +

+ + +

Functions

+ +

re.compile (string, [, defs])

+

+Compiles the given string and +returns an equivalent LPeg pattern. +The given string may define either an expression or a grammar. +The optional defs table provides extra Lua values +to be used by the pattern. +

+ +

re.find (subject, pattern [, init])

+

+Searches the given pattern in the given subject. +If it finds a match, +returns the index where this occurrence starts and +the index where it ends. +Otherwise, returns nil. +

+ +

+An optional numeric argument init makes the search +starts at that position in the subject string. +As usual in Lua libraries, +a negative value counts from the end. +

+ +

re.gsub (subject, pattern, replacement)

+

+Does a global substitution, +replacing all occurrences of pattern +in the given subject by replacement. + +

re.match (subject, pattern)

+

+Matches the given pattern against the given subject, +returning all captures. +

+ +

re.updatelocale ()

+

+Updates the pre-defined character classes to the current locale. +

+ + +

Some Examples

+ +

A complete simple program

+

+The next code shows a simple complete Lua program using +the re module: +

+
+local re = require"re"
+
+-- find the position of the first numeral in a string
+print(re.find("the number 423 is odd", "[0-9]+"))  --> 12    14
+
+-- returns all words in a string
+print(re.match("the number 423 is odd", "({%a+} / .)*"))
+--> the    number    is    odd
+
+-- returns the first numeral in a string
+print(re.match("the number 423 is odd", "s <- {%d+} / . s"))
+--> 423
+
+print(re.gsub("hello World", "[aeiou]", "."))
+--> h.ll. W.rld
+
+ + +

Balanced parentheses

+

+The following call will produce the same pattern produced by the +Lua expression in the +balanced parentheses example: +

+
+b = re.compile[[  balanced <- "(" ([^()] / balanced)* ")"  ]]
+
+ +

String reversal

+

+The next example reverses a string: +

+
+rev = re.compile[[ R <- (!.) -> '' / ({.} R) -> '%2%1']]
+print(rev:match"0123456789")   --> 9876543210
+
+ +

CSV decoder

+

+The next example replicates the CSV decoder: +

+
+record = re.compile[[
+  record <- {| field (',' field)* |} (%nl / !.)
+  field <- escaped / nonescaped
+  nonescaped <- { [^,"%nl]* }
+  escaped <- '"' {~ ([^"] / '""' -> '"')* ~} '"'
+]]
+
+ +

Lua's long strings

+

+The next example matches Lua long strings: +

+
+c = re.compile([[
+  longstring <- ('[' {:eq: '='* :} '[' close)
+  close <- ']' =eq ']' / . close
+]])
+
+print(c:match'[==[]]===]]]]==]===[]')   --> 17
+
+ +

Abstract Syntax Trees

+

+This example shows a simple way to build an +abstract syntax tree (AST) for a given grammar. +To keep our example simple, +let us consider the following grammar +for lists of names: +

+
+p = re.compile[[
+      listname <- (name s)*
+      name <- [a-z][a-z]*
+      s <- %s*
+]]
+
+

+Now, we will add captures to build a corresponding AST. +As a first step, the pattern will build a table to +represent each non terminal; +terminals will be represented by their corresponding strings: +

+
+c = re.compile[[
+      listname <- {| (name s)* |}
+      name <- {| {[a-z][a-z]*} |}
+      s <- %s*
+]]
+
+

+Now, a match against "hi hello bye" +results in the table +{{"hi"}, {"hello"}, {"bye"}}. +

+

+For such a simple grammar, +this AST is more than enough; +actually, the tables around each single name +are already overkilling. +More complex grammars, +however, may need some more structure. +Specifically, +it would be useful if each table had +a tag field telling what non terminal +that table represents. +We can add such a tag using +named group captures: +

+
+x = re.compile[[
+      listname <- {| {:tag: '' -> 'list':} (name s)* |}
+      name <- {| {:tag: '' -> 'id':} {[a-z][a-z]*} |}
+      s <- ' '*
+]]
+
+

+With these group captures, +a match against "hi hello bye" +results in the following table: +

+
+{tag="list",
+  {tag="id", "hi"},
+  {tag="id", "hello"},
+  {tag="id", "bye"}
+}
+
+ + +

Indented blocks

+

+This example breaks indented blocks into tables, +respecting the indentation: +

+
+p = re.compile[[
+  block <- {| {:ident:' '*:} line
+           ((=ident !' ' line) / &(=ident ' ') block)* |}
+  line <- {[^%nl]*} %nl
+]]
+
+

+As an example, +consider the following text: +

+
+t = p:match[[
+first line
+  subline 1
+  subline 2
+second line
+third line
+  subline 3.1
+    subline 3.1.1
+  subline 3.2
+]]
+
+

+The resulting table t will be like this: +

+
+   {'first line'; {'subline 1'; 'subline 2'; ident = '  '};
+    'second line';
+    'third line'; { 'subline 3.1'; {'subline 3.1.1'; ident = '    '};
+                    'subline 3.2'; ident = '  '};
+    ident = ''}
+
+ +

Macro expander

+

+This example implements a simple macro expander. +Macros must be defined as part of the pattern, +following some simple rules: +

+
+p = re.compile[[
+      text <- {~ item* ~}
+      item <- macro / [^()] / '(' item* ')'
+      arg <- ' '* {~ (!',' item)* ~}
+      args <- '(' arg (',' arg)* ')'
+      -- now we define some macros
+      macro <- ('apply' args) -> '%1(%2)'
+             / ('add' args) -> '%1 + %2'
+             / ('mul' args) -> '%1 * %2'
+]]
+
+print(p:match"add(mul(a,b), apply(f,x))")   --> a * b + f(x)
+
+

+A text is a sequence of items, +wherein we apply a substitution capture to expand any macros. +An item is either a macro, +any character different from parentheses, +or a parenthesized expression. +A macro argument (arg) is a sequence +of items different from a comma. +(Note that a comma may appear inside an item, +e.g., inside a parenthesized expression.) +Again we do a substitution capture to expand any macro +in the argument before expanding the outer macro. +args is a list of arguments separated by commas. +Finally we define the macros. +Each macro is a string substitution; +it replaces the macro name and its arguments by its corresponding string, +with each %n replaced by the n-th argument. +

+ +

Patterns

+

+This example shows the complete syntax +of patterns accepted by re. +

+
+p = [=[
+
+pattern         <- exp !.
+exp             <- S (grammar / alternative)
+
+alternative     <- seq ('/' S seq)*
+seq             <- prefix*
+prefix          <- '&' S prefix / '!' S prefix / suffix
+suffix          <- primary S (([+*?]
+                            / '^' [+-]? num
+                            / '->' S (string / '{}' / name)
+                            / '=>' S name) S)*
+
+primary         <- '(' exp ')' / string / class / defined
+                 / '{:' (name ':')? exp ':}'
+                 / '=' name
+                 / '{}'
+                 / '{~' exp '~}'
+                 / '{' exp '}'
+                 / '.'
+                 / name S !arrow
+                 / '<' name '>'          -- old-style non terminals
+
+grammar         <- definition+
+definition      <- name S arrow exp
+
+class           <- '[' '^'? item (!']' item)* ']'
+item            <- defined / range / .
+range           <- . '-' [^]]
+
+S               <- (%s / '--' [^%nl]*)*   -- spaces and comments
+name            <- [A-Za-z][A-Za-z0-9_]*
+arrow           <- '<-'
+num             <- [0-9]+
+string          <- '"' [^"]* '"' / "'" [^']* "'"
+defined         <- '%' name
+
+]=]
+
+print(re.match(p, p))   -- a self description must match itself
+
+ + + +

License

+ +

+Copyright © 2008-2015 Lua.org, PUC-Rio. +

+

+Permission is hereby granted, free of charge, +to any person obtaining a copy of this software and +associated documentation files (the "Software"), +to deal in the Software without restriction, +including without limitation the rights to use, +copy, modify, merge, publish, distribute, sublicense, +and/or sell copies of the Software, +and to permit persons to whom the Software is +furnished to do so, +subject to the following conditions: +

+ +

+The above copyright notice and this permission notice +shall be included in all copies or substantial portions of the Software. +

+ +

+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +EXPRESS OR IMPLIED, +INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, +DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +

+ +
+ +
+ +
+

+$Id: re.html,v 1.24 2016/09/20 17:41:27 roberto Exp $ +

+
+ +
+ + + diff --git a/src/ext/lpeg/re.lua b/src/ext/lpeg/re.lua new file mode 100644 index 0000000..3b9974f --- /dev/null +++ b/src/ext/lpeg/re.lua @@ -0,0 +1,259 @@ +-- $Id: re.lua,v 1.44 2013/03/26 20:11:40 roberto Exp $ + +-- imported functions and modules +local tonumber, type, print, error = tonumber, type, print, error +local setmetatable = setmetatable +local m = require"lpeg" + +-- 'm' will be used to parse expressions, and 'mm' will be used to +-- create expressions; that is, 're' runs on 'm', creating patterns +-- on 'mm' +local mm = m + +-- pattern's metatable +local mt = getmetatable(mm.P(0)) + + + +-- No more global accesses after this point +local version = _VERSION +if version == "Lua 5.2" then _ENV = nil end + + +local any = m.P(1) + + +-- Pre-defined names +local Predef = { nl = m.P"\n" } + + +local mem +local fmem +local gmem + + +local function updatelocale () + mm.locale(Predef) + Predef.a = Predef.alpha + Predef.c = Predef.cntrl + Predef.d = Predef.digit + Predef.g = Predef.graph + Predef.l = Predef.lower + Predef.p = Predef.punct + Predef.s = Predef.space + Predef.u = Predef.upper + Predef.w = Predef.alnum + Predef.x = Predef.xdigit + Predef.A = any - Predef.a + Predef.C = any - Predef.c + Predef.D = any - Predef.d + Predef.G = any - Predef.g + Predef.L = any - Predef.l + Predef.P = any - Predef.p + Predef.S = any - Predef.s + Predef.U = any - Predef.u + Predef.W = any - Predef.w + Predef.X = any - Predef.x + mem = {} -- restart memoization + fmem = {} + gmem = {} + local mt = {__mode = "v"} + setmetatable(mem, mt) + setmetatable(fmem, mt) + setmetatable(gmem, mt) +end + + +updatelocale() + + + +local I = m.P(function (s,i) print(i, s:sub(1, i-1)); return i end) + + +local function getdef (id, defs) + local c = defs and defs[id] + if not c then error("undefined name: " .. id) end + return c +end + + +local function patt_error (s, i) + local msg = (#s < i + 20) and s:sub(i) + or s:sub(i,i+20) .. "..." + msg = ("pattern error near '%s'"):format(msg) + error(msg, 2) +end + +local function mult (p, n) + local np = mm.P(true) + while n >= 1 do + if n%2 >= 1 then np = np * p end + p = p * p + n = n/2 + end + return np +end + +local function equalcap (s, i, c) + if type(c) ~= "string" then return nil end + local e = #c + i + if s:sub(i, e - 1) == c then return e else return nil end +end + + +local S = (Predef.space + "--" * (any - Predef.nl)^0)^0 + +local name = m.R("AZ", "az", "__") * m.R("AZ", "az", "__", "09")^0 + +local arrow = S * "<-" + +local seq_follow = m.P"/" + ")" + "}" + ":}" + "~}" + "|}" + (name * arrow) + -1 + +name = m.C(name) + + +-- a defined name only have meaning in a given environment +local Def = name * m.Carg(1) + +local num = m.C(m.R"09"^1) * S / tonumber + +local String = "'" * m.C((any - "'")^0) * "'" + + '"' * m.C((any - '"')^0) * '"' + + +local defined = "%" * Def / function (c,Defs) + local cat = Defs and Defs[c] or Predef[c] + if not cat then error ("name '" .. c .. "' undefined") end + return cat +end + +local Range = m.Cs(any * (m.P"-"/"") * (any - "]")) / mm.R + +local item = defined + Range + m.C(any) + +local Class = + "[" + * (m.C(m.P"^"^-1)) -- optional complement symbol + * m.Cf(item * (item - "]")^0, mt.__add) / + function (c, p) return c == "^" and any - p or p end + * "]" + +local function adddef (t, k, exp) + if t[k] then + error("'"..k.."' already defined as a rule") + else + t[k] = exp + end + return t +end + +local function firstdef (n, r) return adddef({n}, n, r) end + + +local function NT (n, b) + if not b then + error("rule '"..n.."' used outside a grammar") + else return mm.V(n) + end +end + + +local exp = m.P{ "Exp", + Exp = S * ( m.V"Grammar" + + m.Cf(m.V"Seq" * ("/" * S * m.V"Seq")^0, mt.__add) ); + Seq = m.Cf(m.Cc(m.P"") * m.V"Prefix"^0 , mt.__mul) + * (#seq_follow + patt_error); + Prefix = "&" * S * m.V"Prefix" / mt.__len + + "!" * S * m.V"Prefix" / mt.__unm + + m.V"Suffix"; + Suffix = m.Cf(m.V"Primary" * S * + ( ( m.P"+" * m.Cc(1, mt.__pow) + + m.P"*" * m.Cc(0, mt.__pow) + + m.P"?" * m.Cc(-1, mt.__pow) + + "^" * ( m.Cg(num * m.Cc(mult)) + + m.Cg(m.C(m.S"+-" * m.R"09"^1) * m.Cc(mt.__pow)) + ) + + "->" * S * ( m.Cg((String + num) * m.Cc(mt.__div)) + + m.P"{}" * m.Cc(nil, m.Ct) + + m.Cg(Def / getdef * m.Cc(mt.__div)) + ) + + "=>" * S * m.Cg(Def / getdef * m.Cc(m.Cmt)) + ) * S + )^0, function (a,b,f) return f(a,b) end ); + Primary = "(" * m.V"Exp" * ")" + + String / mm.P + + Class + + defined + + "{:" * (name * ":" + m.Cc(nil)) * m.V"Exp" * ":}" / + function (n, p) return mm.Cg(p, n) end + + "=" * name / function (n) return mm.Cmt(mm.Cb(n), equalcap) end + + m.P"{}" / mm.Cp + + "{~" * m.V"Exp" * "~}" / mm.Cs + + "{|" * m.V"Exp" * "|}" / mm.Ct + + "{" * m.V"Exp" * "}" / mm.C + + m.P"." * m.Cc(any) + + (name * -arrow + "<" * name * ">") * m.Cb("G") / NT; + Definition = name * arrow * m.V"Exp"; + Grammar = m.Cg(m.Cc(true), "G") * + m.Cf(m.V"Definition" / firstdef * m.Cg(m.V"Definition")^0, + adddef) / mm.P +} + +local pattern = S * m.Cg(m.Cc(false), "G") * exp / mm.P * (-any + patt_error) + + +local function compile (p, defs) + if mm.type(p) == "pattern" then return p end -- already compiled + local cp = pattern:match(p, 1, defs) + if not cp then error("incorrect pattern", 3) end + return cp +end + +local function match (s, p, i) + local cp = mem[p] + if not cp then + cp = compile(p) + mem[p] = cp + end + return cp:match(s, i or 1) +end + +local function find (s, p, i) + local cp = fmem[p] + if not cp then + cp = compile(p) / 0 + cp = mm.P{ mm.Cp() * cp * mm.Cp() + 1 * mm.V(1) } + fmem[p] = cp + end + local i, e = cp:match(s, i or 1) + if i then return i, e - 1 + else return i + end +end + +local function gsub (s, p, rep) + local g = gmem[p] or {} -- ensure gmem[p] is not collected while here + gmem[p] = g + local cp = g[rep] + if not cp then + cp = compile(p) + cp = mm.Cs((cp / rep + 1)^0) + g[rep] = cp + end + return cp:match(s) +end + + +-- exported names +local re = { + compile = compile, + match = match, + find = find, + gsub = gsub, + updatelocale = updatelocale, +} + +if version == "Lua 5.1" then _G.re = re end + +return re diff --git a/src/ext/lpeg/test.lua b/src/ext/lpeg/test.lua new file mode 100755 index 0000000..20ad07f --- /dev/null +++ b/src/ext/lpeg/test.lua @@ -0,0 +1,1503 @@ +#!/usr/bin/env lua + +-- $Id: test.lua,v 1.112 2017/01/14 18:55:22 roberto Exp $ + +-- require"strict" -- just to be pedantic + +local m = require"lpeg" + + +-- for general use +local a, b, c, d, e, f, g, p, t + + +-- compatibility with Lua 5.2 +local unpack = rawget(table, "unpack") or unpack +local loadstring = rawget(_G, "loadstring") or load + + +local any = m.P(1) +local space = m.S" \t\n"^0 + +local function checkeq (x, y, p) +if p then print(x,y) end + if type(x) ~= "table" then assert(x == y) + else + for k,v in pairs(x) do checkeq(v, y[k], p) end + for k,v in pairs(y) do checkeq(v, x[k], p) end + end +end + + +local mt = getmetatable(m.P(1)) + + +local allchar = {} +for i=0,255 do allchar[i + 1] = i end +allchar = string.char(unpack(allchar)) +assert(#allchar == 256) + +local function cs2str (c) + return m.match(m.Cs((c + m.P(1)/"")^0), allchar) +end + +local function eqcharset (c1, c2) + assert(cs2str(c1) == cs2str(c2)) +end + + +print"General tests for LPeg library" + +assert(type(m.version()) == "string") +print("version " .. m.version()) +assert(m.type("alo") ~= "pattern") +assert(m.type(io.input) ~= "pattern") +assert(m.type(m.P"alo") == "pattern") + +-- tests for some basic optimizations +assert(m.match(m.P(false) + "a", "a") == 2) +assert(m.match(m.P(true) + "a", "a") == 1) +assert(m.match("a" + m.P(false), "b") == nil) +assert(m.match("a" + m.P(true), "b") == 1) + +assert(m.match(m.P(false) * "a", "a") == nil) +assert(m.match(m.P(true) * "a", "a") == 2) +assert(m.match("a" * m.P(false), "a") == nil) +assert(m.match("a" * m.P(true), "a") == 2) + +assert(m.match(#m.P(false) * "a", "a") == nil) +assert(m.match(#m.P(true) * "a", "a") == 2) +assert(m.match("a" * #m.P(false), "a") == nil) +assert(m.match("a" * #m.P(true), "a") == 2) + + +-- tests for locale +do + assert(m.locale(m) == m) + local t = {} + assert(m.locale(t, m) == t) + local x = m.locale() + for n,v in pairs(x) do + assert(type(n) == "string") + eqcharset(v, m[n]) + end +end + + +assert(m.match(3, "aaaa")) +assert(m.match(4, "aaaa")) +assert(not m.match(5, "aaaa")) +assert(m.match(-3, "aa")) +assert(not m.match(-3, "aaa")) +assert(not m.match(-3, "aaaa")) +assert(not m.match(-4, "aaaa")) +assert(m.P(-5):match"aaaa") + +assert(m.match("a", "alo") == 2) +assert(m.match("al", "alo") == 3) +assert(not m.match("alu", "alo")) +assert(m.match(true, "") == 1) + +local digit = m.S"0123456789" +local upper = m.S"ABCDEFGHIJKLMNOPQRSTUVWXYZ" +local lower = m.S"abcdefghijklmnopqrstuvwxyz" +local letter = m.S"" + upper + lower +local alpha = letter + digit + m.R() + +eqcharset(m.S"", m.P(false)) +eqcharset(upper, m.R("AZ")) +eqcharset(lower, m.R("az")) +eqcharset(upper + lower, m.R("AZ", "az")) +eqcharset(upper + lower, m.R("AZ", "cz", "aa", "bb", "90")) +eqcharset(digit, m.S"01234567" + "8" + "9") +eqcharset(upper, letter - lower) +eqcharset(m.S(""), m.R()) +assert(cs2str(m.S("")) == "") + +eqcharset(m.S"\0", "\0") +eqcharset(m.S"\1\0\2", m.R"\0\2") +eqcharset(m.S"\1\0\2", m.R"\1\2" + "\0") +eqcharset(m.S"\1\0\2" - "\0", m.R"\1\2") + +local word = alpha^1 * (1 - alpha)^0 + +assert((word^0 * -1):match"alo alo") +assert(m.match(word^1 * -1, "alo alo")) +assert(m.match(word^2 * -1, "alo alo")) +assert(not m.match(word^3 * -1, "alo alo")) + +assert(not m.match(word^-1 * -1, "alo alo")) +assert(m.match(word^-2 * -1, "alo alo")) +assert(m.match(word^-3 * -1, "alo alo")) + +local eos = m.P(-1) + +assert(m.match(digit^0 * letter * digit * eos, "1298a1")) +assert(not m.match(digit^0 * letter * eos, "1257a1")) + +b = { + [1] = "(" * (((1 - m.S"()") + #m.P"(" * m.V(1))^0) * ")" +} + +assert(m.match(b, "(al())()")) +assert(not m.match(b * eos, "(al())()")) +assert(m.match(b * eos, "((al())()(é))")) +assert(not m.match(b, "(al()()")) + +assert(not m.match(letter^1 - "for", "foreach")) +assert(m.match(letter^1 - ("for" * eos), "foreach")) +assert(not m.match(letter^1 - ("for" * eos), "for")) + +function basiclookfor (p) + return m.P { + [1] = p + (1 * m.V(1)) + } +end + +function caplookfor (p) + return basiclookfor(p:C()) +end + +assert(m.match(caplookfor(letter^1), " 4achou123...") == "achou") +a = {m.match(caplookfor(letter^1)^0, " two words, one more ")} +checkeq(a, {"two", "words", "one", "more"}) + +assert(m.match( basiclookfor((#m.P(b) * 1) * m.Cp()), " ( (a)") == 7) + +a = {m.match(m.C(digit^1 * m.Cc"d") + m.C(letter^1 * m.Cc"l"), "123")} +checkeq(a, {"123", "d"}) + +-- bug in LPeg 0.12 (nil value does not create a 'ktable') +assert(m.match(m.Cc(nil), "") == nil) + +a = {m.match(m.C(digit^1 * m.Cc"d") + m.C(letter^1 * m.Cc"l"), "abcd")} +checkeq(a, {"abcd", "l"}) + +a = {m.match(m.Cc(10,20,30) * 'a' * m.Cp(), 'aaa')} +checkeq(a, {10,20,30,2}) +a = {m.match(m.Cp() * m.Cc(10,20,30) * 'a' * m.Cp(), 'aaa')} +checkeq(a, {1,10,20,30,2}) +a = m.match(m.Ct(m.Cp() * m.Cc(10,20,30) * 'a' * m.Cp()), 'aaa') +checkeq(a, {1,10,20,30,2}) +a = m.match(m.Ct(m.Cp() * m.Cc(7,8) * m.Cc(10,20,30) * 'a' * m.Cp()), 'aaa') +checkeq(a, {1,7,8,10,20,30,2}) +a = {m.match(m.Cc() * m.Cc() * m.Cc(1) * m.Cc(2,3,4) * m.Cc() * 'a', 'aaa')} +checkeq(a, {1,2,3,4}) + +a = {m.match(m.Cp() * letter^1 * m.Cp(), "abcd")} +checkeq(a, {1, 5}) + + +t = {m.match({[1] = m.C(m.C(1) * m.V(1) + -1)}, "abc")} +checkeq(t, {"abc", "a", "bc", "b", "c", "c", ""}) + +-- bug in 0.12 ('hascapture' did not check for captures inside a rule) +do + local pat = m.P{ + 'S'; + S1 = m.C('abc') + 3, + S = #m.V('S1') -- rule has capture, but '#' must ignore it + } + assert(pat:match'abc' == 1) +end + + +-- bug: loop in 'hascaptures' +do + local p = m.C(-m.P{m.P'x' * m.V(1) + m.P'y'}) + assert(p:match("xxx") == "") +end + + + +-- test for small capture boundary +for i = 250,260 do + assert(#m.match(m.C(i), string.rep('a', i)) == i) + assert(#m.match(m.C(m.C(i)), string.rep('a', i)) == i) +end + +-- tests for any*n and any*-n +for n = 1, 550, 13 do + local x_1 = string.rep('x', n - 1) + local x = x_1 .. 'a' + assert(not m.P(n):match(x_1)) + assert(m.P(n):match(x) == n + 1) + assert(n < 4 or m.match(m.P(n) + "xxx", x_1) == 4) + assert(m.C(n):match(x) == x) + assert(m.C(m.C(n)):match(x) == x) + assert(m.P(-n):match(x_1) == 1) + assert(not m.P(-n):match(x)) + assert(n < 13 or m.match(m.Cc(20) * ((n - 13) * m.P(10)) * 3, x) == 20) + local n3 = math.floor(n/3) + assert(m.match(n3 * m.Cp() * n3 * n3, x) == n3 + 1) +end + +-- true values +assert(m.P(0):match("x") == 1) +assert(m.P(0):match("") == 1) +assert(m.C(0):match("x") == "") + +assert(m.match(m.Cc(0) * m.P(10) + m.Cc(1) * "xuxu", "xuxu") == 1) +assert(m.match(m.Cc(0) * m.P(10) + m.Cc(1) * "xuxu", "xuxuxuxuxu") == 0) +assert(m.match(m.C(m.P(2)^1), "abcde") == "abcd") +p = m.Cc(0) * 1 + m.Cc(1) * 2 + m.Cc(2) * 3 + m.Cc(3) * 4 + + +-- test for alternation optimization +assert(m.match(m.P"a"^1 + "ab" + m.P"x"^0, "ab") == 2) +assert(m.match((m.P"a"^1 + "ab" + m.P"x"^0 * 1)^0, "ab") == 3) +assert(m.match(m.P"ab" + "cd" + "" + "cy" + "ak", "98") == 1) +assert(m.match(m.P"ab" + "cd" + "ax" + "cy", "ax") == 3) +assert(m.match("a" * m.P"b"^0 * "c" + "cd" + "ax" + "cy", "ax") == 3) +assert(m.match((m.P"ab" + "cd" + "ax" + "cy")^0, "ax") == 3) +assert(m.match(m.P(1) * "x" + m.S"" * "xu" + "ay", "ay") == 3) +assert(m.match(m.P"abc" + "cde" + "aka", "aka") == 4) +assert(m.match(m.S"abc" * "x" + "cde" + "aka", "ax") == 3) +assert(m.match(m.S"abc" * "x" + "cde" + "aka", "aka") == 4) +assert(m.match(m.S"abc" * "x" + "cde" + "aka", "cde") == 4) +assert(m.match(m.S"abc" * "x" + "ide" + m.S"ab" * "ka", "aka") == 4) +assert(m.match("ab" + m.S"abc" * m.P"y"^0 * "x" + "cde" + "aka", "ax") == 3) +assert(m.match("ab" + m.S"abc" * m.P"y"^0 * "x" + "cde" + "aka", "aka") == 4) +assert(m.match("ab" + m.S"abc" * m.P"y"^0 * "x" + "cde" + "aka", "cde") == 4) +assert(m.match("ab" + m.S"abc" * m.P"y"^0 * "x" + "ide" + m.S"ab" * "ka", "aka") == 4) +assert(m.match("ab" + m.S"abc" * m.P"y"^0 * "x" + "ide" + m.S"ab" * "ka", "ax") == 3) +assert(m.match(m.P(1) * "x" + "cde" + m.S"ab" * "ka", "aka") == 4) +assert(m.match(m.P(1) * "x" + "cde" + m.P(1) * "ka", "aka") == 4) +assert(m.match(m.P(1) * "x" + "cde" + m.P(1) * "ka", "cde") == 4) +assert(m.match(m.P"eb" + "cd" + m.P"e"^0 + "x", "ee") == 3) +assert(m.match(m.P"ab" + "cd" + m.P"e"^0 + "x", "abcd") == 3) +assert(m.match(m.P"ab" + "cd" + m.P"e"^0 + "x", "eeex") == 4) +assert(m.match(m.P"ab" + "cd" + m.P"e"^0 + "x", "cd") == 3) +assert(m.match(m.P"ab" + "cd" + m.P"e"^0 + "x", "x") == 1) +assert(m.match(m.P"ab" + "cd" + m.P"e"^0 + "x" + "", "zee") == 1) +assert(m.match(m.P"ab" + "cd" + m.P"e"^1 + "x", "abcd") == 3) +assert(m.match(m.P"ab" + "cd" + m.P"e"^1 + "x", "eeex") == 4) +assert(m.match(m.P"ab" + "cd" + m.P"e"^1 + "x", "cd") == 3) +assert(m.match(m.P"ab" + "cd" + m.P"e"^1 + "x", "x") == 2) +assert(m.match(m.P"ab" + "cd" + m.P"e"^1 + "x" + "", "zee") == 1) +assert(not m.match(("aa" * m.P"bc"^-1 + "aab") * "e", "aabe")) + +assert(m.match("alo" * (m.P"\n" + -1), "alo") == 4) + + +-- bug in 0.12 (rc1) +assert(m.match((m.P"\128\187\191" + m.S"abc")^0, "\128\187\191") == 4) + +assert(m.match(m.S"\0\128\255\127"^0, string.rep("\0\128\255\127", 10)) == + 4*10 + 1) + +-- optimizations with optional parts +assert(m.match(("ab" * -m.P"c")^-1, "abc") == 1) +assert(m.match(("ab" * #m.P"c")^-1, "abd") == 1) +assert(m.match(("ab" * m.B"c")^-1, "ab") == 1) +assert(m.match(("ab" * m.P"cd"^0)^-1, "abcdcdc") == 7) + +assert(m.match(m.P"ab"^-1 - "c", "abcd") == 3) + +p = ('Aa' * ('Bb' * ('Cc' * m.P'Dd'^0)^0)^0)^-1 +assert(p:match("AaBbCcDdBbCcDdDdDdBb") == 21) + + +-- bug in 0.12.2 +-- p = { ('ab' ('c' 'ef'?)*)? } +p = m.C(('ab' * ('c' * m.P'ef'^-1)^0)^-1) +s = "abcefccefc" +assert(s == p:match(s)) + + +pi = "3.14159 26535 89793 23846 26433 83279 50288 41971 69399 37510" +assert(m.match(m.Cs((m.P"1" / "a" + m.P"5" / "b" + m.P"9" / "c" + 1)^0), pi) == + m.match(m.Cs((m.P(1) / {["1"] = "a", ["5"] = "b", ["9"] = "c"})^0), pi)) +print"+" + + +-- tests for capture optimizations +assert(m.match((m.P(3) + 4 * m.Cp()) * "a", "abca") == 5) +t = {m.match(((m.P"a" + m.Cp()) * m.P"x")^0, "axxaxx")} +checkeq(t, {3, 6}) + + +-- tests for numbered captures +p = m.C(1) +assert(m.match(m.C(m.C(p * m.C(2)) * m.C(3)) / 3, "abcdefgh") == "a") +assert(m.match(m.C(m.C(p * m.C(2)) * m.C(3)) / 1, "abcdefgh") == "abcdef") +assert(m.match(m.C(m.C(p * m.C(2)) * m.C(3)) / 4, "abcdefgh") == "bc") +assert(m.match(m.C(m.C(p * m.C(2)) * m.C(3)) / 0, "abcdefgh") == 7) + +a, b, c = m.match(p * (m.C(p * m.C(2)) * m.C(3) / 4) * p, "abcdefgh") +assert(a == "a" and b == "efg" and c == "h") + +-- test for table captures +t = m.match(m.Ct(letter^1), "alo") +checkeq(t, {}) + +t, n = m.match(m.Ct(m.C(letter)^1) * m.Cc"t", "alo") +assert(n == "t" and table.concat(t) == "alo") + +t = m.match(m.Ct(m.C(m.C(letter)^1)), "alo") +assert(table.concat(t, ";") == "alo;a;l;o") + +t = m.match(m.Ct(m.C(m.C(letter)^1)), "alo") +assert(table.concat(t, ";") == "alo;a;l;o") + +t = m.match(m.Ct(m.Ct((m.Cp() * letter * m.Cp())^1)), "alo") +assert(table.concat(t[1], ";") == "1;2;2;3;3;4") + +t = m.match(m.Ct(m.C(m.C(1) * 1 * m.C(1))), "alo") +checkeq(t, {"alo", "a", "o"}) + + +-- tests for groups +p = m.Cg(1) -- no capture +assert(p:match('x') == 'x') +p = m.Cg(m.P(true)/function () end * 1) -- no value +assert(p:match('x') == 'x') +p = m.Cg(m.Cg(m.Cg(m.C(1)))) +assert(p:match('x') == 'x') +p = m.Cg(m.Cg(m.Cg(m.C(1))^0) * m.Cg(m.Cc(1) * m.Cc(2))) +t = {p:match'abc'} +checkeq(t, {'a', 'b', 'c', 1, 2}) + +p = m.Ct(m.Cg(m.Cc(10), "hi") * m.C(1)^0 * m.Cg(m.Cc(20), "ho")) +t = p:match'' +checkeq(t, {hi = 10, ho = 20}) +t = p:match'abc' +checkeq(t, {hi = 10, ho = 20, 'a', 'b', 'c'}) + +-- non-string group names +p = m.Ct(m.Cg(1, print) * m.Cg(1, 23.5) * m.Cg(1, io)) +t = p:match('abcdefghij') +assert(t[print] == 'a' and t[23.5] == 'b' and t[io] == 'c') + + +-- test for error messages +local function checkerr (msg, f, ...) + local st, err = pcall(f, ...) + assert(not st and m.match({ m.P(msg) + 1 * m.V(1) }, err)) +end + +checkerr("rule '1' may be left recursive", m.match, { m.V(1) * 'a' }, "a") +checkerr("rule '1' used outside a grammar", m.match, m.V(1), "") +checkerr("rule 'hiii' used outside a grammar", m.match, m.V('hiii'), "") +checkerr("rule 'hiii' undefined in given grammar", m.match, { m.V('hiii') }, "") +checkerr("undefined in given grammar", m.match, { m.V{} }, "") + +checkerr("rule 'A' is not a pattern", m.P, { m.P(1), A = {} }) +checkerr("grammar has no initial rule", m.P, { [print] = {} }) + +-- grammar with a long call chain before left recursion +p = {'a', + a = m.V'b' * m.V'c' * m.V'd' * m.V'a', + b = m.V'c', + c = m.V'd', + d = m.V'e', + e = m.V'f', + f = m.V'g', + g = m.P'' +} +checkerr("rule 'a' may be left recursive", m.match, p, "a") + +-- Bug in peephole optimization of LPeg 0.12 (IJmp -> ICommit) +-- the next grammar has an original sequence IJmp -> ICommit -> IJmp L1 +-- that is optimized to ICommit L1 + +p = m.P { (m.P {m.P'abc'} + 'ayz') * m.V'y'; y = m.P'x' } +assert(p:match('abcx') == 5 and p:match('ayzx') == 5 and not p:match'abc') + + +do + -- large dynamic Cc + local lim = 2^16 - 1 + local c = 0 + local function seq (n) + if n == 1 then c = c + 1; return m.Cc(c) + else + local m = math.floor(n / 2) + return seq(m) * seq(n - m) + end + end + p = m.Ct(seq(lim)) + t = p:match('') + assert(t[lim] == lim) + checkerr("too many", function () p = p / print end) + checkerr("too many", seq, lim + 1) +end + + +-- tests for non-pattern as arguments to pattern functions + +p = { ('a' * m.V(1))^-1 } * m.P'b' * { 'a' * m.V(2); m.V(1)^-1 } +assert(m.match(p, "aaabaac") == 7) + +p = m.P'abc' * 2 * -5 * true * 'de' -- mix of numbers and strings and booleans + +assert(p:match("abc01de") == 8) +assert(p:match("abc01de3456") == nil) + +p = 'abc' * (2 * (-5 * (true * m.P'de'))) + +assert(p:match("abc01de") == 8) +assert(p:match("abc01de3456") == nil) + +p = { m.V(2), m.P"abc" } * + (m.P{ "xx", xx = m.P"xx" } + { "x", x = m.P"a" * m.V"x" + "" }) +assert(p:match("abcaaaxx") == 7) +assert(p:match("abcxx") == 6) + + +-- a large table capture +t = m.match(m.Ct(m.C('a')^0), string.rep("a", 10000)) +assert(#t == 10000 and t[1] == 'a' and t[#t] == 'a') + +print('+') + + +-- bug in 0.10 (rechecking a grammar, after tail-call optimization) +m.P{ m.P { (m.P(3) + "xuxu")^0 * m.V"xuxu", xuxu = m.P(1) } } + +local V = m.V + +local Space = m.S(" \n\t")^0 +local Number = m.C(m.R("09")^1) * Space +local FactorOp = m.C(m.S("+-")) * Space +local TermOp = m.C(m.S("*/")) * Space +local Open = "(" * Space +local Close = ")" * Space + + +local function f_factor (v1, op, v2, d) + assert(d == nil) + if op == "+" then return v1 + v2 + else return v1 - v2 + end +end + + +local function f_term (v1, op, v2, d) + assert(d == nil) + if op == "*" then return v1 * v2 + else return v1 / v2 + end +end + +G = m.P{ "Exp", + Exp = m.Cf(V"Factor" * m.Cg(FactorOp * V"Factor")^0, f_factor); + Factor = m.Cf(V"Term" * m.Cg(TermOp * V"Term")^0, f_term); + Term = Number / tonumber + Open * V"Exp" * Close; +} + +G = Space * G * -1 + +for _, s in ipairs{" 3 + 5*9 / (1+1) ", "3+4/2", "3+3-3- 9*2+3*9/1- 8"} do + assert(m.match(G, s) == loadstring("return "..s)()) +end + + +-- test for grammars (errors deep in calling non-terminals) +g = m.P{ + [1] = m.V(2) + "a", + [2] = "a" * m.V(3) * "x", + [3] = "b" * m.V(3) + "c" +} + +assert(m.match(g, "abbbcx") == 7) +assert(m.match(g, "abbbbx") == 2) + + +-- tests for \0 +assert(m.match(m.R("\0\1")^1, "\0\1\0") == 4) +assert(m.match(m.S("\0\1ab")^1, "\0\1\0a") == 5) +assert(m.match(m.P(1)^3, "\0\1\0a") == 5) +assert(not m.match(-4, "\0\1\0a")) +assert(m.match("\0\1\0a", "\0\1\0a") == 5) +assert(m.match("\0\0\0", "\0\0\0") == 4) +assert(not m.match("\0\0\0", "\0\0")) + + +-- tests for predicates +assert(not m.match(-m.P("a") * 2, "alo")) +assert(m.match(- -m.P("a") * 2, "alo") == 3) +assert(m.match(#m.P("a") * 2, "alo") == 3) +assert(m.match(##m.P("a") * 2, "alo") == 3) +assert(not m.match(##m.P("c") * 2, "alo")) +assert(m.match(m.Cs((##m.P("a") * 1 + m.P(1)/".")^0), "aloal") == "a..a.") +assert(m.match(m.Cs((#((#m.P"a")/"") * 1 + m.P(1)/".")^0), "aloal") == "a..a.") +assert(m.match(m.Cs((- -m.P("a") * 1 + m.P(1)/".")^0), "aloal") == "a..a.") +assert(m.match(m.Cs((-((-m.P"a")/"") * 1 + m.P(1)/".")^0), "aloal") == "a..a.") + + +-- fixed length +do + -- 'and' predicate using fixed length + local p = m.C(#("a" * (m.P("bd") + "cd")) * 2) + assert(p:match("acd") == "ac") + + p = #m.P{ "a" * m.V(2), m.P"b" } * 2 + assert(p:match("abc") == 3) + + p = #(m.P"abc" * m.B"c") + assert(p:match("abc") == 1 and not p:match("ab")) + + p = m.P{ "a" * m.V(2), m.P"b"^1 } + checkerr("pattern may not have fixed length", m.B, p) + + p = "abc" * (m.P"b"^1 + m.P"a"^0) + checkerr("pattern may not have fixed length", m.B, p) +end + + +p = -m.P'a' * m.Cc(1) + -m.P'b' * m.Cc(2) + -m.P'c' * m.Cc(3) +assert(p:match('a') == 2 and p:match('') == 1 and p:match('b') == 1) + +p = -m.P'a' * m.Cc(10) + #m.P'a' * m.Cc(20) +assert(p:match('a') == 20 and p:match('') == 10 and p:match('b') == 10) + + + +-- look-behind predicate +assert(not m.match(m.B'a', 'a')) +assert(m.match(1 * m.B'a', 'a') == 2) +assert(not m.match(m.B(1), 'a')) +assert(m.match(1 * m.B(1), 'a') == 2) +assert(m.match(-m.B(1), 'a') == 1) +assert(m.match(m.B(250), string.rep('a', 250)) == nil) +assert(m.match(250 * m.B(250), string.rep('a', 250)) == 251) + +-- look-behind with an open call +checkerr("pattern may not have fixed length", m.B, m.V'S1') +checkerr("too long to look behind", m.B, 260) + +B = #letter * -m.B(letter) + -letter * m.B(letter) +x = m.Ct({ (B * m.Cp())^-1 * (1 * m.V(1) + m.P(true)) }) +checkeq(m.match(x, 'ar cal c'), {1,3,4,7,9,10}) +checkeq(m.match(x, ' ar cal '), {2,4,5,8}) +checkeq(m.match(x, ' '), {}) +checkeq(m.match(x, 'aloalo'), {1,7}) + +assert(m.match(B, "a") == 1) +assert(m.match(1 * B, "a") == 2) +assert(not m.B(1 - letter):match("")) +assert((-m.B(letter)):match("") == 1) + +assert((4 * m.B(letter, 4)):match("aaaaaaaa") == 5) +assert(not (4 * m.B(#letter * 5)):match("aaaaaaaa")) +assert((4 * -m.B(#letter * 5)):match("aaaaaaaa") == 5) + +-- look-behind with grammars +assert(m.match('a' * m.B{'x', x = m.P(3)}, 'aaa') == nil) +assert(m.match('aa' * m.B{'x', x = m.P('aaa')}, 'aaaa') == nil) +assert(m.match('aaa' * m.B{'x', x = m.P('aaa')}, 'aaaaa') == 4) + + + +-- bug in 0.9 +assert(m.match(('a' * #m.P'b'), "ab") == 2) +assert(not m.match(('a' * #m.P'b'), "a")) + +assert(not m.match(#m.S'567', "")) +assert(m.match(#m.S'567' * 1, "6") == 2) + + +-- tests for Tail Calls + +p = m.P{ 'a' * m.V(1) + '' } +assert(p:match(string.rep('a', 1000)) == 1001) + +-- create a grammar for a simple DFA for even number of 0s and 1s +-- +-- ->1 <---0---> 2 +-- ^ ^ +-- | | +-- 1 1 +-- | | +-- V V +-- 3 <---0---> 4 +-- +-- this grammar should keep no backtracking information + +p = m.P{ + [1] = '0' * m.V(2) + '1' * m.V(3) + -1, + [2] = '0' * m.V(1) + '1' * m.V(4), + [3] = '0' * m.V(4) + '1' * m.V(1), + [4] = '0' * m.V(3) + '1' * m.V(2), +} + +assert(p:match(string.rep("00", 10000))) +assert(p:match(string.rep("01", 10000))) +assert(p:match(string.rep("011", 10000))) +assert(not p:match(string.rep("011", 10000) .. "1")) +assert(not p:match(string.rep("011", 10001))) + + +-- this grammar does need backtracking info. +local lim = 10000 +p = m.P{ '0' * m.V(1) + '0' } +checkerr("stack overflow", m.match, p, string.rep("0", lim)) +m.setmaxstack(2*lim) +checkerr("stack overflow", m.match, p, string.rep("0", lim)) +m.setmaxstack(2*lim + 4) +assert(m.match(p, string.rep("0", lim)) == lim + 1) + +-- this repetition should not need stack space (only the call does) +p = m.P{ ('a' * m.V(1))^0 * 'b' + 'c' } +m.setmaxstack(200) +assert(p:match(string.rep('a', 180) .. 'c' .. string.rep('b', 180)) == 362) + +m.setmaxstack(100) -- restore low limit + +-- tests for optional start position +assert(m.match("a", "abc", 1)) +assert(m.match("b", "abc", 2)) +assert(m.match("c", "abc", 3)) +assert(not m.match(1, "abc", 4)) +assert(m.match("a", "abc", -3)) +assert(m.match("b", "abc", -2)) +assert(m.match("c", "abc", -1)) +assert(m.match("abc", "abc", -4)) -- truncate to position 1 + +assert(m.match("", "abc", 10)) -- empty string is everywhere! +assert(m.match("", "", 10)) +assert(not m.match(1, "", 1)) +assert(not m.match(1, "", -1)) +assert(not m.match(1, "", 0)) + +print("+") + + +-- tests for argument captures +checkerr("invalid argument", m.Carg, 0) +checkerr("invalid argument", m.Carg, -1) +checkerr("invalid argument", m.Carg, 2^18) +checkerr("absent extra argument #1", m.match, m.Carg(1), 'a', 1) +assert(m.match(m.Carg(1), 'a', 1, print) == print) +x = {m.match(m.Carg(1) * m.Carg(2), '', 1, 10, 20)} +checkeq(x, {10, 20}) + +assert(m.match(m.Cmt(m.Cg(m.Carg(3), "a") * + m.Cmt(m.Cb("a"), function (s,i,x) + assert(s == "a" and i == 1); + return i, x+1 + end) * + m.Carg(2), function (s,i,a,b,c) + assert(s == "a" and i == 1 and c == nil); + return i, 2*a + 3*b + end) * "a", + "a", 1, false, 100, 1000) == 2*1001 + 3*100) + + +-- tests for Lua functions + +t = {} +s = "" +p = m.P(function (s1, i) assert(s == s1); t[#t + 1] = i; return nil end) * false +s = "hi, this is a test" +assert(m.match(((p - m.P(-1)) + 2)^0, s) == string.len(s) + 1) +assert(#t == string.len(s)/2 and t[1] == 1 and t[2] == 3) + +assert(not m.match(p, s)) + +p = mt.__add(function (s, i) return i end, function (s, i) return nil end) +assert(m.match(p, "alo")) + +p = mt.__mul(function (s, i) return i end, function (s, i) return nil end) +assert(not m.match(p, "alo")) + + +t = {} +p = function (s1, i) assert(s == s1); t[#t + 1] = i; return i end +s = "hi, this is a test" +assert(m.match((m.P(1) * p)^0, s) == string.len(s) + 1) +assert(#t == string.len(s) and t[1] == 2 and t[2] == 3) + +t = {} +p = m.P(function (s1, i) assert(s == s1); t[#t + 1] = i; + return i <= s1:len() and i end) * 1 +s = "hi, this is a test" +assert(m.match(p^0, s) == string.len(s) + 1) +assert(#t == string.len(s) + 1 and t[1] == 1 and t[2] == 2) + +p = function (s1, i) return m.match(m.P"a"^1, s1, i) end +assert(m.match(p, "aaaa") == 5) +assert(m.match(p, "abaa") == 2) +assert(not m.match(p, "baaa")) + +checkerr("invalid position", m.match, function () return 2^20 end, s) +checkerr("invalid position", m.match, function () return 0 end, s) +checkerr("invalid position", m.match, function (s, i) return i - 1 end, s) +checkerr("invalid position", m.match, + m.P(1)^0 * function (_, i) return i - 1 end, s) +assert(m.match(m.P(1)^0 * function (_, i) return i end * -1, s)) +checkerr("invalid position", m.match, + m.P(1)^0 * function (_, i) return i + 1 end, s) +assert(m.match(m.P(function (s, i) return s:len() + 1 end) * -1, s)) +checkerr("invalid position", m.match, m.P(function (s, i) return s:len() + 2 end) * -1, s) +assert(not m.match(m.P(function (s, i) return s:len() end) * -1, s)) +assert(m.match(m.P(1)^0 * function (_, i) return true end, s) == + string.len(s) + 1) +for i = 1, string.len(s) + 1 do + assert(m.match(function (_, _) return i end, s) == i) +end + +p = (m.P(function (s, i) return i%2 == 0 and i end) * 1 + + m.P(function (s, i) return i%2 ~= 0 and i + 2 <= s:len() and i end) * 3)^0 + * -1 +assert(p:match(string.rep('a', 14000))) + +-- tests for Function Replacements +f = function (a, ...) if a ~= "x" then return {a, ...} end end + +t = m.match(m.C(1)^0/f, "abc") +checkeq(t, {"a", "b", "c"}) + +t = m.match(m.C(1)^0/f/f, "abc") +checkeq(t, {{"a", "b", "c"}}) + +t = m.match(m.P(1)^0/f/f, "abc") -- no capture +checkeq(t, {{"abc"}}) + +t = m.match((m.P(1)^0/f * m.Cp())/f, "abc") +checkeq(t, {{"abc"}, 4}) + +t = m.match((m.C(1)^0/f * m.Cp())/f, "abc") +checkeq(t, {{"a", "b", "c"}, 4}) + +t = m.match((m.C(1)^0/f * m.Cp())/f, "xbc") +checkeq(t, {4}) + +t = m.match(m.C(m.C(1)^0)/f, "abc") +checkeq(t, {"abc", "a", "b", "c"}) + +g = function (...) return 1, ... end +t = {m.match(m.C(1)^0/g/g, "abc")} +checkeq(t, {1, 1, "a", "b", "c"}) + +t = {m.match(m.Cc(nil,nil,4) * m.Cc(nil,3) * m.Cc(nil, nil) / g / g, "")} +t1 = {1,1,nil,nil,4,nil,3,nil,nil} +for i=1,10 do assert(t[i] == t1[i]) end + +-- bug in 0.12.2: ktable with only nil could be eliminated when joining +-- with a pattern without ktable +assert((m.P"aaa" * m.Cc(nil)):match"aaa" == nil) + +t = {m.match((m.C(1) / function (x) return x, x.."x" end)^0, "abc")} +checkeq(t, {"a", "ax", "b", "bx", "c", "cx"}) + +t = m.match(m.Ct((m.C(1) / function (x,y) return y, x end * m.Cc(1))^0), "abc") +checkeq(t, {nil, "a", 1, nil, "b", 1, nil, "c", 1}) + +-- tests for Query Replacements + +assert(m.match(m.C(m.C(1)^0)/{abc = 10}, "abc") == 10) +assert(m.match(m.C(1)^0/{a = 10}, "abc") == 10) +assert(m.match(m.S("ba")^0/{ab = 40}, "abc") == 40) +t = m.match(m.Ct((m.S("ba")/{a = 40})^0), "abc") +checkeq(t, {40}) + +assert(m.match(m.Cs((m.C(1)/{a=".", d=".."})^0), "abcdde") == ".bc....e") +assert(m.match(m.Cs((m.C(1)/{f="."})^0), "abcdde") == "abcdde") +assert(m.match(m.Cs((m.C(1)/{d="."})^0), "abcdde") == "abc..e") +assert(m.match(m.Cs((m.C(1)/{e="."})^0), "abcdde") == "abcdd.") +assert(m.match(m.Cs((m.C(1)/{e=".", f="+"})^0), "eefef") == "..+.+") +assert(m.match(m.Cs((m.C(1))^0), "abcdde") == "abcdde") +assert(m.match(m.Cs(m.C(m.C(1)^0)), "abcdde") == "abcdde") +assert(m.match(1 * m.Cs(m.P(1)^0), "abcdde") == "bcdde") +assert(m.match(m.Cs((m.C('0')/'x' + 1)^0), "abcdde") == "abcdde") +assert(m.match(m.Cs((m.C('0')/'x' + 1)^0), "0ab0b0") == "xabxbx") +assert(m.match(m.Cs((m.C('0')/'x' + m.P(1)/{b=3})^0), "b0a0b") == "3xax3") +assert(m.match(m.P(1)/'%0%0'/{aa = -3} * 'x', 'ax') == -3) +assert(m.match(m.C(1)/'%0%1'/{aa = 'z'}/{z = -3} * 'x', 'ax') == -3) + +assert(m.match(m.Cs(m.Cc(0) * (m.P(1)/"")), "4321") == "0") + +assert(m.match(m.Cs((m.P(1) / "%0")^0), "abcd") == "abcd") +assert(m.match(m.Cs((m.P(1) / "%0.%0")^0), "abcd") == "a.ab.bc.cd.d") +assert(m.match(m.Cs((m.P("a") / "%0.%0" + 1)^0), "abcad") == "a.abca.ad") +assert(m.match(m.C("a") / "%1%%%0", "a") == "a%a") +assert(m.match(m.Cs((m.P(1) / ".xx")^0), "abcd") == ".xx.xx.xx.xx") +assert(m.match(m.Cp() * m.P(3) * m.Cp()/"%2%1%1 - %0 ", "abcde") == + "411 - abc ") + +assert(m.match(m.P(1)/"%0", "abc") == "a") +checkerr("invalid capture index", m.match, m.P(1)/"%1", "abc") +checkerr("invalid capture index", m.match, m.P(1)/"%9", "abc") + +p = m.C(1) +p = p * p; p = p * p; p = p * p * m.C(1) / "%9 - %1" +assert(p:match("1234567890") == "9 - 1") + +assert(m.match(m.Cc(print), "") == print) + +-- too many captures (just ignore extra ones) +p = m.C(1)^0 / "%2-%9-%0-%9" +assert(p:match"01234567890123456789" == "1-8-01234567890123456789-8") +s = string.rep("12345678901234567890", 20) +assert(m.match(m.C(1)^0 / "%9-%1-%0-%3", s) == "9-1-" .. s .. "-3") + +-- string captures with non-string subcaptures +p = m.Cc('alo') * m.C(1) / "%1 - %2 - %1" +assert(p:match'x' == 'alo - x - alo') + +checkerr("invalid capture value (a boolean)", m.match, m.Cc(true) / "%1", "a") + +-- long strings for string capture +l = 10000 +s = string.rep('a', l) .. string.rep('b', l) .. string.rep('c', l) + +p = (m.C(m.P'a'^1) * m.C(m.P'b'^1) * m.C(m.P'c'^1)) / '%3%2%1' + +assert(p:match(s) == string.rep('c', l) .. + string.rep('b', l) .. + string.rep('a', l)) + +print"+" + +-- accumulator capture +function f (x) return x + 1 end +assert(m.match(m.Cf(m.Cc(0) * m.C(1)^0, f), "alo alo") == 7) + +t = {m.match(m.Cf(m.Cc(1,2,3), error), "")} +checkeq(t, {1}) +p = m.Cf(m.Ct(true) * m.Cg(m.C(m.R"az"^1) * "=" * m.C(m.R"az"^1) * ";")^0, + rawset) +t = p:match("a=b;c=du;xux=yuy;") +checkeq(t, {a="b", c="du", xux="yuy"}) + + +-- errors in accumulator capture + +-- no initial capture +checkerr("no initial value", m.match, m.Cf(m.P(5), print), 'aaaaaa') +-- no initial capture (very long match forces fold to be a pair open-close) +checkerr("no initial value", m.match, m.Cf(m.P(500), print), + string.rep('a', 600)) + +-- nested capture produces no initial value +checkerr("no initial value", m.match, m.Cf(m.P(1) / {}, print), "alo") + + +-- tests for loop checker + +local function isnullable (p) + checkerr("may accept empty string", function (p) return p^0 end, m.P(p)) +end + +isnullable(m.P("x")^-4) +assert(m.match(((m.P(0) + 1) * m.S"al")^0, "alo") == 3) +assert(m.match((("x" + #m.P(1))^-4 * m.S"al")^0, "alo") == 3) +isnullable("") +isnullable(m.P("x")^0) +isnullable(m.P("x")^-1) +isnullable(m.P("x") + 1 + 2 + m.P("a")^-1) +isnullable(-m.P("ab")) +isnullable(- -m.P("ab")) +isnullable(# #(m.P("ab") + "xy")) +isnullable(- #m.P("ab")^0) +isnullable(# -m.P("ab")^1) +isnullable(#m.V(3)) +isnullable(m.V(3) + m.V(1) + m.P('a')^-1) +isnullable({[1] = m.V(2) * m.V(3), [2] = m.V(3), [3] = m.P(0)}) +assert(m.match(m.P{[1] = m.V(2) * m.V(3), [2] = m.V(3), [3] = m.P(1)}^0, "abc") + == 3) +assert(m.match(m.P""^-3, "a") == 1) + +local function find (p, s) + return m.match(basiclookfor(p), s) +end + + +local function badgrammar (g, expected) + local stat, msg = pcall(m.P, g) + assert(not stat) + if expected then assert(find(expected, msg)) end +end + +badgrammar({[1] = m.V(1)}, "rule '1'") +badgrammar({[1] = m.V(2)}, "rule '2'") -- invalid non-terminal +badgrammar({[1] = m.V"x"}, "rule 'x'") -- invalid non-terminal +badgrammar({[1] = m.V{}}, "rule '(a table)'") -- invalid non-terminal +badgrammar({[1] = #m.P("a") * m.V(1)}, "rule '1'") -- left-recursive +badgrammar({[1] = -m.P("a") * m.V(1)}, "rule '1'") -- left-recursive +badgrammar({[1] = -1 * m.V(1)}, "rule '1'") -- left-recursive +badgrammar({[1] = -1 + m.V(1)}, "rule '1'") -- left-recursive +badgrammar({[1] = 1 * m.V(2), [2] = m.V(2)}, "rule '2'") -- left-recursive +badgrammar({[1] = 1 * m.V(2)^0, [2] = m.P(0)}, "rule '1'") -- inf. loop +badgrammar({ m.V(2), m.V(3)^0, m.P"" }, "rule '2'") -- inf. loop +badgrammar({ m.V(2) * m.V(3)^0, m.V(3)^0, m.P"" }, "rule '1'") -- inf. loop +badgrammar({"x", x = #(m.V(1) * 'a') }, "rule '1'") -- inf. loop +badgrammar({ -(m.V(1) * 'a') }, "rule '1'") -- inf. loop +badgrammar({"x", x = m.P'a'^-1 * m.V"x"}, "rule 'x'") -- left recursive +badgrammar({"x", x = m.P'a' * m.V"y"^1, y = #m.P(1)}, "rule 'x'") + +assert(m.match({'a' * -m.V(1)}, "aaa") == 2) +assert(m.match({'a' * -m.V(1)}, "aaaa") == nil) + + +-- good x bad grammars +m.P{ ('a' * m.V(1))^-1 } +m.P{ -('a' * m.V(1)) } +m.P{ ('abc' * m.V(1))^-1 } +m.P{ -('abc' * m.V(1)) } +badgrammar{ #m.P('abc') * m.V(1) } +badgrammar{ -('a' + m.V(1)) } +m.P{ #('a' * m.V(1)) } +badgrammar{ #('a' + m.V(1)) } +m.P{ m.B{ m.P'abc' } * 'a' * m.V(1) } +badgrammar{ m.B{ m.P'abc' } * m.V(1) } +badgrammar{ ('a' + m.P'bcd')^-1 * m.V(1) } + + +-- simple tests for maximum sizes: +local p = m.P"a" +for i=1,14 do p = p * p end + +p = {} +for i=1,100 do p[i] = m.P"a" end +p = m.P(p) + + +-- strange values for rule labels + +p = m.P{ "print", + print = m.V(print), + [print] = m.V(_G), + [_G] = m.P"a", + } + +assert(p:match("a")) + +-- initial rule +g = {} +for i = 1, 10 do g["i"..i] = "a" * m.V("i"..i+1) end +g.i11 = m.P"" +for i = 1, 10 do + g[1] = "i"..i + local p = m.P(g) + assert(p:match("aaaaaaaaaaa") == 11 - i + 1) +end + +print"+" + + +-- tests for back references +checkerr("back reference 'x' not found", m.match, m.Cb('x'), '') +checkerr("back reference 'b' not found", m.match, m.Cg(1, 'a') * m.Cb('b'), 'a') + +p = m.Cg(m.C(1) * m.C(1), "k") * m.Ct(m.Cb("k")) +t = p:match("ab") +checkeq(t, {"a", "b"}) + +p = m.P(true) +for i = 1, 10 do p = p * m.Cg(1, i) end +for i = 1, 10 do + local p = p * m.Cb(i) + assert(p:match('abcdefghij') == string.sub('abcdefghij', i, i)) +end + + +t = {} +function foo (p) t[#t + 1] = p; return p .. "x" end + +p = m.Cg(m.C(2) / foo, "x") * m.Cb"x" * + m.Cg(m.Cb('x') / foo, "x") * m.Cb"x" * + m.Cg(m.Cb('x') / foo, "x") * m.Cb"x" * + m.Cg(m.Cb('x') / foo, "x") * m.Cb"x" +x = {p:match'ab'} +checkeq(x, {'abx', 'abxx', 'abxxx', 'abxxxx'}) +checkeq(t, {'ab', + 'ab', 'abx', + 'ab', 'abx', 'abxx', + 'ab', 'abx', 'abxx', 'abxxx'}) + + + +-- tests for match-time captures + +p = m.P'a' * (function (s, i) return (s:sub(i, i) == 'b') and i + 1 end) + + 'acd' + +assert(p:match('abc') == 3) +assert(p:match('acd') == 4) + +local function id (s, i, ...) + return true, ... +end + +assert(m.Cmt(m.Cs((m.Cmt(m.S'abc' / { a = 'x', c = 'y' }, id) + + m.R'09'^1 / string.char + + m.P(1))^0), id):match"acb98+68c" == "xyb\98+\68y") + +p = m.P{'S', + S = m.V'atom' * space + + m.Cmt(m.Ct("(" * space * (m.Cmt(m.V'S'^1, id) + m.P(true)) * ")" * space), id), + atom = m.Cmt(m.C(m.R("AZ", "az", "09")^1), id) +} +x = p:match"(a g () ((b) c) (d (e)))" +checkeq(x, {'a', 'g', {}, {{'b'}, 'c'}, {'d', {'e'}}}); + +x = {(m.Cmt(1, id)^0):match(string.rep('a', 500))} +assert(#x == 500) + +local function id(s, i, x) + if x == 'a' then return i, 1, 3, 7 + else return nil, 2, 4, 6, 8 + end +end + +p = ((m.P(id) * 1 + m.Cmt(2, id) * 1 + m.Cmt(1, id) * 1))^0 +assert(table.concat{p:match('abababab')} == string.rep('137', 4)) + +local function ref (s, i, x) + return m.match(x, s, i - x:len()) +end + +assert(m.Cmt(m.P(1)^0, ref):match('alo') == 4) +assert((m.P(1) * m.Cmt(m.P(1)^0, ref)):match('alo') == 4) +assert(not (m.P(1) * m.Cmt(m.C(1)^0, ref)):match('alo')) + +ref = function (s,i,x) return i == tonumber(x) and i, 'xuxu' end + +assert(m.Cmt(1, ref):match'2') +assert(not m.Cmt(1, ref):match'1') +assert(m.Cmt(m.P(1)^0, ref):match'03') + +function ref (s, i, a, b) + if a == b then return i, a:upper() end +end + +p = m.Cmt(m.C(m.R"az"^1) * "-" * m.C(m.R"az"^1), ref) +p = (any - p)^0 * p * any^0 * -1 + +assert(p:match'abbbc-bc ddaa' == 'BC') + +do -- match-time captures cannot be optimized away + local touch = 0 + f = m.P(function () touch = touch + 1; return true end) + + local function check(n) n = n or 1; assert(touch == n); touch = 0 end + + assert(m.match(f * false + 'b', 'a') == nil); check() + assert(m.match(f * false + 'b', '') == nil); check() + assert(m.match( (f * 'a')^0 * 'b', 'b') == 2); check() + assert(m.match( (f * 'a')^0 * 'b', '') == nil); check() + assert(m.match( (f * 'a')^-1 * 'b', 'b') == 2); check() + assert(m.match( (f * 'a')^-1 * 'b', '') == nil); check() + assert(m.match( ('b' + f * 'a')^-1 * 'b', '') == nil); check() + assert(m.match( (m.P'b'^-1 * f * 'a')^-1 * 'b', '') == nil); check() + assert(m.match( (-m.P(1) * m.P'b'^-1 * f * 'a')^-1 * 'b', '') == nil); + check() + assert(m.match( (f * 'a' + 'b')^-1 * 'b', '') == nil); check() + assert(m.match(f * 'a' + f * 'b', 'b') == 2); check(2) + assert(m.match(f * 'a' + f * 'b', 'a') == 2); check(1) + assert(m.match(-f * 'a' + 'b', 'b') == 2); check(1) + assert(m.match(-f * 'a' + 'b', '') == nil); check(1) +end + +c = '[' * m.Cg(m.P'='^0, "init") * '[' * + { m.Cmt(']' * m.C(m.P'='^0) * ']' * m.Cb("init"), function (_, _, s1, s2) + return s1 == s2 end) + + 1 * m.V(1) } / 0 + +assert(c:match'[==[]]====]]]]==]===[]' == 18) +assert(c:match'[[]=]====]=]]]==]===[]' == 14) +assert(not c:match'[[]=]====]=]=]==]===[]') + + +-- old bug: optimization of concat with fail removed match-time capture +p = m.Cmt(0, function (s) p = s end) * m.P(false) +assert(not p:match('alo')) +assert(p == 'alo') + + +-- ensure that failed match-time captures are not kept on Lua stack +do + local t = {__mode = "kv"}; setmetatable(t,t) + local c = 0 + + local function foo (s,i) + collectgarbage(); + assert(next(t) == "__mode" and next(t, "__mode") == nil) + local x = {} + t[x] = true + c = c + 1 + return i, x + end + + local p = m.P{ m.Cmt(0, foo) * m.P(false) + m.P(1) * m.V(1) + m.P"" } + p:match(string.rep('1', 10)) + assert(c == 11) +end + + +-- Return a match-time capture that returns 'n' captures +local function manyCmt (n) + return m.Cmt("a", function () + local a = {}; for i = 1, n do a[i] = n - i end + return true, unpack(a) + end) +end + +-- bug in 1.0: failed match-time that used previous match-time results +do + local x + local function aux (...) x = #{...}; return false end + local res = {m.match(m.Cmt(manyCmt(20), aux) + manyCmt(10), "a")} + assert(#res == 10 and res[1] == 9 and res[10] == 0) +end + + +-- bug in 1.0: problems with math-times returning too many captures +do + local lim = 2^11 - 10 + local res = {m.match(manyCmt(lim), "a")} + assert(#res == lim and res[1] == lim - 1 and res[lim] == 0) + checkerr("too many", m.match, manyCmt(2^15), "a") +end + +p = (m.P(function () return true, "a" end) * 'a' + + m.P(function (s, i) return i, "aa", 20 end) * 'b' + + m.P(function (s,i) if i <= #s then return i, "aaa" end end) * 1)^0 + +t = {p:match('abacc')} +checkeq(t, {'a', 'aa', 20, 'a', 'aaa', 'aaa'}) + + +------------------------------------------------------------------- +-- Tests for 're' module +------------------------------------------------------------------- + +local re = require "re" + +local match, compile = re.match, re.compile + + + +assert(match("a", ".") == 2) +assert(match("a", "''") == 1) +assert(match("", " ! . ") == 1) +assert(not match("a", " ! . ")) +assert(match("abcde", " ( . . ) * ") == 5) +assert(match("abbcde", " [a-c] +") == 5) +assert(match("0abbc1de", "'0' [a-c]+ '1'") == 7) +assert(match("0zz1dda", "'0' [^a-c]+ 'a'") == 8) +assert(match("abbc--", " [a-c] + +") == 5) +assert(match("abbc--", " [ac-] +") == 2) +assert(match("abbc--", " [-acb] + ") == 7) +assert(not match("abbcde", " [b-z] + ")) +assert(match("abb\"de", '"abb"["]"de"') == 7) +assert(match("abceeef", "'ac' ? 'ab' * 'c' { 'e' * } / 'abceeef' ") == "eee") +assert(match("abceeef", "'ac'? 'ab'* 'c' { 'f'+ } / 'abceeef' ") == 8) +local t = {match("abceefe", "( ( & 'e' {} ) ? . ) * ")} +checkeq(t, {4, 5, 7}) +local t = {match("abceefe", "((&&'e' {})? .)*")} +checkeq(t, {4, 5, 7}) +local t = {match("abceefe", "( ( ! ! 'e' {} ) ? . ) *")} +checkeq(t, {4, 5, 7}) +local t = {match("abceefe", "(( & ! & ! 'e' {})? .)*")} +checkeq(t, {4, 5, 7}) + +assert(match("cccx" , "'ab'? ('ccc' / ('cde' / 'cd'*)? / 'ccc') 'x'+") == 5) +assert(match("cdx" , "'ab'? ('ccc' / ('cde' / 'cd'*)? / 'ccc') 'x'+") == 4) +assert(match("abcdcdx" , "'ab'? ('ccc' / ('cde' / 'cd'*)? / 'ccc') 'x'+") == 8) + +assert(match("abc", "a <- (. a)?") == 4) +b = "balanced <- '(' ([^()] / balanced)* ')'" +assert(match("(abc)", b)) +assert(match("(a(b)((c) (d)))", b)) +assert(not match("(a(b ((c) (d)))", b)) + +b = compile[[ balanced <- "(" ([^()] / balanced)* ")" ]] +assert(b == m.P(b)) +assert(b:match"((((a))(b)))") + +local g = [[ + S <- "0" B / "1" A / "" -- balanced strings + A <- "0" S / "1" A A -- one more 0 + B <- "1" S / "0" B B -- one more 1 +]] +assert(match("00011011", g) == 9) + +local g = [[ + S <- ("0" B / "1" A)* + A <- "0" / "1" A A + B <- "1" / "0" B B +]] +assert(match("00011011", g) == 9) +assert(match("000110110", g) == 9) +assert(match("011110110", g) == 3) +assert(match("000110010", g) == 1) + +s = "aaaaaaaaaaaaaaaaaaaaaaaa" +assert(match(s, "'a'^3") == 4) +assert(match(s, "'a'^0") == 1) +assert(match(s, "'a'^+3") == s:len() + 1) +assert(not match(s, "'a'^+30")) +assert(match(s, "'a'^-30") == s:len() + 1) +assert(match(s, "'a'^-5") == 6) +for i = 1, s:len() do + assert(match(s, string.format("'a'^+%d", i)) >= i + 1) + assert(match(s, string.format("'a'^-%d", i)) <= i + 1) + assert(match(s, string.format("'a'^%d", i)) == i + 1) +end +assert(match("01234567890123456789", "[0-9]^3+") == 19) + + +assert(match("01234567890123456789", "({....}{...}) -> '%2%1'") == "4560123") +t = match("0123456789", "{| {.}* |}") +checkeq(t, {"0", "1", "2", "3", "4", "5", "6", "7", "8", "9"}) +assert(match("012345", "{| (..) -> '%0%0' |}")[1] == "0101") + +assert(match("abcdef", "( {.} {.} {.} {.} {.} ) -> 3") == "c") +assert(match("abcdef", "( {:x: . :} {.} {.} {.} {.} ) -> 3") == "d") +assert(match("abcdef", "( {:x: . :} {.} {.} {.} {.} ) -> 0") == 6) + +assert(not match("abcdef", "{:x: ({.} {.} {.}) -> 2 :} =x")) +assert(match("abcbef", "{:x: ({.} {.} {.}) -> 2 :} =x")) + +eqcharset(compile"[]]", "]") +eqcharset(compile"[][]", m.S"[]") +eqcharset(compile"[]-]", m.S"-]") +eqcharset(compile"[-]", m.S"-") +eqcharset(compile"[az-]", m.S"a-z") +eqcharset(compile"[-az]", m.S"a-z") +eqcharset(compile"[a-z]", m.R"az") +eqcharset(compile"[]['\"]", m.S[[]['"]]) + +eqcharset(compile"[^]]", any - "]") +eqcharset(compile"[^][]", any - m.S"[]") +eqcharset(compile"[^]-]", any - m.S"-]") +eqcharset(compile"[^]-]", any - m.S"-]") +eqcharset(compile"[^-]", any - m.S"-") +eqcharset(compile"[^az-]", any - m.S"a-z") +eqcharset(compile"[^-az]", any - m.S"a-z") +eqcharset(compile"[^a-z]", any - m.R"az") +eqcharset(compile"[^]['\"]", any - m.S[[]['"]]) + +-- tests for comments in 're' +e = compile[[ +A <- _B -- \t \n %nl .<> <- -> -- +_B <- 'x' --]] +assert(e:match'xy' == 2) + +-- tests for 're' with pre-definitions +defs = {digits = m.R"09", letters = m.R"az", _=m.P"__"} +e = compile("%letters (%letters / %digits)*", defs) +assert(e:match"x123" == 5) +e = compile("%_", defs) +assert(e:match"__" == 3) + +e = compile([[ + S <- A+ + A <- %letters+ B + B <- %digits+ +]], defs) + +e = compile("{[0-9]+'.'?[0-9]*} -> sin", math) +assert(e:match("2.34") == math.sin(2.34)) + + +function eq (_, _, a, b) return a == b end + +c = re.compile([[ + longstring <- '[' {:init: '='* :} '[' close + close <- ']' =init ']' / . close +]]) + +assert(c:match'[==[]]===]]]]==]===[]' == 17) +assert(c:match'[[]=]====]=]]]==]===[]' == 14) +assert(not c:match'[[]=]====]=]=]==]===[]') + +c = re.compile" '[' {:init: '='* :} '[' (!(']' =init ']') .)* ']' =init ']' !. " + +assert(c:match'[==[]]===]]]]==]') +assert(c:match'[[]=]====]=][]==]===[]]') +assert(not c:match'[[]=]====]=]=]==]===[]') + +assert(re.find("hi alalo", "{:x:..:} =x") == 4) +assert(re.find("hi alalo", "{:x:..:} =x", 4) == 4) +assert(not re.find("hi alalo", "{:x:..:} =x", 5)) +assert(re.find("hi alalo", "{'al'}", 5) == 6) +assert(re.find("hi aloalolo", "{:x:..:} =x") == 8) +assert(re.find("alo alohi x x", "{:word:%w+:}%W*(=word)!%w") == 11) + +-- re.find discards any captures +local a,b,c = re.find("alo", "{.}{'o'}") +assert(a == 2 and b == 3 and c == nil) + +local function match (s,p) + local i,e = re.find(s,p) + if i then return s:sub(i, e) end +end +assert(match("alo alo", '[a-z]+') == "alo") +assert(match("alo alo", '{:x: [a-z]+ :} =x') == nil) +assert(match("alo alo", "{:x: [a-z]+ :} ' ' =x") == "alo alo") + +assert(re.gsub("alo alo", "[abc]", "x") == "xlo xlo") +assert(re.gsub("alo alo", "%w+", ".") == ". .") +assert(re.gsub("hi, how are you", "[aeiou]", string.upper) == + "hI, hOw ArE yOU") + +s = 'hi [[a comment[=]=] ending here]] and [=[another]]=]]' +c = re.compile" '[' {:i: '='* :} '[' (!(']' =i ']') .)* ']' { =i } ']' " +assert(re.gsub(s, c, "%2") == 'hi and =]') +assert(re.gsub(s, c, "%0") == s) +assert(re.gsub('[=[hi]=]', c, "%2") == '=') + +assert(re.find("", "!.") == 1) +assert(re.find("alo", "!.") == 4) + +function addtag (s, i, t, tag) t.tag = tag; return i, t end + +c = re.compile([[ + doc <- block !. + block <- (start {| (block / { [^<]+ })* |} end?) => addtag + start <- '<' {:tag: [a-z]+ :} '>' + end <- '' +]], {addtag = addtag}) + +x = c:match[[ +hihellobuttotheend]] +checkeq(x, {tag='x', 'hi', {tag = 'b', 'hello'}, 'but', + {'totheend'}}) + + +-- tests for look-ahead captures +x = {re.match("alo", "&(&{.}) !{'b'} {&(...)} &{..} {...} {!.}")} +checkeq(x, {"", "alo", ""}) + +assert(re.match("aloalo", + "{~ (((&'al' {.}) -> 'A%1' / (&%l {.}) -> '%1%1') / .)* ~}") + == "AallooAalloo") + +-- bug in 0.9 (and older versions), due to captures in look-aheads +x = re.compile[[ {~ (&(. ([a-z]* -> '*')) ([a-z]+ -> '+') ' '*)* ~} ]] +assert(x:match"alo alo" == "+ +") + +-- valid capture in look-ahead (used inside the look-ahead itself) +x = re.compile[[ + S <- &({:two: .. :} . =two) {[a-z]+} / . S +]] +assert(x:match("hello aloaLo aloalo xuxu") == "aloalo") + + +p = re.compile[[ + block <- {| {:ident:space*:} line + ((=ident !space line) / &(=ident space) block)* |} + line <- {[^%nl]*} %nl + space <- '_' -- should be ' ', but '_' is simpler for editors +]] + +t= p:match[[ +1 +__1.1 +__1.2 +____1.2.1 +____ +2 +__2.1 +]] +checkeq(t, {"1", {"1.1", "1.2", {"1.2.1", "", ident = "____"}, ident = "__"}, + "2", {"2.1", ident = "__"}, ident = ""}) + + +-- nested grammars +p = re.compile[[ + s <- a b !. + b <- ( x <- ('b' x)? ) + a <- ( x <- 'a' x? ) +]] + +assert(p:match'aaabbb') +assert(p:match'aaa') +assert(not p:match'bbb') +assert(not p:match'aaabbba') + +-- testing groups +t = {re.match("abc", "{:S <- {:.:} {S} / '':}")} +checkeq(t, {"a", "bc", "b", "c", "c", ""}) + +t = re.match("1234", "{| {:a:.:} {:b:.:} {:c:.{.}:} |}") +checkeq(t, {a="1", b="2", c="4"}) +t = re.match("1234", "{|{:a:.:} {:b:{.}{.}:} {:c:{.}:}|}") +checkeq(t, {a="1", b="2", c="4"}) +t = re.match("12345", "{| {:.:} {:b:{.}{.}:} {:{.}{.}:} |}") +checkeq(t, {"1", b="2", "4", "5"}) +t = re.match("12345", "{| {:.:} {:{:b:{.}{.}:}:} {:{.}{.}:} |}") +checkeq(t, {"1", "23", "4", "5"}) +t = re.match("12345", "{| {:.:} {{:b:{.}{.}:}} {:{.}{.}:} |}") +checkeq(t, {"1", "23", "4", "5"}) + + +-- testing pre-defined names +assert(os.setlocale("C") == "C") + +function eqlpeggsub (p1, p2) + local s1 = cs2str(re.compile(p1)) + local s2 = string.gsub(allchar, "[^" .. p2 .. "]", "") + -- if s1 ~= s2 then print(#s1,#s2) end + assert(s1 == s2) +end + + +eqlpeggsub("%w", "%w") +eqlpeggsub("%a", "%a") +eqlpeggsub("%l", "%l") +eqlpeggsub("%u", "%u") +eqlpeggsub("%p", "%p") +eqlpeggsub("%d", "%d") +eqlpeggsub("%x", "%x") +eqlpeggsub("%s", "%s") +eqlpeggsub("%c", "%c") + +eqlpeggsub("%W", "%W") +eqlpeggsub("%A", "%A") +eqlpeggsub("%L", "%L") +eqlpeggsub("%U", "%U") +eqlpeggsub("%P", "%P") +eqlpeggsub("%D", "%D") +eqlpeggsub("%X", "%X") +eqlpeggsub("%S", "%S") +eqlpeggsub("%C", "%C") + +eqlpeggsub("[%w]", "%w") +eqlpeggsub("[_%w]", "_%w") +eqlpeggsub("[^%w]", "%W") +eqlpeggsub("[%W%S]", "%W%S") + +re.updatelocale() + + +-- testing nested substitutions x string captures + +p = re.compile[[ + text <- {~ item* ~} + item <- macro / [^()] / '(' item* ')' + arg <- ' '* {~ (!',' item)* ~} + args <- '(' arg (',' arg)* ')' + macro <- ('apply' args) -> '%1(%2)' + / ('add' args) -> '%1 + %2' + / ('mul' args) -> '%1 * %2' +]] + +assert(p:match"add(mul(a,b), apply(f,x))" == "a * b + f(x)") + +rev = re.compile[[ R <- (!.) -> '' / ({.} R) -> '%2%1']] + +assert(rev:match"0123456789" == "9876543210") + + +-- testing error messages in re + +local function errmsg (p, err) + checkerr(err, re.compile, p) +end + +errmsg('aaaa', "rule 'aaaa'") +errmsg('a', 'outside') +errmsg('b <- a', 'undefined') +errmsg("x <- 'a' x <- 'b'", 'already defined') +errmsg("'a' -", "near '-'") + + +print"OK" + +