/* token.c: Convert text into tokens for yyparse(). */
#define _POSIX_SOURCE
#include <ctype.h>
#include "x.tab.h"
#include "token.h"
#include "memory.h"
#include "data.h"
#define NUM_RESERVED_WORDS (sizeof(reserved_words) / sizeof(*reserved_words))
#define SUBSCRIPT(c) ((c) & 0x7f)
static char *string_token(char *s, int len, int *token_len);
static char *identifier_token(char *s, int len, int *token_len);
static Data *code;
static int num_lines, cur_line, cur_pos;
/* Words with same first letters must be together. */
static struct {
char *word;
int token;
} reserved_words[] = {
{ "any", ANY },
{ "arg", ARG },
{ "atomic", ATOMIC },
{ "break", BREAK },
{ "case", CASE },
{ "catch", CATCH },
{ "continue", CONTINUE },
{ "default", DEFAULT },
{ "disallow_overrides", DISALLOW_OVERRIDES },
{ "else", ELSE },
{ "for", FOR },
{ "fork", FORK },
{ "handler", HANDLER },
{ "if", IF },
{ "in", IN },
{ "non_atomic", NON_ATOMIC },
{ "pass", PASS },
{ "return", RETURN },
{ "switch", SWITCH },
{ "to", TO },
{ "var", VAR },
{ "while", WHILE },
{ "with", WITH },
{ "(|", CRITLEFT },
{ "(>", PROPLEFT },
{ "<)", PROPRIGHT },
{ "<=", LE },
{ "..", UPTO },
{ "|)", CRITRIGHT },
{ "||", OR },
{ "#[", START_DICT },
{ "`[", START_BUFFER },
{ "&&", AND },
{ "==", EQ },
{ "!=", NE },
{ ">=", GE }
};
static struct {
int start;
int num;
} starting[128];
extern Pile *compiler_pile; /* For allocating strings. */
void init_token(void)
{
int i, c;
for (i = 0; i < 128; i++)
starting[i].start = -1;
i = 0;
while (i < NUM_RESERVED_WORDS) {
c = SUBSCRIPT(*reserved_words[i].word);
starting[c].start = i;
starting[c].num = 1;
for (i++; i < NUM_RESERVED_WORDS && *reserved_words[i].word == c; i++)
starting[c].num++;
}
}
void lex_start(Data *code_arg, int lines)
{
code = code_arg;
num_lines = lines;
cur_line = cur_pos = 0;
}
/* Returns if s can be parsed as an identifier. */
int is_valid_ident(char *s)
{
while (*s) {
if (!isalnum(*s) && *s != '_')
return 0;
s++;
}
return 1;
}
int yylex(void)
{
char *s = NULL, *word;
int len = 0, i, j, start, type;
/* Find the beginning of the next token. */
while (cur_line < num_lines) {
/* Fetch text and length of current line. */
s = data_sptr(&code[cur_line]);
len = code[cur_line].u.substr.span;
/* Scan over line for a non-space character. */
while (cur_pos < len && isspace(s[cur_pos]))
cur_pos++;
/* If we didn't hit the end, return the character we stopped at. */
if (cur_pos < len)
break;
/* Go on to the next line. */
cur_line++;
cur_pos = 0;
}
if (cur_line == num_lines) {
return 0;
} else {
s += cur_pos;
len -= cur_pos;
}
/* Check if it's a reserved word. */
start = starting[SUBSCRIPT(*s)].start;
if (start != -1) {
for (i = start; i < start + starting[SUBSCRIPT(*s)].num; i++) {
/* Compare remaining letters of word against s. */
word = reserved_words[i].word;
for (j = 1; j < len && word[j]; j++) {
if (s[j] != word[j])
break;
}
/* Comparison fails if we didn't match all the characters in word,
* or if word is an identifier and the next character in s isn't
* punctuation. */
if (word[j])
continue;
if (isalpha(*s) && j < len && (isalnum(s[j]) || s[j] == '_'))
continue;
cur_pos += j;
return reserved_words[i].token;
}
}
/* Check if it's an identifier. */
if (isalpha(*s) || *s == '_') {
yylval.s = identifier_token(s, len, &i);
cur_pos += i;
return IDENT;
}
/* Check if it's a number. */
if (isdigit(*s)) {
/* Convert the string to a number. */
yylval.num = 0;
while (len && isdigit(*s)) {
yylval.num = yylval.num * 10 + (*s - '0');
s++, cur_pos++, len--;
}
return INTEGER;
}
/* Check if it's a string. */
if (*s == '"') {
yylval.s = string_token(s, len, &i);
cur_pos += i;
return STRING;
}
/* Check if it's an object literal, symbol, or error code. */
if ((*s == '$' || *s == '\'' || *s == '~') && len > 1) {
type = ((*s == '$') ? NAME : ((*s == '\'') ? SYMBOL : ERROR));
if (s[1] == '"') {
yylval.s = string_token(s + 1, len - 1, &i);
cur_pos += i + 1;
return type;
} else if (isalnum(s[1]) || s[1] == '_') {
yylval.s = identifier_token(s + 1, len - 1, &i);
cur_pos += i + 1;
return type;
}
}
/* Check if it's a comment. */
if (len >= 2 && *s == '/' && s[1] == '/') {
/* Copy in text after //, and move to next line. */
yylval.s = PMALLOC(compiler_pile, char, len - 1);
MEMCPY(yylval.s, s + 2, len - 2);
yylval.s[len - 2] = 0;
cur_line++;
cur_pos = 0;
return COMMENT;
}
/* Check if it's a dbref. */
if (len >= 2 && *s == '#' && isdigit(s[1])) {
/* Convert the string to a number. */
s++, cur_pos++, len--;
yylval.num = 0;
while (len && isdigit(*s)) {
yylval.num = yylval.num * 10 + (*s - '0');
s++, cur_pos++, len--;
}
return DBREF;
}
/* None of the above. */
cur_pos++;
return *s;
}
int cur_lineno(void)
{
return cur_line + 1;
}
static char *string_token(char *s, int len, int *token_len)
{
int count = 0, i;
char *p, *q;
/* Count characters in string. */
for (i = 1; i < len && s[i] != '"'; i++) {
if (s[i] == '\\' && i < len - 1)
i++;
count++;
}
/* Allocate space and copy. */
q = p = PMALLOC(compiler_pile, char, count + 1);
for (i = 1; i < len && s[i] != '"'; i++) {
if (s[i] == '\\' && i < len - 1)
i++;
*q++ = s[i];
}
*q = 0;
*token_len = (i == len) ? i : i + 1;
return p;
}
/* Assumption: isalpha(*s) || *s == '_'. */
static char *identifier_token(char *s, int len, int *token_len)
{
int count = 1, i;
char *p;
/* Count characters in identifier. */
for (i = 1; i < len && (isalnum(s[i]) || s[i] == '_'); i++)
count++;
/* Allocate space and copy. */
p = PMALLOC(compiler_pile, char, count + 1);
MEMCPY(p, s, count);
p[count] = 0;
*token_len = count;
return p;
}