/*
// Full copyright information is available in the file ../doc/CREDITS
//
// Convert text into tokens for yyparse().
*/
#include "defs.h"
#include <ctype.h>
#include "token.h"
/* Number of entries in the reserved_words table. */
#define NUM_RESERVED_WORDS (sizeof(reserved_words) / sizeof(*reserved_words))

/* Fold a character into the 0..127 index range of the starting[] table. */
#define SUBSCRIPT(c) ((c) & 0x7f)

INTERNAL char *string_token(char *s, Int len, Int *token_len);
INTERNAL char *identifier_token(char *s, Int len, Int *token_len);

/* List of source lines being tokenized; set by lex_start(). */
static cList *code;

/* Lexer position: index of the current line in code, and the offset of the
 * next unread character within that line.  The type is spelled out because
 * implicit int is no longer valid C. */
static int cur_line, cur_pos;
/*
 * Reserved words and multi-character operators recognized by the lexer,
 * each paired with the token value yylex() returns for it.
 *
 * Words with same first letters must be together: init_token() records a
 * single contiguous run of entries per first character.  Within a run the
 * entries are tried in order and the first match wins, so an operator must
 * come before any shorter operator that is a prefix of it (e.g. "||" before
 * "|", "==" before "=").
 */
static struct {
char *word;
Int token;
} reserved_words[] = {
{ "any", ANY },
{ "arg", ARG },
{ "break", BREAK },
{ "case", CASE },
{ "catch", CATCH },
{ "continue", CONTINUE },
{ "default", DEFAULT },
{ "disallow_overrides", DISALLOW_OVERRIDES },
{ "else", ELSE },
{ "filter", OP_FILTER },
{ "find", OP_FIND },
{ "for", FOR },
{ "fork", FORK },
{ "handler", HANDLER },
{ "hash", OP_MAPHASH },
{ "if", IF },
{ "in", OP_IN },
{ "map", OP_MAP },
{ "pass", PASS },
{ "return", RETURN },
{ "switch", SWITCH },
{ "to", TO },
{ "var", VAR },
{ "where", WHERE },
{ "while", WHILE },
{ "with", WITH },
/* these are around for backwards/future compatibility */
/* cryptic reserved 'words' (operators and bracket-like tokens) */
{ "(|", CRITLEFT },
{ "(>", PROPLEFT },
{ "<)", PROPRIGHT },
{ "<=", LE },
{ "..", UPTO },
{ "|)", CRITRIGHT },
{ "||", OR },
{ "|", OP_COND_OTHER_ELSE },
{ "#[", START_DICT },
{ "`[", START_BUFFER },
{ "&&", AND },
{ "==", EQ },
{ "=", OP_ASSIGN },
{ "!=", NE },
{ ">=", GE },
{ "++", INCREMENT },
{ "+=", PLUS_EQ },
{ "--", DECREMENT },
{ "-=", MINUS_EQ },
{ "/=", DIV_EQ },
{ "*=", MULT_EQ },
{ "?=", OPTIONAL_ASSIGN },
{ "?", OP_COND_IF },
};
/*
 * Lookup table from a first character (masked with SUBSCRIPT) to its run of
 * entries in reserved_words; filled in by init_token().
 */
static struct {
Int start; /* Index of the first entry for this character, or -1 if none. */
Int num; /* Number of consecutive entries starting at that index. */
} starting[128];
extern Pile *compiler_pile; /* For allocating strings. */
/*
 * Build the starting[] lookup table from reserved_words.
 *
 * Every slot is first marked empty (start == -1).  The word list is then
 * walked once; each run of consecutive words sharing a first character is
 * recorded as (index of first word, number of words) in that character's
 * slot.
 */
void init_token(void)
{
    Int slot, idx, first;

    for (slot = 0; slot < 128; slot++)
        starting[slot].start = -1;

    for (idx = 0; idx < NUM_RESERVED_WORDS; ) {
        /* Open a new run at idx. */
        first = SUBSCRIPT(*reserved_words[idx].word);
        starting[first].start = idx;
        starting[first].num = 1;
        idx++;

        /* Absorb the following words that begin with the same character. */
        while (idx < NUM_RESERVED_WORDS && *reserved_words[idx].word == first) {
            starting[first].num++;
            idx++;
        }
    }
}
/*
 * Point the lexer at a new list of source lines and rewind the read
 * position to the first character of the first line.
 */
void lex_start(cList * code_list) {
    cur_pos = 0;
    cur_line = 0;
    code = code_list;
}
/*
 * Returns 1 if every character of the NUL-terminated string s is an
 * identifier character (alphanumeric or '_'), 0 otherwise.
 *
 * The first character gets no special treatment, so a leading digit is
 * accepted, and an empty string yields 1.
 */
Bool is_valid_ident(char *s) {
    for (; *s; s++) {
        /* <ctype.h> functions need an unsigned char value; passing a
         * negative plain char is undefined behaviour. */
        if (!isalnum((unsigned char)*s) && *s != '_')
            return 0;
    }
    return 1;
}
/*
 * Counted-string variant of is_valid_ident(): returns 1 if each of the
 * string_length(str) characters of str is alphanumeric or '_', 0 otherwise.
 * A zero-length string yields 1.
 */
Bool string_is_valid_ident(cStr * str) {
    char *s = string_chars(str);
    int len = string_length(str);

    for (; len; len--, s++) {
        /* <ctype.h> functions need an unsigned char value; passing a
         * negative plain char is undefined behaviour. */
        if (!isalnum((unsigned char)*s) && *s != '_')
            return 0;
    }
    return 1;
}
/*
 * Returns TRUE if the NUL-terminated string s begins with an entry of
 * reserved_words, using the same matching rule as yylex(); FALSE otherwise.
 *
 * An alphabetic reserved word only matches when it is not immediately
 * followed by another identifier character, so "format" does not match
 * "for".  Text following a complete match is otherwise ignored.
 */
Bool is_reserved_word(char *s) {
    int start, i, j, len;
    char *word;

    len = (int)strlen(s);
    start = starting[SUBSCRIPT(*s)].start;
    if (start == -1)
        return FALSE;

    for (i = start; i < start + starting[SUBSCRIPT(*s)].num; i++) {
        word = reserved_words[i].word;

        /* Compare word against s from the first character on.  SUBSCRIPT
         * discards the high bit when choosing the bucket, so the first
         * character has to be checked here as well; otherwise a byte such
         * as 0xE1 would be treated as 'a'. */
        for (j = 0; j < len && word[j]; j++) {
            if (s[j] != word[j])
                break;
        }

        /* Comparison fails if we didn't match all the characters in word,
         * or if word is an identifier and the next character in s isn't
         * punctuation. */
        if (word[j])
            continue;
        if (isalpha((unsigned char)*s) && j < len
            && (isalnum((unsigned char)s[j]) || s[j] == '_'))
            continue;

        return TRUE;
    }
    return FALSE;
}
Int yylex(void)
{
cData *d = (cData *)0;
cStr *line, *float_buf;
char *s = NULL, *word;
Int len = 0, i, j, start, type;
Bool negative;
/* Find the beginning of the next token. */
while (cur_line < list_length(code)) {
/* Fetch text and length of current line. */
d = list_elem(code, cur_line);
line = d->u.str;
s = string_chars(line);
len = string_length(line);
/* Scan over line for a non-space character. */
while (cur_pos < len && isspace(s[cur_pos]))
cur_pos++;
/* If we didn't hit the end, return the character we stopped at. */
if (cur_pos < len)
break;
/* Go on to the next line. */
cur_line++;
cur_pos = 0;
d = (cData *)0;
}
if (!d) {
return 0;
} else {
s += cur_pos;
len -= cur_pos;
}
/* Check if it's a reserved word. */
start = starting[SUBSCRIPT(*s)].start;
if (start != -1) {
for (i = start; i < start + starting[SUBSCRIPT(*s)].num; i++) {
/* Compare remaining letters of word against s. */
word = reserved_words[i].word;
for (j = 1; j < len && word[j]; j++) {
if (s[j] != word[j]) {
break;
}
}
/* Comparison fails if we didn't match all the characters in word,
* or if word is an identifier and the next character in s isn't
* punctuation. */
if (word[j])
continue;
if (isalpha(*s) && j < len && (isalnum(s[j]) || s[j] == '_'))
continue;
cur_pos += j;
return reserved_words[i].token;
}
}
/* Check if it's an identifier. */
if (isalpha(*s) || *s == '_') {
yylval.s = identifier_token(s, len, &i);
cur_pos += i;
return IDENT;
}
/* Check if it's a number. */
if (isdigit(*s)) {
float_buf = string_new(32);
/* Convert the string to a number. */
yylval.num = 0;
while (len && isdigit(*s)) {
float_buf = string_addc(float_buf, *s);
yylval.num = yylval.num * 10 + (*s - '0');
s++, cur_pos++, len--;
}
if ((*s == '.' && isdigit(*(s+1))) || *s == 'e') {
Float f=yylval.num;
f = atof(string_chars(float_buf));
string_discard(float_buf);
if (*s=='.') {
Float muly=1;
s++, cur_pos++, len--;
while (len && isdigit(*s)) {
muly/=10; f+=(*s - '0')*muly;
s++, cur_pos++, len--;
}
}
if (len && *s=='e') {
Int esign=0, evalue=0;
s++, cur_pos++, len--;
if (len && *s=='-') {
esign=1;
s++, cur_pos++, len--;
}
else if (len && *s=='+') {
esign=0;
s++, cur_pos++, len--;
}
while (len && isdigit(*s)) {
evalue=evalue * 10 + (*s - '0');
s++, cur_pos++, len--;
}
if (esign) evalue =- evalue;
if (evalue > 0)
while (evalue--) f*=10;
else
while (evalue++) f/=10;
}
yylval.fnum=f;
return FLOAT;
} else {
string_discard(float_buf);
return INTEGER;
}
}
/* Check if it's a string. */
if (*s == '"') {
yylval.s = string_token(s, len, &i);
cur_pos += i;
return STRING;
}
/* Check if it's an object literal, symbol, or error code. */
if ((*s == '$' || *s == '\'' || *s == '~')) {
type = ((*s == '$') ? OBJNAME : ((*s == '\'') ? SYMBOL : T_ERROR));
if (len > 1 && s[1] == '"') {
yylval.s = string_token(s + 1, len - 1, &i);
cur_pos += i + 1;
return type;
} else if (isalnum(s[1]) || s[1] == '_') {
yylval.s = identifier_token(s + 1, len - 1, &i);
cur_pos += i + 1;
return type;
}
}
/* Check if it's a comment. */
if (len >= 2 && *s == '/' && s[1] == '/') {
/* Copy in text after //, and move to next line. */
yylval.s = PMALLOC(compiler_pile, char, len - 1);
MEMCPY(yylval.s, s + 2, len - 2);
yylval.s[len - 2] = 0;
cur_line++;
cur_pos = 0;
return COMMENT;
}
/* Check if it's a objnum. */
if (*s == '#') {
s++; len--; cur_pos++;
if (len && *s == '-') {
negative = YES;
s++; len--; cur_pos++;
} else {
negative = NO;
}
if (len && isdigit(*s)) {
yylval.num = 0;
while (len && isdigit(*s)) {
yylval.num = yylval.num * 10 + (*s - '0');
s++, cur_pos++, len--;
}
if (negative)
yylval.num = -yylval.num;
} else {
yylval.num = INV_OBJNUM;
}
return OBJNUM;
}
if (len >= 2 && *s == '+' && s[1] == '+') {
s += 2, cur_pos += 2, len -= 2;
return INCREMENT;
}
if (len >= 2 && *s == '-' && s[1] == '-') {
s += 2, cur_pos += 2, len -= 2;
return DECREMENT;
}
/* None of the above. */
cur_pos++;
return *s;
}
/*
 * Report the lexer's current line number, counting from 1 (cur_line itself
 * counts from 0).
 */
Int cur_lineno(void)
{
    return 1 + cur_line;
}
/*
 * Extract the body of a double-quoted string literal.
 *
 * s points at the opening quote and len is the number of characters
 * available from there.  The text up to the closing quote (or up to len if
 * there is none) is copied into a NUL-terminated buffer allocated from
 * compiler_pile, with the escapes \" and \\ reduced to their second
 * character; any other backslash is copied unchanged.
 *
 * *token_len receives the number of source characters consumed, including
 * both quotes when the closing one is present.  Returns the new buffer.
 */
INTERNAL char * string_token(char * s, Int len, Int *token_len) {
    Int pos, nchars = 0;
    char *buf, *out;

    /* First pass: work out how many characters the result holds. */
    pos = 1;
    while (pos < len && s[pos] != '"') {
        if (s[pos] == '\\' && pos < len - 1
            && (s[pos + 1] == '"' || s[pos + 1] == '\\'))
            pos++;
        nchars++;
        pos++;
    }

    /* Second pass: allocate and copy, unescaping along the way. */
    buf = PMALLOC(compiler_pile, char, nchars + 1);
    out = buf;
    pos = 1;
    while (pos < len && s[pos] != '"') {
        if (s[pos] == '\\' && pos < len - 1
            && (s[pos + 1] == '"' || s[pos + 1] == '\\'))
            pos++;
        *out++ = s[pos];
        pos++;
    }
    *out = 0;

    /* Step over the closing quote unless the input ran out first. */
    if (pos == len)
        *token_len = pos;
    else
        *token_len = pos + 1;

    return buf;
}
/*
 * Copy the identifier at the start of s into a new NUL-terminated string
 * allocated from compiler_pile, and store its length in *token_len.
 *
 * The first character is taken without being examined; the identifier then
 * extends over the following alphanumeric or '_' characters, up to len.
 * Callers are expected to have checked s[0] themselves (yylex() passes a
 * letter or '_' for IDENT, and any alphanumeric or '_' after '$', '\'' or
 * '~').
 */
INTERNAL char *identifier_token(char *s, Int len, Int *token_len)
{
    Int count = 1;
    char *p;

    /* Count characters in identifier.  <ctype.h> functions need an
     * unsigned char value; a negative plain char is undefined behaviour. */
    while (count < len
           && (isalnum((unsigned char)s[count]) || s[count] == '_'))
        count++;

    /* Allocate space and copy. */
    p = PMALLOC(compiler_pile, char, count + 1);
    MEMCPY(p, s, count);
    p[count] = 0;

    *token_len = count;
    return p;
}