diff options
author | Kimplul <kimi.h.kuparinen@gmail.com> | 2024-10-18 22:35:27 +0300 |
---|---|---|
committer | Kimplul <kimi.h.kuparinen@gmail.com> | 2024-10-23 18:25:23 +0300 |
commit | 7c5f098511b8f612a17f4ccdd8a4924c325d37e1 (patch) | |
tree | 4a97d802557bbffcadfcfd776c8e4afdb31d96aa /src | |
parent | 18262dcbecd97591dd15ee9274a81abb8c2ba1c4 (diff) | |
download | lyn-7c5f098511b8f612a17f4ccdd8a4924c325d37e1.tar.gz lyn-7c5f098511b8f612a17f4ccdd8a4924c325d37e1.zip |
initial parser+lexer
+ AST might still change, also not a *huge* fan of having to reverse
all arrays but I guess it's unlikely to be a significant bottleneck
Diffstat (limited to 'src')
-rw-r--r-- | src/ast.c | 53 | ||||
-rw-r--r-- | src/debug.c | 108 | ||||
-rw-r--r-- | src/lexer.l | 88 | ||||
-rw-r--r-- | src/lyn.c | 89 | ||||
-rw-r--r-- | src/main.c | 16 | ||||
-rw-r--r-- | src/parser.y | 290 | ||||
-rw-r--r-- | src/source.mk | 2 |
7 files changed, 646 insertions, 0 deletions
diff --git a/src/ast.c b/src/ast.c new file mode 100644 index 0000000..faeae6e --- /dev/null +++ b/src/ast.c @@ -0,0 +1,53 @@ +#include <stdio.h> +#include <lyn/ast.h> + +struct ast *reverse_ast_list(struct ast *root) +{ + struct ast *new_root = NULL; + while (root) { + struct ast *next = root->next; + root->next = new_root; + new_root = root; + root = next; + } + + return new_root; +} + +#define dump(depth, fmt, ...) \ + do { \ + printf("//%*s", 2 * depth, ""); \ + printf(fmt,##__VA_ARGS__); \ + } while (0) + +void ast_dump(int depth, struct ast *ast) +{ + switch (ast->kind) { + case LYN_ID: dump(depth, "%s\n", ast->s); return; + case LYN_STR: dump(depth, "\"%s\"\n", ast->s); return; + case LYN_INT: dump(depth, "%lld\n", ast->i); return; + case LYN_FLOAT: dump(depth, "%f\n", ast->d); return; + case LYN_CMD: + dump(depth, "CMD\n"); + ast_dump_list(depth + 1, ast->args); + return; + + case LYN_LIST: + dump(depth, "LIST\n"); + ast_dump_list(depth + 1, ast->args); + return; + + case LYN_APPLY: + dump(depth, "APPLY\n"); + ast_dump_list(depth + 1, ast->args); + return; + } +} + +void ast_dump_list(int depth, struct ast *ast) +{ + while (ast) { + ast_dump(depth, ast); + ast = ast->next; + } +} diff --git a/src/debug.c b/src/debug.c new file mode 100644 index 0000000..7226640 --- /dev/null +++ b/src/debug.c @@ -0,0 +1,108 @@ +#include <stdarg.h> +#include <string.h> + +#include <lyn/debug.h> + +/** + * Get string representation of issue_level. + * + * @param level issue_level to get string representation for. + * @return \p level as a string. + */ +const char *issue_level_str(enum issue_level level) +{ + switch (level) { + case SRC_INFO: return "info"; + case SRC_WARN: return "warn"; + case SRC_ERROR: return "error"; + } + + return "unknown"; +} + +/** + * Find position in file buffer where line number \p no + * starts. Lines are assumed to be one-indexed, with + * \p no = \c 0 and \p no = \c 1 both considered the first line. + * + * @param buf Buffer to look in. + * @param no Line number whose start to look for. + * @return Pointer to location in buffer where line number \p no + * starts. + */ +static const char *find_lineno(const char *buf, size_t no) +{ + if (no == 0 || no == 1) + return buf; + + char c; + while ((c = *buf)) { + buf++; + + if (c == '\n') + no--; + + if (no == 1) + break; + } + + return buf; +} + +/** + * Helper for printing out an issue. + * + * @param issue Issue context. + * @param fmt Format string. Follows standard printf() formatting. + * @param args Arguments for \p fmt. + */ +static void _issue(struct src_issue issue, const char *fmt, va_list args) +{ + /* get start and end of current line in buffer */ + const char *line_start = find_lineno(issue.fctx.fbuf, + (size_t)issue.loc.first_line); + const char *line_end = strchr(line_start, '\n'); + if (!line_end) + line_end = strchr(line_start, 0); + + const int line_len = (int)(line_end - line_start); + + fprintf(stderr, "%s:%i:%i: %s: ", issue.fctx.fname, + issue.loc.first_line, + issue.loc.first_col, + issue_level_str(issue.level)); + + vfprintf(stderr, fmt, args); + fputc('\n', stderr); + + int lineno_len = snprintf(NULL, 0, "%i", issue.loc.first_line); + fputc(' ', stderr); + fprintf(stderr, "%i | ", issue.loc.first_line); + + fprintf(stderr, "%.*s\n", line_len, line_start); + + for (int i = 0; i < lineno_len + 2; ++i) + fputc(' ', stderr); + + fprintf(stderr, "| "); + + for (int i = 0; i < issue.loc.first_col - 1; ++i) + fputc(line_start[i] == '\t' ? '\t' : ' ', stderr); + + for (int i = issue.loc.first_col; i < issue.loc.last_col; ++i) { + if (i == issue.loc.first_col) + fputc('^', stderr); + else + fputc('~', stderr); + } + + fputc('\n', stderr); +} + +void src_issue(struct src_issue issue, const char *err_msg, ...) +{ + va_list args; + va_start(args, err_msg); + _issue(issue, err_msg, args); + va_end(args); +} diff --git a/src/lexer.l b/src/lexer.l new file mode 100644 index 0000000..3c611ab --- /dev/null +++ b/src/lexer.l @@ -0,0 +1,88 @@ +/* SPDX-License-Identifier: copyleft-next-0.3.1 */ +/* Copyright 2023 Kim Kuparinen < kimi.h.kuparinen@gmail.com > */ + +%option reentrant noyywrap nounput noinput nodefault +%{ +#define FROM_LEXER +#include <lyn/parser.h> +#include <lyn/debug.h> + +static void update_yylloc(struct parser *parser, YYLTYPE *lloc, const char *text) +{ + (void)parser; + + lloc->first_line = lloc->last_line; + lloc->first_column = lloc->last_column; + + for (size_t i = 0; text[i] != 0; ++i) { + if (text[i] == '\n') { + lloc->last_line++; + /* flex uses 1 based indexing */ + lloc->last_column = 1; + } else { + lloc->last_column++; + } + } +} + +#define YY_USER_ACTION update_yylloc(parser, yylloc, yytext); +%} +ID [^(){};[:space:]]+ +STRING \"(\\.|[^"\\])*\" + +HEX 0[xX][0-9a-fA-F]+ +DEC -?[0-9]+ +OCT 0[0-8]+ +BIN 0b[0-1]+ + +INT {HEX}|{DEC}|{OCT}|{BIN} + +HEXF [+-]?0[xX][0-9a-fA-F]+([pP][+-]?[0-9]+) +DECF [+-]?[0-9]+[.]([eE]?[+-]?[0-9]+)?[fF]? + +FLOAT {HEXF}|{DECF} + +%% +"#".* {/* skip line comments */} + +"(" {return LPAREN;} +")" {return RPAREN;} +"{" {return LBRACE;} +"}" {return RBRACE;} +";" {return SEMICOLON;} +"\n" {return NL;} + +{STRING} { + /* seems risky, I know, but letting the parser choose when to allocate a + * new string seems to help with syntax error cleanup */ + yylval->str = strdup(yytext); + return STRING; +} + +{INT} { + yylval->integer = strtoull(yytext, 0, 0); + return INT; +} + +{FLOAT} { + yylval->floating = strtod(yytext, 0); + return FLOAT; +} + +{ID} { + yylval->str = strdup(yytext); + return ID; +} + +[^\n[:graph:]]+ {/* skip whitespace */} + +. { + struct src_issue issue; + issue.level = SRC_ERROR; + issue.loc = src_loc(*yylloc); + issue.fctx.fbuf = parser->buf; + issue.fctx.fname = parser->fname; + src_issue(issue, "Unexpected token: %s", yytext); + parser->failed = true; +} +%% diff --git a/src/lyn.c b/src/lyn.c new file mode 100644 index 0000000..2527677 --- /dev/null +++ b/src/lyn.c @@ -0,0 +1,89 @@ +#include <errno.h> +#include <stdbool.h> +#include <string.h> +#include <stdio.h> +#include <limits.h> +#include <stdlib.h> + +#include <lyn/lyn.h> +#include <lyn/parser.h> +#include <lyn/debug.h> + +struct lyn lyn_create() +{ + return (struct lyn){}; +} + +int lyn_eval_str(struct lyn *lyn, const char *name, const char *str) +{ + struct parser *p = create_parser(); + if (!p) + return -1; + + parse(p, name, str); + struct ast *ast = p->tree; + bool failed = p->failed; + destroy_parser(p); + + if (!failed) { + ast_dump_list(0, ast); + } + + return failed; +} + +/** + * Read whole file into a buffer and return pointer to buffer. + * Possibly kind of silly to have both \p file and \p f. + * Apparently there's no standardized way to get the file name of a + * file pointer. + * + * @param file Name of file to read. + * @param f File pointer. + * @return Pointer to buffer with file contents. + */ +static char *read_file(const char *file, FILE *f) +{ + fseek(f, 0, SEEK_END); + /** @todo check how well standardized this actually is */ + long s = ftell(f); + if (s == LONG_MAX) { + error("%s might be a directory", file); + return NULL; + } + + fseek(f, 0, SEEK_SET); + + char *buf = malloc((size_t)(s + 1)); + if (!buf) + return NULL; + + fread(buf, (size_t)(s + 1), 1, f); + /* remember terminating null */ + buf[s] = 0; + return buf; +} + +int lyn_eval_file(struct lyn *lyn, const char *fname) +{ + FILE *f = fopen(fname, "rb"); + if (!f) { + error("failed opening %s: %s\n", fname, strerror(errno)); + return -1; + } + + char *buf = read_file(fname, f); + fclose(f); + + if (!buf) + return -1; + + int ret = lyn_eval_str(lyn, fname, buf); + free(buf); + + return ret; +} + +void lyn_destroy(struct lyn *lyn) +{ +} diff --git a/src/main.c b/src/main.c new file mode 100644 index 0000000..85dd445 --- /dev/null +++ b/src/main.c @@ -0,0 +1,16 @@ +#include <lyn/lyn.h> +#include <lyn/debug.h> + +int main(int argc, char *argv[]) +{ + if (argc != 2) { + error("wrong number of arguments (should be just one for a file)"); + return -1; + } + + struct lyn lyn = lyn_create(); + int ret = lyn_eval_file(&lyn, argv[1]); + lyn_destroy(&lyn); + + return ret; +} diff --git a/src/parser.y b/src/parser.y new file mode 100644 index 0000000..3e0156f --- /dev/null +++ b/src/parser.y @@ -0,0 +1,290 @@ +/* SPDX-License-Identifier: copyleft-next-0.3.1 */ +/* Copyright 2023 Kim Kuparinen < kimi.h.kuparinen@gmail.com > */ + +%{ + +/* TODO: clean up this mess and I guess fix location tracking, it works for the + * parser but each ast node should also get some location data + * I'm trying something over in ast.c, but I'm not sure about it + */ + +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <assert.h> + +#include <lyn/parser.h> +#include <lyn/ast.h> + +%} + +%locations + +%define parse.trace +%define parse.error verbose +%define api.pure full +%define lr.type ielr + +%lex-param {void *scanner} {struct parser *parser} +%parse-param {void *scanner} {struct parser* parser} + +%union { + struct ast *ast; + char *str; + long long integer; + double floating; +}; + +%token <str> STRING +%token <str> ID +%token <integer> INT +%token <floating> FLOAT + +%token LPAREN "(" +%token RPAREN ")" +%token LBRACE "{" +%token RBRACE "}" +%token SEMICOLON ";" +%token NL "nl" + +%nterm <ast> arg args rev_args +%nterm <ast> cmd cmds rev_cmds + +%{ + +/** Modifies the signature of yylex to fit our parser better. */ +#define YY_DECL int yylex(YYSTYPE *yylval, YYLTYPE *yylloc, \ + void *yyscanner, struct parser *parser) + +/** + * Declare yylex. + * + * @param yylval Bison current value. + * @param yylloc Bison location info. + * @param yyscanner Flex scanner. + * @param parser Current parser state. + * @return \c 0 when succesful, \c 1 otherwise. + * More info on yylex() can be found in the flex manual. + */ +YY_DECL; + +/** + * Gobble tokens until we reach the next interesting feature. + * Interesting features are generally new statements. + * Mainly intended for trying to get to a sensible + * location to continue parser after an error has occured. + * + * @param yylval Current parser value. + * @param yylloc Parser location info. + * @param scanner Lex scanner. + * @param parser Current parser. + * @return \c 0 on success, non-zero otherwise. + */ +static int next_interesting_feature(YYSTYPE *yylval, YYLTYPE *yylloc, + void *scanner, struct parser *parser); + +/** + * Convert bison location info to our own source location info. + * + * @param yylloc Bison location info. + * @return Internal location info. + */ +static struct src_loc src_loc(YYLTYPE yylloc); + +/** + * Print parsing error. + * Automatically called by bison. + * + * @param yylloc Location of error. + * @param lexer Lexer. + * @param parser Parser state. + * @param msg Message to print. + */ +static void yyerror(YYLTYPE *yylloc, void *lexer, + struct parser *parser, const char *msg); + +/** + * Try to convert escape code to its actual value. + * I.e. '\n' -> 0x0a. + * + * @param c Escape character without backslash. + * @return Corresponding value. + */ +static char match_escape(char c); + +/** + * Similar to strdup() but skips quotation marks that would + * otherwise be included. + * I.e. "something" -> something. + * + * @param s String to clone, with quotation marks surrounding it. + * @return Identical string but without quotation marks around it. + */ +static char *strip(const char *s); + +%} + +%start input; +%% + +arg + : "(" cmds ")" {$$ = gen_apply($2);} + | "{" cmds "}" {$$ = gen_list($2);} + | ID {$$ = gen_id($1);} + | STRING {$$ = gen_str($1);} + | INT {$$ = gen_int($1);} + | FLOAT {$$ = gen_float($1);} + +rev_args + : rev_args arg {$$ = $2; $$->next = $1;} + | arg + +args + : rev_args {$$ = reverse_ast_list($1);} + +sep + : sep ";" + | sep NL + | ";" + | NL + +cmd + : args {$$ = gen_cmd($1);} + +rev_cmds + : rev_cmds sep cmd {$$ = $3; $$->next = $1;} + | cmd + +cmds + : rev_cmds {$$ = reverse_ast_list($1);} + | rev_cmds sep {$$ = reverse_ast_list($1);} + | sep rev_cmds {$$ = reverse_ast_list($2);} + | sep rev_cmds sep {$$ = reverse_ast_list($2);} + | {$$ = NULL;} + +input + : cmds {parser->tree = gen_list($1);} + +%% + +#include "gen_lexer.inc" + +/* I'm not convinced this is foolproof quite yet, more testing would be nice. */ +static int next_interesting_feature(YYSTYPE *yylval, YYLTYPE *yylloc, + void *scanner, struct parser *parser) +{ + size_t depth = 0; + while (1) { + int ret = yylex(yylval, yylloc, scanner, parser); + if (ret == LBRACE) { + depth++; + continue; + } + + if (ret == RBRACE && depth > 0) + depth--; + + if (ret == RBRACE && depth == 0) + return 0; + + if (ret == SEMICOLON && depth == 0) + return 0; + + /* return fatal error and parser should abort */ + if (ret == YYEOF) + /* some error for unmatched braces would be cool I think */ + return 1; + } +} + + +static struct src_loc src_loc(YYLTYPE yylloc) +{ + struct src_loc loc; + loc.first_line = yylloc.first_line; + loc.last_line = yylloc.last_line; + loc.first_col = yylloc.first_column; + loc.last_col = yylloc.last_column; + return loc; +} + +static void yyerror(YYLTYPE *yylloc, void *lexer, + struct parser *parser, const char *msg) +{ + (void)lexer; + + struct src_issue issue; + issue.level = SRC_ERROR; + issue.loc = src_loc(*yylloc); + issue.fctx.fbuf = parser->buf; + issue.fctx.fname = parser->fname; + src_issue(issue, msg); +} + +static char match_escape(char c) +{ + switch (c) { + case '\'': return '\''; + case '\\': return '\\'; + case 'a': return '\a'; + case 'b': return '\b'; + case 'f': return '\f'; + case 'n': return '\n'; + case 'r': return '\r'; + case 't': return '\t'; + case 'v': return '\v'; + } + + return c; +} + +static char *strip(const char *str) +{ + const size_t len = strlen(str) + 1; + char *buf = malloc(len); + if (!buf) { + /* should probably try to handle the error in some way... */ + error("failed allocating buffer for string clone"); + free((void *)str); + return NULL; + } + + /* skip quotation marks */ + size_t j = 0; + for (size_t i = 1; i < len - 2; ++i) { + char c = str[i]; + + if (c == '\\') + c = match_escape(str[++i]); + + buf[j++] = c; + } + + buf[j] = 0; + free((void *)str); + return buf; + +} + +struct parser *create_parser() +{ + return calloc(1, sizeof(struct parser)); +} + +void destroy_parser(struct parser *p) +{ + yylex_destroy(p->lexer); + free(p); +} + +void parse(struct parser *p, const char *fname, const char *buf) +{ + p->fname = fname; + p->buf = buf; + + p->failed = false; + + yylex_init(&p->lexer); + yy_scan_string(buf, p->lexer); + yyparse(p->lexer, p); +} diff --git a/src/source.mk b/src/source.mk new file mode 100644 index 0000000..caa476c --- /dev/null +++ b/src/source.mk @@ -0,0 +1,2 @@ +SRC_LOCAL != echo src/*.c +LYN_SOURCES := $(LYN_SOURCES) $(SRC_LOCAL) |