From c035571d85e3d756804519d82de8b354f3910b29 Mon Sep 17 00:00:00 2001
From: Kimplul <kimi.h.kuparinen@gmail.com>
Date: Thu, 8 Feb 2024 21:19:38 +0200
Subject: project work phase 1

---
 src/core.c    |  68 ++++++++++++++++
 src/date.c    |  72 +++++++++++++++++
 src/debug.c   |  81 ++++++++++++++++++++
 src/lexer.l   | 191 +++++++++++++++++++++++++++++++++++++++++++++
 src/main.c    |  28 +++++++
 src/parser.y  | 243 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 src/source.mk |   4 +
 7 files changed, 687 insertions(+)
 create mode 100644 src/core.c
 create mode 100644 src/date.c
 create mode 100644 src/debug.c
 create mode 100644 src/lexer.l
 create mode 100644 src/main.c
 create mode 100644 src/parser.y
 create mode 100644 src/source.mk

(limited to 'src')

diff --git a/src/core.c b/src/core.c
new file mode 100644
index 0000000..7eca8bd
--- /dev/null
+++ b/src/core.c
@@ -0,0 +1,68 @@
+#include <stdlib.h>
+#include <string.h>
+#include <limits.h>
+#include <errno.h>
+
+#include <posthaste/debug.h>
+#include <posthaste/parser.h>
+#include <posthaste/core.h>
+
+/**
+ * Read whole file into a null-terminated buffer.
+ * Takes both \p fname and \p f since there's no standardized way to get
+ * the file name of a file pointer.
+ *
+ * @param fname Name of file to read, used for error messages.
+ * @param f File pointer to read from.
+ * @return Buffer with file contents, to be freed by caller, or \c NULL.
+ */
+static char *read_file(const char *fname, FILE *f)
+{
+	long s = fseek(f, 0, SEEK_END) ? -1 : ftell(f);
+	if (s < 0 || s == LONG_MAX || fseek(f, 0, SEEK_SET)) {
+		/** @todo should probably do this via fstat or something */
+		error("%s might be a directory", fname);
+		return NULL;
+	}
+
+	char *buf = malloc((size_t)s + 1);
+	if (!buf)
+		return NULL;
+
+	if (fread(buf, 1, (size_t)s, f) != (size_t)s) {
+		error("failed reading %s", fname);
+		free(buf);
+		return NULL;
+	}
+
+	buf[s] = 0;
+	return buf;
+}
+
+/** Read and process file \p fname. @return \c 0 on success, \c -1 otherwise. */
+int run(const char *fname)
+{
+	FILE *f = fopen(fname, "rb");
+	if (!f) {
+		error("failed opening %s: %s\n", fname, strerror(errno));
+		return -1;
+	}
+
+	char *buf = read_file(fname, f);
+	fclose(f);
+	if (!buf)
+		return -1;
+
+	int ret = -1;
+	struct parser *p = create_parser();
+	if (p) {
+		parse(p, fname, buf);
+		ret = p->failed ? -1 : 0;
+		/* eventually do other stuff as well */
+		destroy_parser(p);
+	}
+
+	/* freed on every path, and only once the parser that refers to it is gone */
+	free(buf);
+	return ret;
+}
diff --git a/src/date.c b/src/date.c
new file mode 100644
index 0000000..7e6bc77
--- /dev/null
+++ b/src/date.c
@@ -0,0 +1,72 @@
+#include <time.h>
+
+#include <posthaste/date.h>
+
+/** Parse "YYYY-MM-DD", fields that can't be packed give an invalid date. */
+ph_date_t date_from_string(const char str[static 11])
+{
+	unsigned year  = (str[0] - '0') * 1000 + (str[1] - '0') * 100
+	                 + (str[2] - '0') * 10 + (str[3] - '0');
+	unsigned month = (str[5] - '0') * 10 + (str[6] - '0');
+	unsigned day   = (str[8] - '0') * 10 + (str[9] - '0');
+
+	/* month is packed into 4 bits and day into 5, larger values would
+	 * carry into the next field and pass as some other valid date */
+	if (month > 12 || day > 31)
+		return date_from_numbers(year, 0, 0);
+
+	return date_from_numbers(year, month, day);
+}
+
+ph_date_t date_from_numbers(unsigned year, unsigned month, unsigned day)
+{
+	return (year << 9) | (month << 5) | day; /* day: 5 bits, month: 4 bits */
+}
+
+void date_split(ph_date_t date, unsigned *year, unsigned *month, unsigned *day)
+{
+	if (year != NULL)  *year  = date >> 9;          /* bits 9 and up */
+	if (month != NULL) *month = (date >> 5) & 0xf;  /* bits 5 to 8 */
+	if (day != NULL)   *day   = date & 0x1f;        /* bits 0 to 4 */
+}
+
+/** Format \p date as "YYYY-MM-DD" into \p str, terminating null included. */
+void date_to_string(char str[static 11], ph_date_t date)
+{
+	unsigned year, month, day;
+	date_split(date, &year, &month, &day);
+
+	/* write each field's decimal digits from the last one backwards */
+	for (int i = 3; i >= 0; --i, year /= 10)
+		str[i] = '0' + year % 10;
+	for (int i = 6; i >= 5; --i, month /= 10)
+		str[i] = '0' + month % 10;
+	for (int i = 9; i >= 8; --i, day /= 10)
+		str[i] = '0' + day % 10;
+
+	str[4] = str[7] = '-';
+	str[10] = '\0';
+}
+
+/** @return \c true if \p date is a date that exists in the calendar. */
+bool date_valid(ph_date_t date)
+{
+	unsigned year, month, day;
+	date_split(date, &year, &month, &day);
+
+	/* struct tm counts years from 1900 and months from 0, midday and
+	 * unknown DST keep daylight saving shifts from changing the day */
+	struct tm tm = {0};
+	tm.tm_year = (int)year - 1900;
+	tm.tm_mon = (int)month - 1;
+	tm.tm_mday = (int)day;
+	tm.tm_hour = 12;
+	tm.tm_isdst = -1;
+
+	/* mktime returns -1 if it can't represent the time, and normalizes
+	 * out-of-range fields, so a modified field means an invalid date */
+	if (mktime(&tm) == (time_t)-1)
+		return false;
+	return (int)year == tm.tm_year + 1900 && (int)month == tm.tm_mon + 1
+	       && (int)day == tm.tm_mday;
+}
diff --git a/src/debug.c b/src/debug.c
new file mode 100644
index 0000000..c146719
--- /dev/null
+++ b/src/debug.c
@@ -0,0 +1,81 @@
+#include <stdarg.h>
+#include <string.h>
+
+#include <posthaste/debug.h>
+
+/**
+ * Locate the start of a given line inside a file buffer.
+ *
+ * Line numbers are one-indexed, but \p no = \c 0 is tolerated and treated
+ * like \p no = \c 1, i.e. as the first line.
+ *
+ * If the buffer has fewer than \p no lines, the returned pointer refers to
+ * the terminating null character of \p buf.
+ *
+ * @param buf Null-terminated buffer to look in.
+ * @param no Line number whose start to look for.
+ * @return Pointer to the first character of line \p no within \p buf.
+ */
+static const char *find_lineno(const char *buf, size_t no)
+{
+	/* line 1 starts at buf itself, every newline we step over after
+	 * that brings us one line closer to the requested one */
+	while (no > 1 && *buf != '\0') {
+		if (*buf == '\n')
+			no -= 1;
+
+		/* also steps past the newline that completes the count, so we
+		 * end up on the first character of the wanted line */
+		buf += 1;
+	}
+
+	return buf;
+}
+
+/** Print \p msg for \p issue, followed by the offending source line and a
+ * marker under the affected columns. */
+void vsrc_issue(struct src_issue issue, const char *msg, va_list args)
+{
+	/* the offending line runs until the next newline or end of buffer */
+	const char *line_start = find_lineno(issue.buf, issue.loc.first_line);
+	const char *line_end = strchr(line_start, '\n');
+	if (line_end == NULL)
+		line_end = line_start + strlen(line_start);
+
+	/* <file>:<line>:<col>: <message> */
+	fprintf(stderr, "%s:%i:%i: ", issue.fname,
+	        issue.loc.first_line, issue.loc.first_col);
+	vfprintf(stderr, msg, args);
+	fputc('\n', stderr);
+
+	/* excerpt of the source behind a " <line> | " gutter */
+	int gutter = snprintf(NULL, 0, "%i", issue.loc.first_line) + 2;
+	int line_len = line_end - line_start;
+	fputc(' ', stderr);
+	fprintf(stderr, "%i | ", issue.loc.first_line);
+	fprintf(stderr, "%.*s\n", line_len, line_start);
+
+	/* the marker line gets an equally wide gutter without the number */
+	for (int pad = gutter; pad > 0; --pad)
+		fputc(' ', stderr);
+
+	fprintf(stderr, "| ");
+
+	/* indent to the first column, tabs are kept so the marker lines up */
+	for (int i = 0; i < issue.loc.first_col - 1; ++i)
+		fputc(line_start[i] == '\t' ? '\t' : ' ', stderr);
+
+	/* '^' under the first column, '~' under any following ones */
+	for (int col = issue.loc.first_col; col < issue.loc.last_col; ++col)
+		fputc(col == issue.loc.first_col ? '^' : '~', stderr);
+
+	fputc('\n', stderr);
+}
+
+void src_issue(struct src_issue issue, const char *msg, ...)
+{
+	va_list args; /* the arguments following msg */
+	va_start(args, msg);
+	vsrc_issue(issue, msg, args); /* does the actual reporting */
+	va_end(args);
+}
diff --git a/src/lexer.l b/src/lexer.l
new file mode 100644
index 0000000..6f58bd5
--- /dev/null
+++ b/src/lexer.l
@@ -0,0 +1,191 @@
+%option reentrant noyywrap nounput noinput nodefault
+%{
+
+#include <stdio.h>
+#include <stdarg.h>
+
+#include <posthaste/parser.h>
+#include <posthaste/debug.h>
+
+static void update_yylloc(struct parser *parser, YYLTYPE *lloc, const char *text)
+{
+	(void)parser;
+
+	/* the new token starts where the previous one ended */
+	lloc->first_line = lloc->last_line;
+	lloc->first_column = lloc->last_column;
+
+	for (const char *c = text; *c != '\0'; ++c) {
+		if (*c != '\n') {
+			lloc->last_column++;
+			continue;
+		}
+		lloc->last_line++;
+		lloc->last_column = 1; /* flex uses 1 based indexing */
+	}
+}
+
+/** Report a lexing error at \p loc and mark the parser \p p as failed. */
+static void lex_fail(struct parser *p, struct src_loc loc, const char *msg, ...)
+{
+	struct src_issue issue;
+	issue.loc = loc;
+	issue.fname = p->fname;
+	issue.buf = p->buf;
+
+	va_list args;
+	va_start(args, msg);
+	vsrc_issue(issue, msg, args);
+	va_end(args);
+
+	p->failed = true;
+}
+
+/** Convert date literal \p date, reporting an error and giving 0 if invalid. */
+static ph_date_t lex_date(struct parser *p, struct src_loc loc, const char *date)
+{
+	ph_date_t d = date_from_string(date);
+	if (date_valid(d))
+		return d;
+
+	lex_fail(p, loc, "Not a valid date.");
+	return 0;
+}
+
+/**
+ * Convert the text of an integer literal to its value.
+ *
+ * The literal may start with a minus sign and may contain \c ' digit
+ * separators, which are skipped. Literals longer than 18 characters (sign
+ * excluded) or with a magnitude above 1000000000000 go to lex_fail().
+ *
+ * @param p Parser state, marked as failed if the literal is rejected.
+ * @param loc Location of the literal, used when reporting errors.
+ * @param num Text of the literal.
+ * @return Value of the literal, or \c 0 if it was rejected.
+ */
+static int64_t lex_int(struct parser *p, struct src_loc loc, const char *num)
+{
+	/* jump over minus sign */
+	const char *digits = num[0] == '-' ? num + 1 : num;
+
+	/* with at most 18 characters the sum below can't overflow */
+	if (strlen(digits) > 18) {
+		lex_fail(p, loc, "Literal integer too large");
+		return 0;
+	}
+
+	int64_t sum = 0;
+	for (const char *c = digits; *c != '\0'; ++c) {
+		/* jump over separators */
+		if (*c != '\'')
+			sum = sum * 10 + (*c - '0');
+	}
+
+	if (sum > 1000000000000) {
+		lex_fail(p, loc, "Literal integer too large");
+		return 0;
+	}
+
+	/* the sign was only skipped so far, apply it now */
+	return digits == num ? sum : -sum;
+}
+
+#define YY_USER_ACTION update_yylloc(parser, yylloc, yytext);
+%}
+
+IDENT		[a-z][a-zA-Z0-9_]+
+FUNC_IDENT	[A-Z][a-z0-9_]+
+PROC_IDENT	[A-Z]{2}[A-Z0-9_]*
+
+DATE_LITERAL	[0-9]{4}-[0-9]{2}-[0-9]{2}
+INT_LITERAL	-?[0-9]+('[0-9][0-9][0-9]+)*
+STRING		\"(\\.|[^"\\])*\"
+
+%x SC_COMMENT
+
+%%
+"(%"	{BEGIN(SC_COMMENT);}
+<SC_COMMENT>{
+	"(%"	{parser->comment_nesting += 1;}
+	"%)"	{
+		if (parser->comment_nesting)
+			parser->comment_nesting -= 1;
+		else
+			BEGIN(INITIAL);
+	}
+
+	/* lone '(' and '%' need their own rules to not jam the lexer */
+	[^(%\n]+ {}
+	[(] {}
+	[%] {}
+	\n {}
+}
+
+"("		{return LPAREN;}
+")"		{return RPAREN;}
+"["		{return LSQUARE;}
+"]"		{return RSQUARE;}
+"{"		{return LCURLY;}
+"}"		{return RCURLY;}
+
+"'"		{return APOSTROPHE;}
+"&"		{return AMPERSAND;}
+","		{return COMMA;}
+"."		{return DOT;}
+"="		{return EQ;}
+"<"		{return LT;}
+"+"		{return PLUS;}
+"-"		{return MINUS;}
+"*"		{return MULT;}
+"/"		{return DIV;}
+
+"var"		{return VAR;}
+"is"		{return IS;}
+"unless"	{return UNLESS;}
+"otherwise"	{return OTHERWISE;}
+"until"		{return UNTIL;}
+"do"		{return DO;}
+"done"		{return DONE;}
+"procedure"	{return PROCEDURE;}
+"function"	{return FUNCTION;}
+"return"	{return RETURN;}
+"print"		{return PRINT;}
+"end"		{return END;}
+
+{STRING} {
+	yylval->str = yytext;
+	return STRING;
+}
+
+{DATE_LITERAL} {
+	yylval->num = lex_date(parser, src_loc(*yylloc), yytext);
+	return DATE_LITERAL;
+}
+
+{INT_LITERAL} {
+	yylval->snum = lex_int(parser, src_loc(*yylloc), yytext);
+	return INT_LITERAL;
+}
+
+{IDENT} {
+	yylval->str = yytext;
+	return IDENT;
+}
+
+{FUNC_IDENT} {
+	yylval->str = yytext;
+	return FUNC_IDENT;
+}
+
+{PROC_IDENT} {
+	yylval->str = yytext;
+	return PROC_IDENT;
+}
+
+[[:space:]]+	{/* skip whitespace */}
+
+. {
+	lex_fail(parser, src_loc(*yylloc), "Unexpected token: %s", yytext);
+}
+%%
diff --git a/src/main.c b/src/main.c
new file mode 100644
index 0000000..a40d841
--- /dev/null
+++ b/src/main.c
@@ -0,0 +1,28 @@
+#include <stdio.h>
+#include <posthaste/core.h>
+
+static void usage(FILE *f, const char *pname)
+{
+	fprintf(f, "Usage:\n	%s <filename>\n", pname); /* main() passes argv[0] */
+}
+
+/**
+ * Program entry point of posthaste.
+ * Validates the command line and hands the given file over to run().
+ *
+ * Documenting main feels a bit odd, but doxygen warns when it's
+ * missing.
+ *
+ * @param argc Number of command line arguments.
+ * @param argv Array of command line arguments.
+ * @return \c 0 when successful, non-zero otherwise.
+ */
+int main(int argc, char *argv[])
+{
+	/* exactly one argument, the file to run, is expected */
+	if (argc == 2)
+		return run(argv[1]);
+
+	usage(stderr, argv[0]);
+	return -1;
+}
diff --git a/src/parser.y b/src/parser.y
new file mode 100644
index 0000000..8a4ddb5
--- /dev/null
+++ b/src/parser.y
@@ -0,0 +1,243 @@
+%{
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdint.h>
+
+#include <posthaste/parser.h>
+#include <posthaste/date.h>
+
+#define FOREACH_TOKEN(M) \
+	M(LPAREN)	\
+	M(RPAREN)	\
+	M(LSQUARE)	\
+	M(RSQUARE)	\
+	M(LCURLY)	\
+	M(RCURLY)	\
+	M(APOSTROPHE)	\
+	M(AMPERSAND)	\
+	M(COMMA)	\
+	M(DOT)		\
+	M(EQ)		\
+	M(LT)		\
+	M(PLUS)		\
+	M(MINUS)	\
+	M(MULT)		\
+	M(DIV)		\
+	M(VAR)		\
+	M(IS)		\
+	M(UNLESS)	\
+	M(OTHERWISE)	\
+	M(UNTIL)	\
+	M(DO)		\
+	M(DONE)		\
+	M(PROCEDURE)	\
+	M(FUNCTION)	\
+	M(RETURN)	\
+	M(PRINT)	\
+	M(END)		\
+	M(STRING)	\
+	M(DATE_LITERAL)	\
+	M(INT_LITERAL)	\
+	M(IDENT)	\
+	M(FUNC_IDENT)	\
+	M(PROC_IDENT)
+
+%}
+
+%locations
+
+%define parse.trace
+%define parse.error verbose
+%define api.pure full
+%define lr.type ielr
+
+%lex-param {void *scanner} {struct parser *parser}
+%parse-param {void *scanner} {struct parser* parser}
+
+%union {
+	struct ast_node *node;
+	ph_date_t num;
+	int64_t snum;
+	char *str;
+};
+
+%token <str> STRING
+%token <num> DATE_LITERAL
+%token <snum> INT_LITERAL
+%token <str> IDENT;
+%token <str> FUNC_IDENT;
+%token <str> PROC_IDENT;
+
+%token LPAREN "("
+%token RPAREN ")"
+%token LSQUARE "["
+%token RSQUARE "]"
+%token LCURLY "{"
+%token RCURLY "}"
+
+%token APOSTROPHE "'"
+%token AMPERSAND "&"
+%token COMMA ","
+%token DOT "."
+%token EQ "="
+%token LT "<"
+%token PLUS "+"
+%token MINUS "-"
+%token MULT "*"
+%token DIV "/"
+
+%token VAR "var"
+%token IS "is"
+%token UNLESS "unless"
+%token OTHERWISE "otherwise"
+%token UNTIL "until"
+%token DO "do"
+%token DONE "done"
+%token PROCEDURE "procedure"
+%token FUNCTION "function"
+%token RETURN "return"
+%token PRINT "print"
+%token END "end"
+
+%{
+
+/** Modifies the signature of yylex to fit our parser better. */
+#define YY_DECL int yylex(YYSTYPE *yylval, YYLTYPE *yylloc, \
+	                  void *yyscanner, struct parser *parser)
+
+/**
+ * Declare yylex.
+ *
+ * @param yylval Bison current value.
+ * @param yylloc Bison location info.
+ * @param yyscanner Flex scanner.
+ * @param parser Current parser state.
+ * @return Kind of the next token, \c YYEOF at the end of the input.
+ * More info on yylex() can be found in the flex manual.
+ */
+YY_DECL;
+
+/**
+ * Convert bison location info to our own source location info.
+ *
+ * @param yylloc Bison location info.
+ * @return Internal location info.
+ */
+static struct src_loc src_loc(YYLTYPE yylloc);
+
+/**
+ * Print parsing error.
+ * Automatically called by bison.
+ *
+ * @param yylloc Location of error.
+ * @param lexer Lexer.
+ * @param parser Parser state.
+ * @param msg Message to print.
+ */
+static void yyerror(YYLTYPE *yylloc, void *lexer,
+		struct parser *parser, const char *msg);
+
+%}
+
+%start input;
+%%
+
+input: /* empty */
+
+%%
+
+#include "gen_lexer.inc"
+
+/** Print location and name of a token, plus int, date and identifier values. */
+static void dump_yychar(struct parser *p, int yychar, YYSTYPE yylval, YYLTYPE yylloc)
+{
+	struct src_loc loc = src_loc(yylloc);
+	printf("%s:%d:%d: ", p->fname, loc.first_line, loc.first_col);
+
+#define PRINT_NAME(token) case token: printf(#token " "); break;
+	switch (yychar) {
+	FOREACH_TOKEN(PRINT_NAME);
+	default: printf("Unknown yychar\n"); return;
+	}
+
+	if (yychar == INT_LITERAL) {
+		printf("(%lld)", (long long int)yylval.snum);
+	} else if (yychar == IDENT || yychar == FUNC_IDENT
+	           || yychar == PROC_IDENT) {
+		printf("(%s)", yylval.str);
+	} else if (yychar == DATE_LITERAL) {
+		char date_str[11] = {0};
+		date_to_string(date_str, yylval.num);
+		printf("(%s)", date_str);
+	}
+
+	printf("\n");
+}
+
+/** Debug helper: lex the whole input and print every token found. */
+static void dump_lex(struct parser *p)
+{
+	YYSTYPE yylval;
+	YYLTYPE yylloc = {1, 1, 1, 1};
+
+	/* keep pulling tokens until the lexer reports the end of the file */
+	int yychar = yylex(&yylval, &yylloc, p->lexer, p);
+	for (; yychar != YYEOF; yychar = yylex(&yylval, &yylloc, p->lexer, p))
+		dump_yychar(p, yychar, yylval, yylloc);
+}
+
+static struct src_loc src_loc(YYLTYPE yylloc)
+{
+	return (struct src_loc){
+		.first_line = yylloc.first_line,
+		.last_line = yylloc.last_line,
+		.first_col = yylloc.first_column,
+		.last_col = yylloc.last_column,
+	};
+}
+
+static void yyerror(YYLTYPE *yylloc, void *lexer,
+		struct parser *parser, const char *msg)
+{
+	(void)lexer;
+	struct src_issue issue;
+	issue.loc = src_loc(*yylloc);
+	issue.fname = parser->fname;
+	issue.buf = parser->buf;
+	/* msg is bison's text, not a format; a syntax error fails the parse */
+	src_issue(issue, "%s", msg);
+	parser->failed = true;
+}
+
+struct parser *create_parser(void)
+{
+	return calloc(1, sizeof(struct parser)); /* zeroed; NULL when out of memory */
+}
+
+void destroy_parser(struct parser *p)
+{
+	if (p && p->lexer) yylex_destroy(p->lexer); /* only set up by parse() */
+	free(p);
+}
+
+/** Set up \p p for \p buf and, for now, just dump the tokens found in it. */
+void parse(struct parser *p, const char *fname, const char *buf)
+{
+	p->fname = fname;
+	p->buf = buf;
+	p->comment_nesting = 0;
+	/* without a scanner nothing can be lexed, which counts as failing */
+	p->failed = yylex_init(&p->lexer) != 0;
+	if (p->failed)
+		return;
+
+	yy_scan_string(buf, p->lexer);
+
+	// debugging, remember to reset yy_scan_string once the actual parser
+	// runs
+	dump_lex(p);
+
+	// yyparse(p->lexer, p);
+}
diff --git a/src/source.mk b/src/source.mk
new file mode 100644
index 0000000..6a8d30f
--- /dev/null
+++ b/src/source.mk
@@ -0,0 +1,4 @@
+SRCS			!= echo src/*.c
+
+# rules for gen_parser.c are in scripts/makefile
+POSTHASTE_SOURCES	+= $(SRCS) gen/gen_parser.c
-- 
cgit v1.2.3