From 6c2e51a3f8695cb95d6a4a6859d3f934e28c8f9f Mon Sep 17 00:00:00 2001
From: Kimplul <kimi.h.kuparinen@gmail.com>
Date: Wed, 2 Apr 2025 21:14:23 +0300
Subject: improve build system a bit

---
 scripts/makefile | 13 +++++++++----
 1 file changed, 9 insertions(+), 4 deletions(-)

(limited to 'scripts/makefile')

diff --git a/scripts/makefile b/scripts/makefile
index 3a4f200..bd02c98 100644
--- a/scripts/makefile
+++ b/scripts/makefile
@@ -29,16 +29,21 @@ OBJCOPY		!= [ "$(LLVM)" != "0" ] \
 			&& echo llvm-objcopy \
 			|| echo $(CROSS_COMPILE)objcopy
 
-COMPILER	!= [ "$(LLVM)" != "0" ] \
-			&& echo clang --target="$(CROSS_COMPILE)" \
-			|| echo $(CROSS_COMPILE)gcc
+COMPILER	!= [ -n "$(CROSS_COMPILE)" ]						\
+			&& {								\
+				[ "$(LLVM)" != "0" ]					\
+					&& echo clang --target="$(CROSS_COMPILE)"	\
+					|| echo $(CROSS_COMPILE)gcc			\
+					;						\
+			}								\
+			|| echo $(CC)
 
 
 OBFLAGS		:= -g
 WARNFLAGS	:= -Wall -Wextra
 
 COMPILE_FLAGS	:= $(CFLAGS) $(WARNFLAGS) $(OPTFLAGS) $(LTOFLAGS) \
-		   $(OBFLAGS) $(ASSERTFLAGS) $(DEBUGFLAGS)
+		   $(OBFLAGS) $(DEBUGFLAGS)
 
 INCLUDE_FLAGS	:= -I include
 
-- 
cgit v1.2.3


From 441ddf9277f878b83c8d093def51b27285353fed Mon Sep 17 00:00:00 2001
From: Kimplul <kimi.h.kuparinen@gmail.com>
Date: Thu, 3 Apr 2025 18:26:02 +0300
Subject: fix example compilation

---
 scripts/makefile | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'scripts/makefile')

diff --git a/scripts/makefile b/scripts/makefile
index bd02c98..5178359 100644
--- a/scripts/makefile
+++ b/scripts/makefile
@@ -68,13 +68,13 @@ libejit.a: $(EJIT_OBJS)
 examples: examples/loop examples/fib examples/matrix_mult
 
 examples/matrix_mult: examples/matrix_mult.c libejit.a
-	$(COMPILE_EJIT) examples/matrix_mult.c libejit.a -o $@
+	$(COMPILE_EJIT) examples/matrix_mult.c libejit.a -o $@ -lm
 
 examples/loop: examples/loop.c libejit.a
-	$(COMPILE_EJIT) examples/loop.c libejit.a -o $@
+	$(COMPILE_EJIT) examples/loop.c libejit.a -o $@ -lm
 
 examples/fib: examples/fib.c libejit.a
-	$(COMPILE_EJIT) examples/fib.c libejit.a -o $@
+	$(COMPILE_EJIT) examples/fib.c libejit.a -o $@ -lm
 
 # might lint some common things twice
 .PHONY:
-- 
cgit v1.2.3


From 6824dd4b1ee22184f0e600115db3998924ed39d6 Mon Sep 17 00:00:00 2001
From: Kimplul <kimi.h.kuparinen@gmail.com>
Date: Wed, 9 Apr 2025 19:56:33 +0300
Subject: initial tail call stuff

---
 .gitignore            |   1 +
 deps/lightening       |   2 +-
 examples/fib.c        |   4 +-
 examples/sum.c        |  78 +++++++++++++++++++++++++++++++++++++
 include/ejit/ejit.h   |  12 ++----
 scripts/makefile      |   5 ++-
 src/common.h          |   7 +---
 src/compile/compile.c | 106 ++++++++++++++++++++++++++++++++++++++++++--------
 src/ejit.c            |  79 ++++++++++++++-----------------------
 src/interp.c          |  58 +++++++++++++--------------
 10 files changed, 236 insertions(+), 116 deletions(-)
 create mode 100644 examples/sum.c

(limited to 'scripts/makefile')

diff --git a/.gitignore b/.gitignore
index 93b2293..50343e4 100644
--- a/.gitignore
+++ b/.gitignore
@@ -7,4 +7,5 @@ test-*
 examples/matrix_mult
 examples/loop
 examples/fib
+examples/sum
 examples/*.d
diff --git a/deps/lightening b/deps/lightening
index 8d59c87..3f4127c 160000
--- a/deps/lightening
+++ b/deps/lightening
@@ -1 +1 @@
-Subproject commit 8d59c872edc5a21e7dd78768b291d0c4d8136e48
+Subproject commit 3f4127c3ef16177be55cf6153a206ca4f8a4859f
diff --git a/examples/fib.c b/examples/fib.c
index 999546b..fcb0659 100644
--- a/examples/fib.c
+++ b/examples/fib.c
@@ -20,12 +20,12 @@ struct ejit_func *compile(bool try_jit, bool im_scawed)
 	struct ejit_operand arg[1] = {
 		EJIT_OPERAND_GPR(0, EJIT_INT32)
 	};
-	ejit_calli_i(f, f, 1, arg);
+	ejit_calli(f, f, 1, arg);
 	ejit_retval(f, EJIT_GPR(1)); /* loc 1 contains temp result */
 
 	/* fib(n - 2) */
 	ejit_subi(f, EJIT_GPR(0), EJIT_GPR(0), 1);
-	ejit_calli_i(f, f, 1, arg);
+	ejit_calli(f, f, 1, arg);
 	ejit_retval(f, EJIT_GPR(0)); /* loc 0 now contains second temp result */
 
 	ejit_addr(f, EJIT_GPR(0), EJIT_GPR(0), EJIT_GPR(1)); /* add results */
diff --git a/examples/sum.c b/examples/sum.c
new file mode 100644
index 0000000..cc8f54b
--- /dev/null
+++ b/examples/sum.c
@@ -0,0 +1,78 @@
+#include <stdio.h>
+#include <time.h>
+
+#include "../include/ejit/ejit.h"
+
+/* Emit sum(s, n): yields s when n == 0, otherwise tail calls sum(s + n, n - 1). */
+struct ejit_func *compile(bool try_jit, bool im_scawed)
+{
+	/* formal parameters and tail call arguments use the same two slots */
+	struct ejit_operand formals[2] = {
+		EJIT_OPERAND_GPR(0, EJIT_INT32), /* slot 0: accumulated sum s */
+		EJIT_OPERAND_GPR(1, EJIT_INT32)  /* slot 1: remaining count n */
+	};
+	struct ejit_operand actuals[2] = {
+		EJIT_OPERAND_GPR(0, EJIT_INT32), /* s */
+		EJIT_OPERAND_GPR(1, EJIT_INT32)  /* n */
+	};
+	struct ejit_func *fn = ejit_create_func(EJIT_INT32, 2, formals);
+
+	/* base case: the branch skips the return for as long as n != 0 */
+	struct ejit_reloc nonzero = ejit_bnei(fn, EJIT_GPR(1), 0);
+	ejit_retr(fn, EJIT_GPR(0));
+	struct ejit_label step = ejit_label(fn);
+	ejit_patch(fn, nonzero, step);
+
+	/* recursive case: s += n, n -= 1 */
+	ejit_addr(fn, EJIT_GPR(0), EJIT_GPR(0), EJIT_GPR(1));
+	ejit_subi(fn, EJIT_GPR(1), EJIT_GPR(1), 1);
+
+	/* slot 2 holds the callee, which is this very function */
+	ejit_movi(fn, EJIT_GPR(2), (uintptr_t)fn);
+	ejit_tailr(fn, EJIT_GPR(2), 2, actuals);
+
+	ejit_select_compile_func(fn, 3, 0, EJIT_USE64(uintptr_t), try_jit, im_scawed);
+	return fn;
+}
+
+/* Benchmark driver: times compile_num compilations, then one run of sum(0, loop_num). */
+int main(int argc, char *argv[])
+{
+	if(argc != 4){
+		fprintf(stderr, "Usage: %s compile_num loop_num jit\n", argv[0]);
+		return -1;
+	}
+
+	int jit_level = strtoull(argv[3], 0, 0);
+	size_t compile_num = strtoull(argv[1], 0, 0);
+	/* info[0] gets executed below, so zero compilations cannot be allowed */
+	struct ejit_func **info = compile_num ? calloc(compile_num, sizeof(*info)) : NULL;
+	if(!info){
+		fprintf(stderr, "compile_num must be nonzero (or out of memory)\n");
+		return -1;
+	}
+
+	clock_t t = clock();
+	for(size_t i = 0; i < compile_num; ++i)
+		info[i] = compile(jit_level > 0, jit_level > 1);
+	t = clock() - t;
+	double compile_time_total = ((double)t) / CLOCKS_PER_SEC;
+	printf("Compilation for n = %zu took %fs (1/%f).\n", compile_num,
+			compile_time_total, compile_time_total / compile_num);
+
+	size_t run_num = strtoull(argv[2], 0, 0);
+	t = clock();
+	struct ejit_arg args[2] = {
+		(struct ejit_arg){.type = EJIT_INT32, .l = 0}, /* s */
+		(struct ejit_arg){.type = EJIT_INT32, .l = run_num} /* n */
+	};
+	int32_t result = ejit_run_func_i(info[0], 2, args);
+	t = clock() - t;
+	printf("Running loop for n = %zu took %fs with res %ld\n",
+			run_num, ((double)t) / CLOCKS_PER_SEC, (long)result);
+
+	for(size_t i = 0; i < compile_num; ++i)
+		ejit_destroy_func(info[i]);
+	free(info);
+	return 0;
+}
diff --git a/include/ejit/ejit.h b/include/ejit/ejit.h
index 965103c..aa42eca 100644
--- a/include/ejit/ejit.h
+++ b/include/ejit/ejit.h
@@ -454,18 +454,12 @@ typedef double (*ejit_escape_d_t)(size_t argc, const struct ejit_arg args[argc])
 
 struct ejit_label ejit_label(struct ejit_func *s);
 
-void ejit_calli_i(struct ejit_func *s, struct ejit_func *f, size_t argc,
-                const struct ejit_operand args[argc]);
+void ejit_tailr(struct ejit_func *s, struct ejit_gpr target,
+		size_t argc, const struct ejit_operand args[argc]);
 
-void ejit_calli_l(struct ejit_func *s, struct ejit_func *f, size_t argc,
+void ejit_calli(struct ejit_func *s, struct ejit_func *f, size_t argc,
                 const struct ejit_operand args[argc]);
 
-void ejit_calli_f(struct ejit_func *s, struct ejit_func *f, size_t argc,
-                  const struct ejit_operand args[argc]);
-
-void ejit_calli_d(struct ejit_func *s, struct ejit_func *f, size_t argc,
-                  const struct ejit_operand args[argc]);
-
 void ejit_escapei_i(struct ejit_func *s, ejit_escape_i_t f, size_t argc,
                   const struct ejit_operand args[argc]);
 
diff --git a/scripts/makefile b/scripts/makefile
index 5178359..dbb7a1c 100644
--- a/scripts/makefile
+++ b/scripts/makefile
@@ -65,7 +65,7 @@ COMPILE_EJIT	= $(COMPILE) $(EJIT_FLAGS)
 libejit.a: $(EJIT_OBJS)
 	$(CROSS_COMPILE)ar rcs libejit.a $(EJIT_OBJS)
 
-examples: examples/loop examples/fib examples/matrix_mult
+examples: examples/loop examples/fib examples/sum examples/matrix_mult
 
 examples/matrix_mult: examples/matrix_mult.c libejit.a
 	$(COMPILE_EJIT) examples/matrix_mult.c libejit.a -o $@ -lm
@@ -76,6 +76,9 @@ examples/loop: examples/loop.c libejit.a
 examples/fib: examples/fib.c libejit.a
 	$(COMPILE_EJIT) examples/fib.c libejit.a -o $@ -lm
 
+examples/sum: examples/sum.c libejit.a
+	$(COMPILE_EJIT) examples/sum.c libejit.a -o $@ -lm
+
 # might lint some common things twice
 .PHONY:
 lint: $(TRISCV_LINTS)
diff --git a/src/common.h b/src/common.h
index 69f1441..3512717 100644
--- a/src/common.h
+++ b/src/common.h
@@ -218,10 +218,8 @@ enum ejit_opcode {
 	EJIT_OP_ESCAPEI_F,
 	EJIT_OP_ESCAPEI_D,
 
-	EJIT_OP_CALLI_I,
-	EJIT_OP_CALLI_L,
-	EJIT_OP_CALLI_F,
-	EJIT_OP_CALLI_D,
+	EJIT_OP_CALLI,
+	EJIT_OP_TAILR,
 
 	EJIT_OP_RETR,
 	EJIT_OP_RETI,
@@ -308,7 +306,6 @@ union interp_ret {
 
 union interp_ret ejit_run(struct ejit_func *f, size_t argc,
                      struct ejit_arg args[argc],
-		     bool run,
 		     void ***labels_wb);
 
 bool ejit_compile(struct ejit_func *f, bool use_64, bool im_scawed);
diff --git a/src/compile/compile.c b/src/compile/compile.c
index dcf662b..54d79f2 100644
--- a/src/compile/compile.c
+++ b/src/compile/compile.c
@@ -22,22 +22,22 @@ struct reloc_helper {
 /* skip assertions since we know they must be valid due to type checking earlier */
 static long checked_run_i(struct ejit_func *f, size_t argc, struct ejit_arg args[argc])
 {
-	return ejit_run(f, argc, args, true, NULL).i;
+	return ejit_run(f, argc, args, NULL).i;
 }
 
 static int64_t checked_run_l(struct ejit_func *f, size_t argc, struct ejit_arg args[argc])
 {
-	return ejit_run(f, argc, args, true, NULL).i;
+	return ejit_run(f, argc, args, NULL).i;
 }
 
 static float checked_run_f(struct ejit_func *f, size_t argc, struct ejit_arg args[argc])
 {
-	return ejit_run(f, argc, args, true, NULL).f;
+	return ejit_run(f, argc, args, NULL).f;
 }
 
 static double checked_run_d(struct ejit_func *f, size_t argc, struct ejit_arg args[argc])
 {
-	return ejit_run(f, argc, args, true, NULL).f;
+	return ejit_run(f, argc, args, NULL).f;
 }
 
 static void *alloc_arena(size_t size, bool im_scawed)
@@ -47,6 +47,11 @@ static void *alloc_arena(size_t size, bool im_scawed)
 	            MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
 }
 
+static void assert_helper(const char *msg) /* called from JIT code in DEBUG builds (see EJIT_OP_TAILR) */
+{
+	assert(false && msg); /* unconditional failure; NOTE(review): the text of msg is not printed, and this is a no-op under NDEBUG */
+}
+
 static void free_arena(void *arena, size_t size)
 {
 	munmap(arena, size);
@@ -2042,8 +2047,6 @@ static size_t compile_fn_body(struct ejit_func *f, jit_state_t *j, void *arena,
 	struct addrs addrs = addrs_create();
 	addrs_reserve(&addrs, insns_len(&f->insns));
 
-	void *call = NULL;
-
 	size_t label = 0;
 	foreach_vec(ii, f->insns) {
 		/* if we've hit a label, add it to our vector of label addresses */
@@ -2502,21 +2505,64 @@ static size_t compile_fn_body(struct ejit_func *f, jit_state_t *j, void *arena,
 			break;
 		}
 
-		case EJIT_OP_CALLI_L:
-#if __WORDSIZE == 64
-			 call = checked_run_l; goto calli;
-#else
-			  assert(0 && "trying to compile calli_l on 32bit arch");
-			  break;
+		case EJIT_OP_TAILR: {
+			/* this is admittedly a slightly roundabout way of
+			 * implementing tail calls and is arguably not the most
+			 * performant way (if it works at all, heh) but for now
+			 * I'm more interested in functionality than raw
+			 * performance. Currently only supports two gpr
+			 * registers, but should be fairly easy to extend with
+			 * fprs as well */
+
+			assert(operands_len(&direct) <= 2);
+			jit_gpr_t r = getloc(f, j, i.r1, 0);
+			jit_ldxi(j, JIT_R0, r, offsetof(struct ejit_func, direct_call));
+#if defined(DEBUG)
+			jit_reloc_t assert_reloc = jit_bnei(j, JIT_R0, 0); /* null */
+			jit_calli_1(j, assert_helper,
+					jit_operand_imm(JIT_OPERAND_ABI_POINTER,
+						(jit_imm_t)"trying to tail call interpreted function"));
+			jit_patch_here(j, assert_reloc);
 #endif
+			jit_operand_t regs[2] = {
+				jit_operand_gpr(JIT_OPERAND_ABI_WORD, JIT_R1),
+				jit_operand_gpr(JIT_OPERAND_ABI_WORD, JIT_R2)
+			};
+			jit_move_operands(j, regs, direct.buf, operands_len(&direct));
+
+			/* with args safely in registers, reset stack/state
+			 * while avoiding overwriting the call target */
+			jit_gpr_t tmp = get_callr_temp(j);
+			jit_movr(j, tmp, JIT_R0);
+
+			int frame_size = j->frame_size;
+			jit_shrink_stack(j, stack);
+			jit_leave_jit_abi(j, gprs, fprs, frame);
+
+			/* now move args into place */
+			jit_operand_t args[2] = {};
+			foreach_vec(oi, direct) {
+				args[oi] = *operands_at(&direct, oi);
+			}
 
-		case EJIT_OP_CALLI_F: { call = checked_run_f; goto calli; }
-		case EJIT_OP_CALLI_D: { call = checked_run_d; goto calli; }
-		case EJIT_OP_CALLI_I: { call = checked_run_i; goto calli;
-calli:
+			jit_locate_args(j, operands_len(&direct), args);
+			jit_move_operands(j, args, regs, operands_len(&direct));
+			jit_jmpr(j, tmp);
+			j->frame_size = frame_size;
+
+			operands_reset(&src);
+			operands_reset(&dst);
+			operands_reset(&direct);
+			break;
+		}
+
+		case EJIT_OP_CALLI: {
 			save_caller_save_regs(f, j);
 
 			struct ejit_func *f = (struct ejit_func *)(uintptr_t)i.o;
+#if __WORDSIZE != 64
+			assert(f->rtype != EJIT_INT64 && f->rtype != EJIT_UINT64);
+#endif
 			if (f && f->direct_call) {
 				jit_calli(j, f->direct_call, operands_len(&direct), direct.buf);
 				restore_caller_save_regs(f, j);
@@ -2535,6 +2581,16 @@ calli:
 				 * argument stack address */
 				jit_operand_gpr(JIT_OPERAND_ABI_POINTER, JIT_R0)
 			};
+
+			void *call = NULL;
+			switch (f->rtype) {
+			case EJIT_INT64:
+			case EJIT_UINT64: call = checked_run_l; break;
+			case EJIT_FLOAT: call = checked_run_f; break;
+			case EJIT_DOUBLE: call = checked_run_d; break;
+			default: call = checked_run_i; break;
+			}
+
 			compile_imm_call(j, &src, &dst, call, 3, args);
 			restore_caller_save_regs(f, j);
 
@@ -2552,39 +2608,55 @@ calli:
 			jit_gpr_t r = getloc(f, j, i.r1, 0);
 			/* R0 won't get overwritten by jit_leave_jit_abi */
 			jit_movr(j, JIT_R0, r);
+
+			/* keep track of frame size so we can continue
+			 * generating code after 'leaving' the ABI. Bit of a
+			 * hack, should maybe codify this better in the
+			 * lightening API? */
+			int frame_size = j->frame_size;
 			jit_shrink_stack(j, stack);
 			jit_leave_jit_abi(j, gprs, fprs, frame);
 			jit_retr(j, JIT_R0);
+			j->frame_size = frame_size;
 			break;
 		}
 
 		case EJIT_OP_RETR_F: {
 			jit_fpr_t r = getloc_f(f, j, i.r1, 0);
 			jit_movr_f(j, JIT_F0, r);
+
+			int frame_size = j->frame_size;
 			jit_shrink_stack(j, stack);
 			jit_leave_jit_abi(j, gprs, fprs, frame);
 			jit_retr_f(j, JIT_F0);
+			j->frame_size = frame_size;
 			break;
 		}
 
 		case EJIT_OP_RETR_D: {
 			jit_fpr_t r = getloc_d(f, j, i.r1, 0);
 			jit_movr_d(j, JIT_F0, r);
+
+			int frame_size = j->frame_size;
 			jit_shrink_stack(j, stack);
 			jit_leave_jit_abi(j, gprs, fprs, frame);
 			jit_retr_d(j, JIT_F0);
+			j->frame_size = frame_size;
 			break;
 		}
 
 		case EJIT_OP_RETI: {
+			int frame_size = j->frame_size;
 			jit_shrink_stack(j, stack);
 			jit_leave_jit_abi(j, gprs, fprs, frame);
 			jit_reti(j, i.o);
+			j->frame_size = frame_size;
 			break;
 		}
 
 		case EJIT_OP_END: {
-			/* 'void' return */
+			/* 'void' return, must be last thing in function so no
+			 * need to keep track of frame size */
 			jit_shrink_stack(j, stack);
 			jit_leave_jit_abi(j, gprs, fprs, frame);
 			jit_reti(j, 0);
diff --git a/src/ejit.c b/src/ejit.c
index 571a274..059d5d4 100644
--- a/src/ejit.c
+++ b/src/ejit.c
@@ -414,7 +414,7 @@ void ejit_select_compile_func(struct ejit_func *f, size_t gpr, size_t fpr,
 
 	void **labels;
 	/* just get labels, don't actually run anything yet */
-	ejit_run(f, 0, NULL, false, &labels);
+	ejit_run(f, 0, NULL, &labels);
 
 	foreach_vec(ii, f->insns) {
 		struct ejit_insn i = *insns_at(&f->insns, ii);
@@ -456,67 +456,48 @@ void ejit_patch(struct ejit_func *f, struct ejit_reloc r, struct ejit_label l)
 	*insns_at(&f->insns, r.insn) = i;
 }
 
-void ejit_calli_i(struct ejit_func *s, struct ejit_func *f, size_t argc,
+void ejit_tailr(struct ejit_func *s, struct ejit_gpr target, size_t argc,
                 const struct ejit_operand args[argc])
 {
 	s->max_args = argc > s->max_args ? argc : s->max_args;
-	check_operands(f, argc, args);
 
+	/** @todo check that gpr_args <= 2 and fpr_args <= 3 (?) */
+	size_t gpr_args = 0, fpr_args = 0;
 	for (size_t i = 0; i < argc; ++i) {
 		switch (args[i].kind) {
-		case EJIT_OPERAND_GPR: emit_insn_ar(s, EJIT_OP_ARG, i, args[i].type, EJIT_GPR(args[i].r)); break;
-		case EJIT_OPERAND_FPR: emit_insn_af(s, EJIT_OP_ARG_F, i, args[i].type, EJIT_FPR(args[i].r)); break;
-		case EJIT_OPERAND_IMM: emit_insn_ai(s, EJIT_OP_ARG_I, i, args[i].type, args[i].r); break;
-		case EJIT_OPERAND_FLT: emit_insn_ad(s, EJIT_OP_ARG_FI, i, args[i].type, args[i].d); break;
-		default: abort();
-		}
-	}
-
-	emit_insn_op(s, EJIT_OP_CALLI_I, f);
-}
-
-void ejit_calli_l(struct ejit_func *s, struct ejit_func *f, size_t argc,
-                const struct ejit_operand args[argc])
-{
-	s->use_64 = true;
-	s->max_args = argc > s->max_args ? argc : s->max_args;
-	check_operands(f, argc, args);
+		case EJIT_OPERAND_GPR:
+			gpr_args++;
+			emit_insn_ar(s, EJIT_OP_ARG, i, args[i].type, EJIT_GPR(args[i].r));
+			break;
 
-	for (size_t i = 0; i < argc; ++i) {
-		switch (args[i].kind) {
-		case EJIT_OPERAND_GPR: emit_insn_ar(s, EJIT_OP_ARG, i, args[i].type, EJIT_GPR(args[i].r)); break;
-		case EJIT_OPERAND_FPR: emit_insn_af(s, EJIT_OP_ARG_F, i, args[i].type, EJIT_FPR(args[i].r)); break;
-		case EJIT_OPERAND_IMM: emit_insn_ai(s, EJIT_OP_ARG_I, i, args[i].type, args[i].r); break;
-		case EJIT_OPERAND_FLT: emit_insn_ad(s, EJIT_OP_ARG_FI, i, args[i].type, args[i].d); break;
-		default: abort();
-		}
-	}
+		case EJIT_OPERAND_FPR:
+			fpr_args++;
+			emit_insn_af(s, EJIT_OP_ARG_F, i, args[i].type, EJIT_FPR(args[i].r));
+			break;
 
-	emit_insn_op(s, EJIT_OP_CALLI_L, f);
-}
+		case EJIT_OPERAND_IMM:
+			gpr_args++;
+			emit_insn_ai(s, EJIT_OP_ARG_I, i, args[i].type, args[i].r);
+			break;
 
-void ejit_calli_f(struct ejit_func *s, struct ejit_func *f, size_t argc,
-                const struct ejit_operand args[argc])
-{
-	s->max_args = argc > s->max_args ? argc : s->max_args;
-	check_operands(f, argc, args);
+		case EJIT_OPERAND_FLT:
+			fpr_args++;
+			emit_insn_ad(s, EJIT_OP_ARG_FI, i, args[i].type, args[i].d);
+			break;
 
-	for (size_t i = 0; i < argc; ++i) {
-		switch (args[i].kind) {
-		case EJIT_OPERAND_GPR: emit_insn_ar(s, EJIT_OP_ARG, i, args[i].type, EJIT_GPR(args[i].r)); break;
-		case EJIT_OPERAND_FPR: emit_insn_af(s, EJIT_OP_ARG_F, i, args[i].type, EJIT_FPR(args[i].r)); break;
-		case EJIT_OPERAND_IMM: emit_insn_ai(s, EJIT_OP_ARG_I, i, args[i].type, args[i].r); break;
-		case EJIT_OPERAND_FLT: emit_insn_ad(s, EJIT_OP_ARG_FI, i, args[i].type, args[i].d); break;
 		default: abort();
 		}
 	}
 
-	emit_insn_op(s, EJIT_OP_CALLI_F, f);
+	assert(gpr_args <= 2 && fpr_args == 0
+			&& "only 2 gpr args and 0 fpr args supported in tail calls for now");
+	emit_insn_oxr(s, EJIT_OP_TAILR, target);
 }
 
-void ejit_calli_d(struct ejit_func *s, struct ejit_func *f, size_t argc,
+void ejit_calli(struct ejit_func *s, struct ejit_func *f, size_t argc,
                 const struct ejit_operand args[argc])
 {
+	s->use_64 = f->rtype == EJIT_INT64 || f->rtype == EJIT_UINT64;
 	s->max_args = argc > s->max_args ? argc : s->max_args;
 	check_operands(f, argc, args);
 
@@ -530,7 +511,7 @@ void ejit_calli_d(struct ejit_func *s, struct ejit_func *f, size_t argc,
 		}
 	}
 
-	emit_insn_op(s, EJIT_OP_CALLI_D, f);
+	emit_insn_op(s, EJIT_OP_CALLI, f);
 }
 
 void ejit_escapei_i(struct ejit_func *s, ejit_escape_i_t f, size_t argc,
@@ -1712,7 +1693,7 @@ long ejit_run_func_i(struct ejit_func *f, size_t argc,
 #endif
 		);
 
-	return ejit_run(f, argc, args, true, NULL).i;
+	return ejit_run(f, argc, args, NULL).i;
 }
 
 int64_t ejit_run_func_l(struct ejit_func *f, size_t argc,
@@ -1720,21 +1701,21 @@ int64_t ejit_run_func_l(struct ejit_func *f, size_t argc,
 {
 	check_args(f, argc, args);
 	assert(f->rtype == EJIT_INT64 || f->rtype == EJIT_UINT64);
-	return ejit_run(f, argc, args, true, NULL).i;
+	return ejit_run(f, argc, args, NULL).i;
 }
 
 float ejit_run_func_f(struct ejit_func *f, size_t argc, struct ejit_arg args[argc])
 {
 	check_args(f, argc, args);
 	assert(f->rtype == EJIT_FLOAT);
-	return ejit_run(f, argc, args, true, NULL).f;
+	return ejit_run(f, argc, args, NULL).f;
 }
 
 double ejit_run_func_d(struct ejit_func *f, size_t argc, struct ejit_arg args[argc])
 {
 	check_args(f, argc, args);
 	assert(f->rtype == EJIT_DOUBLE);
-	return ejit_run(f, argc, args, true, NULL).f;
+	return ejit_run(f, argc, args, NULL).f;
 }
 
 struct ejit_arg ejit_run_func(struct ejit_func *f, size_t argc, struct ejit_arg args[argc])
diff --git a/src/interp.c b/src/interp.c
index 6ef414d..132ba4a 100644
--- a/src/interp.c
+++ b/src/interp.c
@@ -5,7 +5,7 @@
 /* this is the body of a given ejit_interp function, it assumes there's an
  * external int64_t retval and double retval_f into which it places the value to
  * be returned. Included from src/interp.c */
-union interp_ret ejit_run(struct ejit_func *f, size_t paramc, struct ejit_arg params[paramc], bool run, void ***labels_wb)
+union interp_ret ejit_run(struct ejit_func *f, size_t paramc, struct ejit_arg params[paramc], void ***labels_wb)
 {
 	static void *labels[EJIT_OPCODE_COUNT] = {
 		[EJIT_OP_MOVI] = &&MOVI,
@@ -213,10 +213,9 @@ union interp_ret ejit_run(struct ejit_func *f, size_t paramc, struct ejit_arg pa
 		[EJIT_OP_PARAM] = &&PARAM,
 		[EJIT_OP_PARAM_F] = &&PARAM_F,
 
-		[EJIT_OP_CALLI_I] = &&CALLI_I,
-		[EJIT_OP_CALLI_L] = &&CALLI_L,
-		[EJIT_OP_CALLI_F] = &&CALLI_F,
-		[EJIT_OP_CALLI_D] = &&CALLI_D,
+		[EJIT_OP_CALLI] = &&CALLI,
+		[EJIT_OP_TAILR] = &&TAILR,
+
 		[EJIT_OP_ESCAPEI_I] = &&ESCAPEI_I,
 		[EJIT_OP_ESCAPEI_F] = &&ESCAPEI_F,
 		[EJIT_OP_ESCAPEI_L] = &&ESCAPEI_L,
@@ -226,13 +225,12 @@ union interp_ret ejit_run(struct ejit_func *f, size_t paramc, struct ejit_arg pa
 		[EJIT_OP_END] = &&END,
 	};
 
-	if (!run) {
+	if (labels_wb) {
 		*labels_wb = labels;
 		return (union interp_ret){.i = 0};
 	}
 
 	assert(f->size && "trying to run a function that hasn't been compiled");
-
 	if (f->extern_call) {
 		if (f->rtype == EJIT_INT64 || f->rtype == EJIT_UINT64)
 			return (union interp_ret){
@@ -254,8 +252,8 @@ union interp_ret ejit_run(struct ejit_func *f, size_t paramc, struct ejit_arg pa
 		};
 	}
 
-	int64_t retval = 0; double retval_f = 0.0;
-
+top:
+	union interp_ret retval = {.i = 0};
 	union fpr {
 		double d;
 		float f;
@@ -994,15 +992,15 @@ union interp_ret ejit_run(struct ejit_func *f, size_t paramc, struct ejit_arg pa
 	DISPATCH();
 
 	DO(RETVAL);
-	gpr[i.r0] = retval;
+	gpr[i.r0] = retval.i;
 	DISPATCH();
 
 	DO(RETVAL_F);
-	fpr[i.r0].f = retval_f;
+	fpr[i.r0].f = retval.f;
 	DISPATCH();
 
 	DO(RETVAL_D);
-	fpr[i.r0].d = retval_f;
+	fpr[i.r0].d = retval.f;
 	DISPATCH();
 
 	DO(PARAM);
@@ -1058,51 +1056,47 @@ union interp_ret ejit_run(struct ejit_func *f, size_t paramc, struct ejit_arg pa
 	args[argc++] = a;
 	DISPATCH();
 
-	DO(CALLI_I);
-	struct ejit_func *f = i.p;
-	retval = ejit_run(f, argc, args, true, NULL).i;
-	argc = 0;
-	DISPATCH();
+	DO(TAILR);
+	f = (struct ejit_func *)gpr[i.r1];
 
-	DO(CALLI_L);
-	struct ejit_func *f = i.p;
-	retval = ejit_run(f, argc, args, true, NULL).i;
-	argc = 0;
-	DISPATCH();
+	/** @todo we could potentially just interpret the func as a fallback
+	 * instead of aborting here, but this is good enough for now */
+	assert(!f->direct_call && "trying to interpret compiled fun");
 
-	DO(CALLI_F);
-	struct ejit_func *f = i.p;
-	retval_f = ejit_run(f, argc, args, true, NULL).f;
-	argc = 0;
+	paramc = argc;
+	for (size_t i = 0; i < argc; ++i)
+		params[i] = args[i];
+
+	goto top;
 	DISPATCH();
 
-	DO(CALLI_D);
+	DO(CALLI);
 	struct ejit_func *f = i.p;
-	retval_f = ejit_run(f, argc, args, true, NULL).f;
+	retval = ejit_run(f, argc, args, NULL);
 	argc = 0;
 	DISPATCH();
 
 	DO(ESCAPEI_I);
 	ejit_escape_i_t f = i.p;
-	retval = f(argc, args);
+	retval.i = f(argc, args);
 	argc = 0;
 	DISPATCH();
 
 	DO(ESCAPEI_L);
 	ejit_escape_l_t f = i.p;
-	retval = f(argc, args);
+	retval.i = f(argc, args);
 	argc = 0;
 	DISPATCH();
 
 	DO(ESCAPEI_F);
 	ejit_escape_f_t f = i.p;
-	retval_f = f(argc, args);
+	retval.f = f(argc, args);
 	argc = 0;
 	DISPATCH();
 
 	DO(ESCAPEI_D);
 	ejit_escape_d_t f = i.p;
-	retval_f = f(argc, args);
+	retval.f = f(argc, args);
 	argc = 0;
 	DISPATCH();
 
-- 
cgit v1.2.3