From 449c23ebc32eb45f8a6360c472bfbf5db6cd132f Mon Sep 17 00:00:00 2001
From: Kimplul <kimi.h.kuparinen@gmail.com>
Date: Tue, 1 Apr 2025 20:39:42 +0300
Subject: aarch64 linux seems to work

---
 scripts/select-compile | 1 +
 1 file changed, 1 insertion(+)

(limited to 'scripts')

diff --git a/scripts/select-compile b/scripts/select-compile
index fca7350..9eff9fd 100755
--- a/scripts/select-compile
+++ b/scripts/select-compile
@@ -9,6 +9,7 @@ JIT="src/compile/compile.c"
 NOJIT="src/compile/nocompile.c"
 
 case "$ARCH" in
+	aarch64) echo "$JIT" ;;
 	amd64) echo "$JIT" ;;
 	x86*) echo "$JIT" ;;
 	*) echo "$NOJIT" ;;
-- 
cgit v1.2.3


From 4135845b93d5c0eab23ad5da526b03a911878d67 Mon Sep 17 00:00:00 2001
From: Kimplul <kimi.h.kuparinen@gmail.com>
Date: Tue, 1 Apr 2025 22:03:14 +0300
Subject: mipsel seems to work

---
 scripts/select-compile | 1 +
 1 file changed, 1 insertion(+)

(limited to 'scripts')

diff --git a/scripts/select-compile b/scripts/select-compile
index 9eff9fd..878e214 100755
--- a/scripts/select-compile
+++ b/scripts/select-compile
@@ -10,6 +10,7 @@ NOJIT="src/compile/nocompile.c"
 
 case "$ARCH" in
 	aarch64) echo "$JIT" ;;
+	mipsel) echo "$JIT" ;;
 	amd64) echo "$JIT" ;;
 	x86*) echo "$JIT" ;;
 	*) echo "$NOJIT" ;;
-- 
cgit v1.2.3


From 478c92b425eca53a0d884fb8f5dea8d769016858 Mon Sep 17 00:00:00 2001
From: Kimplul <kimi.h.kuparinen@gmail.com>
Date: Tue, 1 Apr 2025 22:16:25 +0300
Subject: expose sqrt

+ Requires linking with libm in some cases, which is fine I suppose, but
  kind of annoying
---
 include/ejit/ejit.h   |  3 +++
 scripts/gen-tests     |  2 +-
 src/common.h          |  3 +++
 src/compile/compile.c | 20 ++++++++++++++++++++
 src/ejit.c            | 10 ++++++++++
 src/interp.c          | 11 +++++++++++
 tests/sqrtr_d.c       | 23 +++++++++++++++++++++++
 tests/sqrtr_f.c       | 23 +++++++++++++++++++++++
 8 files changed, 94 insertions(+), 1 deletion(-)
 create mode 100644 tests/sqrtr_d.c
 create mode 100644 tests/sqrtr_f.c

(limited to 'scripts')

diff --git a/include/ejit/ejit.h b/include/ejit/ejit.h
index d4bb725..920fdc5 100644
--- a/include/ejit/ejit.h
+++ b/include/ejit/ejit.h
@@ -805,6 +805,9 @@ void ejit_truncr_d_32(struct ejit_func *s, struct ejit_gpr r0,
 void ejit_truncr_d_64(struct ejit_func *s, struct ejit_gpr r0,
                       struct ejit_fpr r1);
 
+void ejit_sqrtr_f(struct ejit_func *s, struct ejit_fpr r0, struct ejit_fpr r1);
+void ejit_sqrtr_d(struct ejit_func *s, struct ejit_fpr r0, struct ejit_fpr r1);
+
 struct ejit_reloc ejit_bltr(struct ejit_func *s, struct ejit_gpr r0,
                             struct ejit_gpr r1);
 struct ejit_reloc ejit_bner(struct ejit_func *s, struct ejit_gpr r0,
diff --git a/scripts/gen-tests b/scripts/gen-tests
index 5521006..deac247 100755
--- a/scripts/gen-tests
+++ b/scripts/gen-tests
@@ -11,5 +11,5 @@ do
 	echo "${dep}:"						>> tests.mk
 	echo "-include ${dep}"					>> tests.mk
 	echo "${exe}: ${s} libejit.a"				>> tests.mk
-	echo "	\$(COMPILE_TEST) ${s} libejit.a -o ${exe}"	>> tests.mk
+	echo "	\$(COMPILE_TEST) ${s} libejit.a -o ${exe} -lm"	>> tests.mk
 done
diff --git a/src/common.h b/src/common.h
index 6a3c754..c690f8f 100644
--- a/src/common.h
+++ b/src/common.h
@@ -142,6 +142,9 @@ enum ejit_opcode {
 	EJIT_OP_TRUNCR_F_32,
 	EJIT_OP_TRUNCR_F_64,
 
+	EJIT_OP_SQRTR_F,
+	EJIT_OP_SQRTR_D,
+
 	EJIT_OP_EQR,
 	EJIT_OP_NER,
 	EJIT_OP_GTR,
diff --git a/src/compile/compile.c b/src/compile/compile.c
index b90ee54..580b7fa 100644
--- a/src/compile/compile.c
+++ b/src/compile/compile.c
@@ -1326,6 +1326,23 @@ static void compile_truncr_f_32(struct ejit_func *f, jit_state_t *j,
 #endif
 }
 
+static void compile_sqrtr_f(struct ejit_func *f, jit_state_t *j,
+		struct ejit_insn i)
+{
+	jit_fpr_t r0 = getfpr(f, i.r0, 0); /* register that receives the result */
+	jit_fpr_t r1 = getloc_f(f, j, i.r1, 1); /* float source operand */
+	jit_sqrtr_f(j, r0, r1); /* r0 = sqrt(r1), single precision */
+	putloc_f(f, j, i.r0, r0); /* float-width store, matching getloc_f/jit_sqrtr_f */
+}
+
+static void compile_sqrtr_d(struct ejit_func *f, jit_state_t *j,
+		struct ejit_insn i)
+{
+	jit_fpr_t dst = getfpr(f, i.r0, 0); /* register that receives the result */
+	jit_fpr_t src = getloc_d(f, j, i.r1, 1); /* double source operand */
+	jit_sqrtr_d(j, dst, src); /* dst = sqrt(src), double precision */
+	putloc_d(f, j, i.r0, dst); /* write the result back to location i.r0 */
+}
 
 static void compile_reg_cmp(struct ejit_func *f, jit_state_t *j,
                             struct ejit_insn i,
@@ -2084,6 +2101,9 @@ static size_t compile_fn_body(struct ejit_func *f, jit_state_t *j, void *arena,
 		case EJIT_OP_TRUNCR_F_32: compile_truncr_f_32(f, j, i); break;
 		case EJIT_OP_TRUNCR_F_64: compile_truncr_f_64(f, j, i); break;
 
+		case EJIT_OP_SQRTR_F: compile_sqrtr_f(f, j, i); break;
+		case EJIT_OP_SQRTR_D: compile_sqrtr_d(f, j, i); break;
+
 		case EJIT_OP_EQR: compile_eqr(f, j, i); break;
 		case EJIT_OP_EQR_F: compile_eqr_f(f, j, i); break;
 		case EJIT_OP_EQR_D: compile_eqr_d(f, j, i); break;
diff --git a/src/ejit.c b/src/ejit.c
index e7e2ff2..2224198 100644
--- a/src/ejit.c
+++ b/src/ejit.c
@@ -1371,6 +1371,16 @@ void ejit_truncr_f_64(struct ejit_func *s, struct ejit_gpr r0,
 	emit_insn_orf(s, EJIT_OP_TRUNCR_F_64, r0, f1);
 }
 
+void ejit_sqrtr_f(struct ejit_func *s, struct ejit_fpr r0, struct ejit_fpr r1)
+{
+	emit_insn_off(s, EJIT_OP_SQRTR_F, r0, r1); /* float sqrt: r0 = sqrt(r1) */
+}
+
+void ejit_sqrtr_d(struct ejit_func *s, struct ejit_fpr r0, struct ejit_fpr r1)
+{
+	emit_insn_off(s, EJIT_OP_SQRTR_D, r0, r1); /* double sqrt: r0 = sqrt(r1) */
+}
+
 struct ejit_reloc ejit_bner(struct ejit_func *s, struct ejit_gpr r0,
                             struct ejit_gpr r1)
 {
diff --git a/src/interp.c b/src/interp.c
index b858f26..2d9b7c7 100644
--- a/src/interp.c
+++ b/src/interp.c
@@ -147,6 +147,9 @@ union interp_ret ejit_run(struct ejit_func *f, size_t paramc, struct ejit_arg pa
 		[EJIT_OP_TRUNCR_F_32] = &&TRUNCR_F_32,
 		[EJIT_OP_TRUNCR_F_64] = &&TRUNCR_F_64,
 
+		[EJIT_OP_SQRTR_F] = &&SQRTR_F,
+		[EJIT_OP_SQRTR_D] = &&SQRTR_D,
+
 		[EJIT_OP_BNER] = &&BNER,
 		[EJIT_OP_BNEI] = &&BNEI,
 		[EJIT_OP_BNER_F] = &&BNER_F,
@@ -784,6 +787,14 @@ union interp_ret ejit_run(struct ejit_func *f, size_t paramc, struct ejit_arg pa
 	gpr[i.r0] = (int64_t)fpr[i.r1].f;
 	DISPATCH();
 
+	DO(SQRTR_F);
+	fpr[i.r0].f = sqrt(fpr[i.r1].f);
+	DISPATCH();
+
+	DO(SQRTR_D);
+	fpr[i.r0].d = sqrt(fpr[i.r1].d);
+	DISPATCH();
+
 	DO(BNER);
 	if (gpr[i.r1] != gpr[i.r2])
 		JUMP(i.r0);
diff --git a/tests/sqrtr_d.c b/tests/sqrtr_d.c
new file mode 100644
index 0000000..06e7894
--- /dev/null
+++ b/tests/sqrtr_d.c
@@ -0,0 +1,23 @@
+#include <ejit/ejit.h>
+#include <assert.h>
+#include "do_jit.h"
+
+int main(int argc, char *argv[])
+{
+        (void)argv;
+        bool do_jit = argc > 1; /* any extra command line argument turns this on */
+        struct ejit_operand operands[1] = {
+                EJIT_OPERAND_FPR(0, EJIT_TYPE(double)),
+        };
+        struct ejit_func *f = ejit_create_func(EJIT_TYPE(double), 1, operands);
+
+        ejit_sqrtr_d(f, EJIT_FPR(0), EJIT_FPR(0)); /* in place: fpr 0 is both source and destination */
+        ejit_retr_d(f, EJIT_FPR(0));
+
+        ejit_select_compile_func(f, 0, 1, EJIT_USE64(double), do_jit, true); /* 0 gprs, 1 fpr */
+
+        assert(erfd1(f, EJIT_ARG( 0.0, double)) == 0.0);
+        assert(erfd1(f, EJIT_ARG( 4.0, double)) == 2.0);
+        assert(erfd1(f, EJIT_ARG(-4.0, double))
+                        != erfd1(f, EJIT_ARG(-4.0, double))); // NaN never compares equal, even to itself
+}
diff --git a/tests/sqrtr_f.c b/tests/sqrtr_f.c
new file mode 100644
index 0000000..3baa00d
--- /dev/null
+++ b/tests/sqrtr_f.c
@@ -0,0 +1,23 @@
+#include <ejit/ejit.h>
+#include <assert.h>
+#include "do_jit.h"
+
+int main(int argc, char *argv[])
+{
+        (void)argv;
+        bool do_jit = argc > 1; /* any extra command line argument turns this on */
+        struct ejit_operand operands[1] = {
+                EJIT_OPERAND_FPR(0, EJIT_TYPE(float)),
+        };
+        struct ejit_func *f = ejit_create_func(EJIT_TYPE(float), 1, operands);
+
+        ejit_sqrtr_f(f, EJIT_FPR(0), EJIT_FPR(0)); /* in place: fpr 0 is both source and destination */
+        ejit_retr_f(f, EJIT_FPR(0));
+
+        ejit_select_compile_func(f, 0, 1, EJIT_USE64(float), do_jit, true); /* 0 gprs, 1 fpr */
+
+        assert(erff1(f, EJIT_ARG( 0.0, float)) == 0.0);
+        assert(erff1(f, EJIT_ARG( 4.0, float)) == 2.0);
+        assert(erff1(f, EJIT_ARG(-4.0, float))
+                        != erff1(f, EJIT_ARG(-4.0, float))); // NaN never compares equal, even to itself
+}
-- 
cgit v1.2.3


From 27362d8e0af2f2a39da69239dcd207e0ff20cced Mon Sep 17 00:00:00 2001
From: Kimplul <kimi.h.kuparinen@gmail.com>
Date: Tue, 1 Apr 2025 22:29:56 +0300
Subject: mips64el seems to work

---
 scripts/select-compile | 1 +
 1 file changed, 1 insertion(+)

(limited to 'scripts')

diff --git a/scripts/select-compile b/scripts/select-compile
index 878e214..5d934b2 100755
--- a/scripts/select-compile
+++ b/scripts/select-compile
@@ -9,6 +9,7 @@ JIT="src/compile/compile.c"
 NOJIT="src/compile/nocompile.c"
 
 case "$ARCH" in
+	mips64el) echo "$JIT" ;;
 	aarch64) echo "$JIT" ;;
 	mipsel) echo "$JIT" ;;
 	amd64) echo "$JIT" ;;
-- 
cgit v1.2.3


From a9b21a1d5c55939cf3db1f3d5c857760601adb3b Mon Sep 17 00:00:00 2001
From: Kimplul <kimi.h.kuparinen@gmail.com>
Date: Tue, 1 Apr 2025 22:41:35 +0300
Subject: powerpc64le seems to work

---
 scripts/select-compile | 1 +
 1 file changed, 1 insertion(+)

(limited to 'scripts')

diff --git a/scripts/select-compile b/scripts/select-compile
index 5d934b2..38fc7b0 100755
--- a/scripts/select-compile
+++ b/scripts/select-compile
@@ -9,6 +9,7 @@ JIT="src/compile/compile.c"
 NOJIT="src/compile/nocompile.c"
 
 case "$ARCH" in
+	powerpc64le) echo "$JIT" ;;
 	mips64el) echo "$JIT" ;;
 	aarch64) echo "$JIT" ;;
 	mipsel) echo "$JIT" ;;
-- 
cgit v1.2.3


From 6c2e51a3f8695cb95d6a4a6859d3f934e28c8f9f Mon Sep 17 00:00:00 2001
From: Kimplul <kimi.h.kuparinen@gmail.com>
Date: Wed, 2 Apr 2025 21:14:23 +0300
Subject: improve build system a bit

---
 scripts/makefile | 13 +++++++++----
 tests/makefile   | 34 +++++++++++++++++++++++++++-------
 2 files changed, 36 insertions(+), 11 deletions(-)

(limited to 'scripts')

diff --git a/scripts/makefile b/scripts/makefile
index 3a4f200..bd02c98 100644
--- a/scripts/makefile
+++ b/scripts/makefile
@@ -29,16 +29,21 @@ OBJCOPY		!= [ "$(LLVM)" != "0" ] \
 			&& echo llvm-objcopy \
 			|| echo $(CROSS_COMPILE)objcopy
 
-COMPILER	!= [ "$(LLVM)" != "0" ] \
-			&& echo clang --target="$(CROSS_COMPILE)" \
-			|| echo $(CROSS_COMPILE)gcc
+COMPILER	!= [ -n "$(CROSS_COMPILE)" ]						\
+			&& {								\
+				[ "$(LLVM)" != "0" ]					\
+					&& echo clang --target="$(CROSS_COMPILE)"	\
+					|| echo $(CROSS_COMPILE)gcc			\
+					;						\
+			}								\
+			|| echo $(CC)
 
 
 OBFLAGS		:= -g
 WARNFLAGS	:= -Wall -Wextra
 
 COMPILE_FLAGS	:= $(CFLAGS) $(WARNFLAGS) $(OPTFLAGS) $(LTOFLAGS) \
-		   $(OBFLAGS) $(ASSERTFLAGS) $(DEBUGFLAGS)
+		   $(OBFLAGS) $(DEBUGFLAGS)
 
 INCLUDE_FLAGS	:= -I include
 
diff --git a/tests/makefile b/tests/makefile
index 5aeef98..081170f 100644
--- a/tests/makefile
+++ b/tests/makefile
@@ -1,14 +1,34 @@
 include ./tests.mk
 
-LLVM			?= 0
-CROSS_COMPILE		:=
-COMPILER		!= [ "$(LLVM)" != "0" ] \
-				&& echo clang --target="$(CROSS_COMPILE)" \
-				|| echo $(CROSS_COMPILE)gcc
+LLVM		?= 0
+COMPILER	!= [ -n "$(CROSS_COMPILE)" ]						\
+			&& {								\
+				[ "$(LLVM)" != "0" ]					\
+					&& echo clang --target="$(CROSS_COMPILE)"	\
+					|| echo $(CROSS_COMPILE)gcc			\
+					;						\
+			}								\
+			|| echo $(CC)
 
-CFLAGS			:= -Wall -Wextra -O0 -g
+RELEASE		?= 0
+OPTFLAGS	!= [ "$(RELEASE)" != "0" ] \
+			&& echo "-O2" \
+			|| echo "-O0"
+
+LTO		?= 0
+LTOFLAGS	!= [ "$(LTO)" != "0" ] \
+			&& echo "-flto=auto"
+
+DEBUG		?= 1
+DEBUGFLAGS	!= [ "$(DEBUG)" != "0" ] \
+			&& echo "-DDEBUG=1" \
+			|| echo "-DNDEBUG=1"
+
+OBFLAGS			:= -g
+WARNFLAGS		:= -Wall -Wextra
 INCLUDE_FLAGS		:= -I include
-COMPILE_TEST		:= $(COMPILER) $(CFLAGS) $(INCLUDE_FLAGS)
+COMPILE_TEST		:= $(COMPILER) $(WARNFLAGS) $(OPTFLAGS) $(LTOFLAGS) \
+			   $(OBFLAGS) $(CFLAGS) $(DEBUGFLAGS) $(INCLUDE_FLAGS)
 
 .PHONY: check
 check: $(TESTS)
-- 
cgit v1.2.3


From 89d9f5c3fc59eff3b1a46fe6d44f5ee92eeb7be4 Mon Sep 17 00:00:00 2001
From: Kimplul <kimi.h.kuparinen@gmail.com>
Date: Wed, 2 Apr 2025 21:16:13 +0300
Subject: armhf seems to work

---
 scripts/select-compile |  1 +
 src/compile/compile.c  |  4 ++--
 src/ejit.c             |  1 +
 src/interp.c           | 10 +++++-----
 4 files changed, 9 insertions(+), 7 deletions(-)

(limited to 'scripts')

diff --git a/scripts/select-compile b/scripts/select-compile
index 38fc7b0..37c67a1 100755
--- a/scripts/select-compile
+++ b/scripts/select-compile
@@ -13,6 +13,7 @@ case "$ARCH" in
 	mips64el) echo "$JIT" ;;
 	aarch64) echo "$JIT" ;;
 	mipsel) echo "$JIT" ;;
+	armhf) echo "$JIT" ;;
 	amd64) echo "$JIT" ;;
 	x86*) echo "$JIT" ;;
 	*) echo "$NOJIT" ;;
diff --git a/src/compile/compile.c b/src/compile/compile.c
index dac8dfd..3b5399a 100644
--- a/src/compile/compile.c
+++ b/src/compile/compile.c
@@ -1927,7 +1927,7 @@ static void compile_trampoline(struct ejit_func *f, jit_state_t *j)
 			jit_leave_jit_abi(j, 0, 0, frame);
 			jit_ret(j); /* should just forward the return value */
 
-			f->direct_call = jit_address(j);
+			f->direct_call = jit_address_to_function_pointer(jit_address(j));
 			jit_patch_here(j, r);
 
 			operands_destroy(&args);
@@ -2435,7 +2435,7 @@ calli:
 	relocs_destroy(&relocs);
 	addrs_destroy(&addrs);
 
-	if (jit_end(j, &size))
+	if ((f->extern_call = jit_end(j, &size)))
 		return 0;
 
 	return size;
diff --git a/src/ejit.c b/src/ejit.c
index 0ee3986..75f6a6a 100644
--- a/src/ejit.c
+++ b/src/ejit.c
@@ -345,6 +345,7 @@ struct ejit_func *ejit_create_func(enum ejit_type rtype, size_t argc,
 	f->fpr = fpr_stats_create();
 	f->arena = NULL;
 	f->direct_call = NULL;
+	f->extern_call = NULL;
 	f->size = 0;
 	f->prio = 1;
 	f->use_64 = false;
diff --git a/src/interp.c b/src/interp.c
index 049498a..e7be77b 100644
--- a/src/interp.c
+++ b/src/interp.c
@@ -227,24 +227,24 @@ union interp_ret ejit_run(struct ejit_func *f, size_t paramc, struct ejit_arg pa
 
 	assert(f->size && "trying to run a function that hasn't been compiled");
 
-	if (f->arena) {
+	if (f->extern_call) {
 		if (f->rtype == EJIT_INT64 || f->rtype == EJIT_UINT64)
 			return (union interp_ret){
-				.i = ((ejit_escape_l_t)f->arena)(paramc, params)
+				.i = ((ejit_escape_l_t)f->extern_call)(paramc, params)
 			};
 
 		if (f->rtype == EJIT_DOUBLE)
 			return (union interp_ret){
-				.f = ((ejit_escape_d_t)f->arena)(paramc, params)
+				.f = ((ejit_escape_d_t)f->extern_call)(paramc, params)
 			};
 
 		if (f->rtype == EJIT_FLOAT)
 			return (union interp_ret){
-				.f = ((ejit_escape_f_t)f->arena)(paramc, params)
+				.f = ((ejit_escape_f_t)f->extern_call)(paramc, params)
 			};
 
 		return (union interp_ret){
-			.i = ((ejit_escape_i_t)f->arena)(paramc, params)
+			.i = ((ejit_escape_i_t)f->extern_call)(paramc, params)
 		};
 	}
 
-- 
cgit v1.2.3


From 441ddf9277f878b83c8d093def51b27285353fed Mon Sep 17 00:00:00 2001
From: Kimplul <kimi.h.kuparinen@gmail.com>
Date: Thu, 3 Apr 2025 18:26:02 +0300
Subject: fix example compilation

---
 scripts/makefile | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'scripts')

diff --git a/scripts/makefile b/scripts/makefile
index bd02c98..5178359 100644
--- a/scripts/makefile
+++ b/scripts/makefile
@@ -68,13 +68,13 @@ libejit.a: $(EJIT_OBJS)
 examples: examples/loop examples/fib examples/matrix_mult
 
 examples/matrix_mult: examples/matrix_mult.c libejit.a
-	$(COMPILE_EJIT) examples/matrix_mult.c libejit.a -o $@
+	$(COMPILE_EJIT) examples/matrix_mult.c libejit.a -o $@ -lm
 
 examples/loop: examples/loop.c libejit.a
-	$(COMPILE_EJIT) examples/loop.c libejit.a -o $@
+	$(COMPILE_EJIT) examples/loop.c libejit.a -o $@ -lm
 
 examples/fib: examples/fib.c libejit.a
-	$(COMPILE_EJIT) examples/fib.c libejit.a -o $@
+	$(COMPILE_EJIT) examples/fib.c libejit.a -o $@ -lm
 
 # might lint some common things twice
 .PHONY:
-- 
cgit v1.2.3


From 6824dd4b1ee22184f0e600115db3998924ed39d6 Mon Sep 17 00:00:00 2001
From: Kimplul <kimi.h.kuparinen@gmail.com>
Date: Wed, 9 Apr 2025 19:56:33 +0300
Subject: initial tail call stuff

---
 .gitignore            |   1 +
 deps/lightening       |   2 +-
 examples/fib.c        |   4 +-
 examples/sum.c        |  78 +++++++++++++++++++++++++++++++++++++
 include/ejit/ejit.h   |  12 ++----
 scripts/makefile      |   5 ++-
 src/common.h          |   7 +---
 src/compile/compile.c | 106 ++++++++++++++++++++++++++++++++++++++++++--------
 src/ejit.c            |  79 ++++++++++++++-----------------------
 src/interp.c          |  58 +++++++++++++--------------
 10 files changed, 236 insertions(+), 116 deletions(-)
 create mode 100644 examples/sum.c

(limited to 'scripts')

diff --git a/.gitignore b/.gitignore
index 93b2293..50343e4 100644
--- a/.gitignore
+++ b/.gitignore
@@ -7,4 +7,5 @@ test-*
 examples/matrix_mult
 examples/loop
 examples/fib
+examples/sum
 examples/*.d
diff --git a/deps/lightening b/deps/lightening
index 8d59c87..3f4127c 160000
--- a/deps/lightening
+++ b/deps/lightening
@@ -1 +1 @@
-Subproject commit 8d59c872edc5a21e7dd78768b291d0c4d8136e48
+Subproject commit 3f4127c3ef16177be55cf6153a206ca4f8a4859f
diff --git a/examples/fib.c b/examples/fib.c
index 999546b..fcb0659 100644
--- a/examples/fib.c
+++ b/examples/fib.c
@@ -20,12 +20,12 @@ struct ejit_func *compile(bool try_jit, bool im_scawed)
 	struct ejit_operand arg[1] = {
 		EJIT_OPERAND_GPR(0, EJIT_INT32)
 	};
-	ejit_calli_i(f, f, 1, arg);
+	ejit_calli(f, f, 1, arg);
 	ejit_retval(f, EJIT_GPR(1)); /* loc 1 contains temp result */
 
 	/* fib(n - 2) */
 	ejit_subi(f, EJIT_GPR(0), EJIT_GPR(0), 1);
-	ejit_calli_i(f, f, 1, arg);
+	ejit_calli(f, f, 1, arg);
 	ejit_retval(f, EJIT_GPR(0)); /* loc 0 now contains second temp result */
 
 	ejit_addr(f, EJIT_GPR(0), EJIT_GPR(0), EJIT_GPR(1)); /* add results */
diff --git a/examples/sum.c b/examples/sum.c
new file mode 100644
index 0000000..cc8f54b
--- /dev/null
+++ b/examples/sum.c
@@ -0,0 +1,78 @@
+#include <stdio.h>
+#include <time.h>
+
+#include "../include/ejit/ejit.h"
+
+struct ejit_func *compile(bool try_jit, bool im_scawed)
+{
+	struct ejit_operand params[2] = {
+		EJIT_OPERAND_GPR(0, EJIT_INT32), /* loc 0 contains s */
+		EJIT_OPERAND_GPR(1, EJIT_INT32)  /* loc 1 contains n */
+	};
+	struct ejit_func *f = ejit_create_func(EJIT_INT32, 2, params);
+
+	/* base case: when n == 0 return s, otherwise branch over the return */
+	struct ejit_reloc recurse = ejit_bnei(f, EJIT_GPR(1), 0);
+	ejit_retr(f, EJIT_GPR(0));
+	ejit_patch(f, recurse, ejit_label(f)); /* branch target is the code emitted next */
+
+	/* s += n */
+	ejit_addr(f, EJIT_GPR(0), EJIT_GPR(0), EJIT_GPR(1));
+
+	/* n -= 1 */
+	ejit_subi(f, EJIT_GPR(1), EJIT_GPR(1), 1);
+
+	struct ejit_operand args[2] = {
+		EJIT_OPERAND_GPR(0, EJIT_INT32), /* s */
+		EJIT_OPERAND_GPR(1, EJIT_INT32) /* n */
+	};
+
+	/* recurse: load this function's own address into gpr 2 and tail call through it */
+	ejit_movi(f, EJIT_GPR(2), (uintptr_t)f);
+	ejit_tailr(f, EJIT_GPR(2), 2, args);
+
+	ejit_select_compile_func(f, 3, 0, EJIT_USE64(uintptr_t), try_jit, im_scawed); /* 3 gprs, 0 fprs */
+	return f;
+}
+
+int main(int argc, char *argv[])
+{
+	if(argc != 4){
+		fprintf(stderr, "Usage: %s compile_num loop_num jit\n", argv[0]);
+		return -1;
+	}
+
+	int jit_level = strtoull(argv[3], 0, 0); /* >0: try_jit, >1: also im_scawed */
+	size_t compile_num = strtoull(argv[1], 0, 0);
+	struct ejit_func **info = calloc(compile_num, sizeof(struct ejit_func *));
+	if(!info || compile_num == 0){ /* info[0] is run below, so one compile is required */
+		fprintf(stderr, "compile_num must be at least 1 (or allocation failed)\n");
+		free(info);
+		return -1;
+	}
+
+	clock_t t = clock();
+	for(size_t i = 0; i < compile_num; ++i)
+		info[i] = compile(jit_level > 0, jit_level > 1);
+	t = clock() - t;
+
+	double compile_time_total = ((double)t) / CLOCKS_PER_SEC;
+	double compile_time_one = compile_time_total / compile_num;
+	printf("Compilation for n = %zu took %fs (1/%f).\n",
+			compile_num, compile_time_total, compile_time_one);
+
+	size_t run_num = strtoull(argv[2], 0, 0);
+	t = clock();
+	struct ejit_arg args[2] = {
+		(struct ejit_arg){.type = EJIT_INT32, .l = 0}, /* s */
+		(struct ejit_arg){.type = EJIT_INT32, .l = run_num} /* n */
+	};
+	int32_t result = ejit_run_func_i(info[0], 2, args); /* only the first copy is timed */
+	t = clock() - t;
+	printf("Running loop for n = %zu took %fs with res %ld\n",
+			run_num, ((double)t) / CLOCKS_PER_SEC, (long)result);
+
+	for(size_t i = 0; i < compile_num; ++i)
+		ejit_destroy_func(info[i]);
+	free(info);
+	return 0;
+}
diff --git a/include/ejit/ejit.h b/include/ejit/ejit.h
index 965103c..aa42eca 100644
--- a/include/ejit/ejit.h
+++ b/include/ejit/ejit.h
@@ -454,18 +454,12 @@ typedef double (*ejit_escape_d_t)(size_t argc, const struct ejit_arg args[argc])
 
 struct ejit_label ejit_label(struct ejit_func *s);
 
-void ejit_calli_i(struct ejit_func *s, struct ejit_func *f, size_t argc,
-                const struct ejit_operand args[argc]);
+void ejit_tailr(struct ejit_func *s, struct ejit_gpr target,
+		size_t argc, const struct ejit_operand args[argc]);
 
-void ejit_calli_l(struct ejit_func *s, struct ejit_func *f, size_t argc,
+void ejit_calli(struct ejit_func *s, struct ejit_func *f, size_t argc,
                 const struct ejit_operand args[argc]);
 
-void ejit_calli_f(struct ejit_func *s, struct ejit_func *f, size_t argc,
-                  const struct ejit_operand args[argc]);
-
-void ejit_calli_d(struct ejit_func *s, struct ejit_func *f, size_t argc,
-                  const struct ejit_operand args[argc]);
-
 void ejit_escapei_i(struct ejit_func *s, ejit_escape_i_t f, size_t argc,
                   const struct ejit_operand args[argc]);
 
diff --git a/scripts/makefile b/scripts/makefile
index 5178359..dbb7a1c 100644
--- a/scripts/makefile
+++ b/scripts/makefile
@@ -65,7 +65,7 @@ COMPILE_EJIT	= $(COMPILE) $(EJIT_FLAGS)
 libejit.a: $(EJIT_OBJS)
 	$(CROSS_COMPILE)ar rcs libejit.a $(EJIT_OBJS)
 
-examples: examples/loop examples/fib examples/matrix_mult
+examples: examples/loop examples/fib examples/sum examples/matrix_mult
 
 examples/matrix_mult: examples/matrix_mult.c libejit.a
 	$(COMPILE_EJIT) examples/matrix_mult.c libejit.a -o $@ -lm
@@ -76,6 +76,9 @@ examples/loop: examples/loop.c libejit.a
 examples/fib: examples/fib.c libejit.a
 	$(COMPILE_EJIT) examples/fib.c libejit.a -o $@ -lm
 
+examples/sum: examples/sum.c libejit.a
+	$(COMPILE_EJIT) examples/sum.c libejit.a -o $@ -lm
+
 # might lint some common things twice
 .PHONY:
 lint: $(TRISCV_LINTS)
diff --git a/src/common.h b/src/common.h
index 69f1441..3512717 100644
--- a/src/common.h
+++ b/src/common.h
@@ -218,10 +218,8 @@ enum ejit_opcode {
 	EJIT_OP_ESCAPEI_F,
 	EJIT_OP_ESCAPEI_D,
 
-	EJIT_OP_CALLI_I,
-	EJIT_OP_CALLI_L,
-	EJIT_OP_CALLI_F,
-	EJIT_OP_CALLI_D,
+	EJIT_OP_CALLI,
+	EJIT_OP_TAILR,
 
 	EJIT_OP_RETR,
 	EJIT_OP_RETI,
@@ -308,7 +306,6 @@ union interp_ret {
 
 union interp_ret ejit_run(struct ejit_func *f, size_t argc,
                      struct ejit_arg args[argc],
-		     bool run,
 		     void ***labels_wb);
 
 bool ejit_compile(struct ejit_func *f, bool use_64, bool im_scawed);
diff --git a/src/compile/compile.c b/src/compile/compile.c
index dcf662b..54d79f2 100644
--- a/src/compile/compile.c
+++ b/src/compile/compile.c
@@ -22,22 +22,22 @@ struct reloc_helper {
 /* skip assertions since we know they must be valid due to type checking earlier */
 static long checked_run_i(struct ejit_func *f, size_t argc, struct ejit_arg args[argc])
 {
-	return ejit_run(f, argc, args, true, NULL).i;
+	return ejit_run(f, argc, args, NULL).i;
 }
 
 static int64_t checked_run_l(struct ejit_func *f, size_t argc, struct ejit_arg args[argc])
 {
-	return ejit_run(f, argc, args, true, NULL).i;
+	return ejit_run(f, argc, args, NULL).i;
 }
 
 static float checked_run_f(struct ejit_func *f, size_t argc, struct ejit_arg args[argc])
 {
-	return ejit_run(f, argc, args, true, NULL).f;
+	return ejit_run(f, argc, args, NULL).f;
 }
 
 static double checked_run_d(struct ejit_func *f, size_t argc, struct ejit_arg args[argc])
 {
-	return ejit_run(f, argc, args, true, NULL).f;
+	return ejit_run(f, argc, args, NULL).f;
 }
 
 static void *alloc_arena(size_t size, bool im_scawed)
@@ -47,6 +47,11 @@ static void *alloc_arena(size_t size, bool im_scawed)
 	            MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
 }
 
+static void assert_helper(const char *msg) /* called from generated code in DEBUG builds */
+{
+	assert(false && msg); /* NOTE(review): always fails unless NDEBUG; prints the expression text, not the string in msg */
+}
+
 static void free_arena(void *arena, size_t size)
 {
 	munmap(arena, size);
@@ -2042,8 +2047,6 @@ static size_t compile_fn_body(struct ejit_func *f, jit_state_t *j, void *arena,
 	struct addrs addrs = addrs_create();
 	addrs_reserve(&addrs, insns_len(&f->insns));
 
-	void *call = NULL;
-
 	size_t label = 0;
 	foreach_vec(ii, f->insns) {
 		/* if we've hit a label, add it to our vector of label addresses */
@@ -2502,21 +2505,64 @@ static size_t compile_fn_body(struct ejit_func *f, jit_state_t *j, void *arena,
 			break;
 		}
 
-		case EJIT_OP_CALLI_L:
-#if __WORDSIZE == 64
-			 call = checked_run_l; goto calli;
-#else
-			  assert(0 && "trying to compile calli_l on 32bit arch");
-			  break;
+		case EJIT_OP_TAILR: {
+			/* this is admittedly a slightly roundabout way of
+			 * implementing tail calls and is arguably not the most
+			 * performant way (if it works at all, heh) but for now
+			 * I'm more interested in functionality than raw
+			 * performance. Currently only supports two gpr
+			 * registers, but should be fairly easy to extend with
+			 * fprs as well */
+
+			assert(operands_len(&direct) <= 2);
+			jit_gpr_t r = getloc(f, j, i.r1, 0);
+			jit_ldxi(j, JIT_R0, r, offsetof(struct ejit_func, direct_call));
+#if defined(DEBUG)
+			jit_reloc_t assert_reloc = jit_bnei(j, JIT_R0, 0); /* null */
+			jit_calli_1(j, assert_helper,
+					jit_operand_imm(JIT_OPERAND_ABI_POINTER,
+						(jit_imm_t)"trying to tail call interpreted function"));
+			jit_patch_here(j, assert_reloc);
 #endif
+			jit_operand_t regs[2] = {
+				jit_operand_gpr(JIT_OPERAND_ABI_WORD, JIT_R1),
+				jit_operand_gpr(JIT_OPERAND_ABI_WORD, JIT_R2)
+			};
+			jit_move_operands(j, regs, direct.buf, operands_len(&direct));
+
+			/* with args safely in registers, reset stack/state
+			 * while avoiding overwriting the call target */
+			jit_gpr_t tmp = get_callr_temp(j);
+			jit_movr(j, tmp, JIT_R0);
+
+			int frame_size = j->frame_size;
+			jit_shrink_stack(j, stack);
+			jit_leave_jit_abi(j, gprs, fprs, frame);
+
+			/* now move args into place */
+			jit_operand_t args[2] = {};
+			foreach_vec(oi, direct) {
+				args[oi] = *operands_at(&direct, oi);
+			}
 
-		case EJIT_OP_CALLI_F: { call = checked_run_f; goto calli; }
-		case EJIT_OP_CALLI_D: { call = checked_run_d; goto calli; }
-		case EJIT_OP_CALLI_I: { call = checked_run_i; goto calli;
-calli:
+			jit_locate_args(j, operands_len(&direct), args);
+			jit_move_operands(j, args, regs, operands_len(&direct));
+			jit_jmpr(j, tmp);
+			j->frame_size = frame_size;
+
+			operands_reset(&src);
+			operands_reset(&dst);
+			operands_reset(&direct);
+			break;
+		}
+
+		case EJIT_OP_CALLI: {
 			save_caller_save_regs(f, j);
 
 			struct ejit_func *f = (struct ejit_func *)(uintptr_t)i.o;
+#if __WORDSIZE != 64
+			assert(f->rtype != EJIT_INT64 && f->rtype != EJIT_UINT64);
+#endif
 			if (f && f->direct_call) {
 				jit_calli(j, f->direct_call, operands_len(&direct), direct.buf);
 				restore_caller_save_regs(f, j);
@@ -2535,6 +2581,16 @@ calli:
 				 * argument stack address */
 				jit_operand_gpr(JIT_OPERAND_ABI_POINTER, JIT_R0)
 			};
+
+			void *call = NULL;
+			switch (f->rtype) {
+			case EJIT_INT64:
+			case EJIT_UINT64: call = checked_run_l; break;
+			case EJIT_FLOAT: call = checked_run_f; break;
+			case EJIT_DOUBLE: call = checked_run_d; break;
+			default: call = checked_run_i; break;
+			}
+
 			compile_imm_call(j, &src, &dst, call, 3, args);
 			restore_caller_save_regs(f, j);
 
@@ -2552,39 +2608,55 @@ calli:
 			jit_gpr_t r = getloc(f, j, i.r1, 0);
 			/* R0 won't get overwritten by jit_leave_jit_abi */
 			jit_movr(j, JIT_R0, r);
+
+			/* keep track of frame size so we can continue
+			 * generating code after 'leaving' the ABI. Bit of a
+			 * hack, should maybe codify this better in the
+			 * lightening API? */
+			int frame_size = j->frame_size;
 			jit_shrink_stack(j, stack);
 			jit_leave_jit_abi(j, gprs, fprs, frame);
 			jit_retr(j, JIT_R0);
+			j->frame_size = frame_size;
 			break;
 		}
 
 		case EJIT_OP_RETR_F: {
 			jit_fpr_t r = getloc_f(f, j, i.r1, 0);
 			jit_movr_f(j, JIT_F0, r);
+
+			int frame_size = j->frame_size;
 			jit_shrink_stack(j, stack);
 			jit_leave_jit_abi(j, gprs, fprs, frame);
 			jit_retr_f(j, JIT_F0);
+			j->frame_size = frame_size;
 			break;
 		}
 
 		case EJIT_OP_RETR_D: {
 			jit_fpr_t r = getloc_d(f, j, i.r1, 0);
 			jit_movr_d(j, JIT_F0, r);
+
+			int frame_size = j->frame_size;
 			jit_shrink_stack(j, stack);
 			jit_leave_jit_abi(j, gprs, fprs, frame);
 			jit_retr_d(j, JIT_F0);
+			j->frame_size = frame_size;
 			break;
 		}
 
 		case EJIT_OP_RETI: {
+			int frame_size = j->frame_size;
 			jit_shrink_stack(j, stack);
 			jit_leave_jit_abi(j, gprs, fprs, frame);
 			jit_reti(j, i.o);
+			j->frame_size = frame_size;
 			break;
 		}
 
 		case EJIT_OP_END: {
-			/* 'void' return */
+			/* 'void' return, must be last thing in function so no
+			 * need to keep track of frame size */
 			jit_shrink_stack(j, stack);
 			jit_leave_jit_abi(j, gprs, fprs, frame);
 			jit_reti(j, 0);
diff --git a/src/ejit.c b/src/ejit.c
index 571a274..059d5d4 100644
--- a/src/ejit.c
+++ b/src/ejit.c
@@ -414,7 +414,7 @@ void ejit_select_compile_func(struct ejit_func *f, size_t gpr, size_t fpr,
 
 	void **labels;
 	/* just get labels, don't actually run anything yet */
-	ejit_run(f, 0, NULL, false, &labels);
+	ejit_run(f, 0, NULL, &labels);
 
 	foreach_vec(ii, f->insns) {
 		struct ejit_insn i = *insns_at(&f->insns, ii);
@@ -456,67 +456,48 @@ void ejit_patch(struct ejit_func *f, struct ejit_reloc r, struct ejit_label l)
 	*insns_at(&f->insns, r.insn) = i;
 }
 
-void ejit_calli_i(struct ejit_func *s, struct ejit_func *f, size_t argc,
+void ejit_tailr(struct ejit_func *s, struct ejit_gpr target, size_t argc,
                 const struct ejit_operand args[argc])
 {
 	s->max_args = argc > s->max_args ? argc : s->max_args;
-	check_operands(f, argc, args);
 
+	/** @todo check that gpr_args <= 2 and fpr_args <= 3 (?) */
+	size_t gpr_args = 0, fpr_args = 0;
 	for (size_t i = 0; i < argc; ++i) {
 		switch (args[i].kind) {
-		case EJIT_OPERAND_GPR: emit_insn_ar(s, EJIT_OP_ARG, i, args[i].type, EJIT_GPR(args[i].r)); break;
-		case EJIT_OPERAND_FPR: emit_insn_af(s, EJIT_OP_ARG_F, i, args[i].type, EJIT_FPR(args[i].r)); break;
-		case EJIT_OPERAND_IMM: emit_insn_ai(s, EJIT_OP_ARG_I, i, args[i].type, args[i].r); break;
-		case EJIT_OPERAND_FLT: emit_insn_ad(s, EJIT_OP_ARG_FI, i, args[i].type, args[i].d); break;
-		default: abort();
-		}
-	}
-
-	emit_insn_op(s, EJIT_OP_CALLI_I, f);
-}
-
-void ejit_calli_l(struct ejit_func *s, struct ejit_func *f, size_t argc,
-                const struct ejit_operand args[argc])
-{
-	s->use_64 = true;
-	s->max_args = argc > s->max_args ? argc : s->max_args;
-	check_operands(f, argc, args);
+		case EJIT_OPERAND_GPR:
+			gpr_args++;
+			emit_insn_ar(s, EJIT_OP_ARG, i, args[i].type, EJIT_GPR(args[i].r));
+			break;
 
-	for (size_t i = 0; i < argc; ++i) {
-		switch (args[i].kind) {
-		case EJIT_OPERAND_GPR: emit_insn_ar(s, EJIT_OP_ARG, i, args[i].type, EJIT_GPR(args[i].r)); break;
-		case EJIT_OPERAND_FPR: emit_insn_af(s, EJIT_OP_ARG_F, i, args[i].type, EJIT_FPR(args[i].r)); break;
-		case EJIT_OPERAND_IMM: emit_insn_ai(s, EJIT_OP_ARG_I, i, args[i].type, args[i].r); break;
-		case EJIT_OPERAND_FLT: emit_insn_ad(s, EJIT_OP_ARG_FI, i, args[i].type, args[i].d); break;
-		default: abort();
-		}
-	}
+		case EJIT_OPERAND_FPR:
+			fpr_args++;
+			emit_insn_af(s, EJIT_OP_ARG_F, i, args[i].type, EJIT_FPR(args[i].r));
+			break;
 
-	emit_insn_op(s, EJIT_OP_CALLI_L, f);
-}
+		case EJIT_OPERAND_IMM:
+			gpr_args++;
+			emit_insn_ai(s, EJIT_OP_ARG_I, i, args[i].type, args[i].r);
+			break;
 
-void ejit_calli_f(struct ejit_func *s, struct ejit_func *f, size_t argc,
-                const struct ejit_operand args[argc])
-{
-	s->max_args = argc > s->max_args ? argc : s->max_args;
-	check_operands(f, argc, args);
+		case EJIT_OPERAND_FLT:
+			fpr_args++;
+			emit_insn_ad(s, EJIT_OP_ARG_FI, i, args[i].type, args[i].d);
+			break;
 
-	for (size_t i = 0; i < argc; ++i) {
-		switch (args[i].kind) {
-		case EJIT_OPERAND_GPR: emit_insn_ar(s, EJIT_OP_ARG, i, args[i].type, EJIT_GPR(args[i].r)); break;
-		case EJIT_OPERAND_FPR: emit_insn_af(s, EJIT_OP_ARG_F, i, args[i].type, EJIT_FPR(args[i].r)); break;
-		case EJIT_OPERAND_IMM: emit_insn_ai(s, EJIT_OP_ARG_I, i, args[i].type, args[i].r); break;
-		case EJIT_OPERAND_FLT: emit_insn_ad(s, EJIT_OP_ARG_FI, i, args[i].type, args[i].d); break;
 		default: abort();
 		}
 	}
 
-	emit_insn_op(s, EJIT_OP_CALLI_F, f);
+	assert(gpr_args <= 2 && fpr_args == 0
+			&& "only 2 gpr args and 0 fpr args supported in tail calls for now");
+	emit_insn_oxr(s, EJIT_OP_TAILR, target);
 }
 
-void ejit_calli_d(struct ejit_func *s, struct ejit_func *f, size_t argc,
+void ejit_calli(struct ejit_func *s, struct ejit_func *f, size_t argc,
                 const struct ejit_operand args[argc])
 {
+	s->use_64 |= f->rtype == EJIT_INT64 || f->rtype == EJIT_UINT64; /* sticky: only ever set, never cleared */
 	s->max_args = argc > s->max_args ? argc : s->max_args;
 	check_operands(f, argc, args);
 
@@ -530,7 +511,7 @@ void ejit_calli_d(struct ejit_func *s, struct ejit_func *f, size_t argc,
 		}
 	}
 
-	emit_insn_op(s, EJIT_OP_CALLI_D, f);
+	emit_insn_op(s, EJIT_OP_CALLI, f);
 }
 
 void ejit_escapei_i(struct ejit_func *s, ejit_escape_i_t f, size_t argc,
@@ -1712,7 +1693,7 @@ long ejit_run_func_i(struct ejit_func *f, size_t argc,
 #endif
 		);
 
-	return ejit_run(f, argc, args, true, NULL).i;
+	return ejit_run(f, argc, args, NULL).i;
 }
 
 int64_t ejit_run_func_l(struct ejit_func *f, size_t argc,
@@ -1720,21 +1701,21 @@ int64_t ejit_run_func_l(struct ejit_func *f, size_t argc,
 {
 	check_args(f, argc, args);
 	assert(f->rtype == EJIT_INT64 || f->rtype == EJIT_UINT64);
-	return ejit_run(f, argc, args, true, NULL).i;
+	return ejit_run(f, argc, args, NULL).i;
 }
 
 float ejit_run_func_f(struct ejit_func *f, size_t argc, struct ejit_arg args[argc])
 {
 	check_args(f, argc, args);
 	assert(f->rtype == EJIT_FLOAT);
-	return ejit_run(f, argc, args, true, NULL).f;
+	return ejit_run(f, argc, args, NULL).f;
 }
 
 double ejit_run_func_d(struct ejit_func *f, size_t argc, struct ejit_arg args[argc])
 {
 	check_args(f, argc, args);
 	assert(f->rtype == EJIT_DOUBLE);
-	return ejit_run(f, argc, args, true, NULL).f;
+	return ejit_run(f, argc, args, NULL).f;
 }
 
 struct ejit_arg ejit_run_func(struct ejit_func *f, size_t argc, struct ejit_arg args[argc])
diff --git a/src/interp.c b/src/interp.c
index 6ef414d..132ba4a 100644
--- a/src/interp.c
+++ b/src/interp.c
@@ -5,7 +5,7 @@
 /* this is the body of a given ejit_interp function, it assumes there's an
  * external int64_t retval and double retval_f into which it places the value to
  * be returned. Included from src/interp.c */
-union interp_ret ejit_run(struct ejit_func *f, size_t paramc, struct ejit_arg params[paramc], bool run, void ***labels_wb)
+union interp_ret ejit_run(struct ejit_func *f, size_t paramc, struct ejit_arg params[paramc], void ***labels_wb)
 {
 	static void *labels[EJIT_OPCODE_COUNT] = {
 		[EJIT_OP_MOVI] = &&MOVI,
@@ -213,10 +213,9 @@ union interp_ret ejit_run(struct ejit_func *f, size_t paramc, struct ejit_arg pa
 		[EJIT_OP_PARAM] = &&PARAM,
 		[EJIT_OP_PARAM_F] = &&PARAM_F,
 
-		[EJIT_OP_CALLI_I] = &&CALLI_I,
-		[EJIT_OP_CALLI_L] = &&CALLI_L,
-		[EJIT_OP_CALLI_F] = &&CALLI_F,
-		[EJIT_OP_CALLI_D] = &&CALLI_D,
+		[EJIT_OP_CALLI] = &&CALLI,
+		[EJIT_OP_TAILR] = &&TAILR,
+
 		[EJIT_OP_ESCAPEI_I] = &&ESCAPEI_I,
 		[EJIT_OP_ESCAPEI_F] = &&ESCAPEI_F,
 		[EJIT_OP_ESCAPEI_L] = &&ESCAPEI_L,
@@ -226,13 +225,12 @@ union interp_ret ejit_run(struct ejit_func *f, size_t paramc, struct ejit_arg pa
 		[EJIT_OP_END] = &&END,
 	};
 
-	if (!run) {
+	if (labels_wb) {
 		*labels_wb = labels;
 		return (union interp_ret){.i = 0};
 	}
 
 	assert(f->size && "trying to run a function that hasn't been compiled");
-
 	if (f->extern_call) {
 		if (f->rtype == EJIT_INT64 || f->rtype == EJIT_UINT64)
 			return (union interp_ret){
@@ -254,8 +252,8 @@ union interp_ret ejit_run(struct ejit_func *f, size_t paramc, struct ejit_arg pa
 		};
 	}
 
-	int64_t retval = 0; double retval_f = 0.0;
-
+top:
+	union interp_ret retval = {.i = 0};
 	union fpr {
 		double d;
 		float f;
@@ -994,15 +992,15 @@ union interp_ret ejit_run(struct ejit_func *f, size_t paramc, struct ejit_arg pa
 	DISPATCH();
 
 	DO(RETVAL);
-	gpr[i.r0] = retval;
+	gpr[i.r0] = retval.i;
 	DISPATCH();
 
 	DO(RETVAL_F);
-	fpr[i.r0].f = retval_f;
+	fpr[i.r0].f = retval.f;
 	DISPATCH();
 
 	DO(RETVAL_D);
-	fpr[i.r0].d = retval_f;
+	fpr[i.r0].d = retval.f;
 	DISPATCH();
 
 	DO(PARAM);
@@ -1058,51 +1056,47 @@ union interp_ret ejit_run(struct ejit_func *f, size_t paramc, struct ejit_arg pa
 	args[argc++] = a;
 	DISPATCH();
 
-	DO(CALLI_I);
-	struct ejit_func *f = i.p;
-	retval = ejit_run(f, argc, args, true, NULL).i;
-	argc = 0;
-	DISPATCH();
+	DO(TAILR);
+	f = (struct ejit_func *)gpr[i.r1];
 
-	DO(CALLI_L);
-	struct ejit_func *f = i.p;
-	retval = ejit_run(f, argc, args, true, NULL).i;
-	argc = 0;
-	DISPATCH();
+	/** @todo we could potentially just interpret the func as a fallback
+	 * instead of aborting here, but this is good enough for now */
+	assert(!f->direct_call && "trying to interpret compiled fun");
 
-	DO(CALLI_F);
-	struct ejit_func *f = i.p;
-	retval_f = ejit_run(f, argc, args, true, NULL).f;
-	argc = 0;
+	paramc = argc;
+	for (size_t i = 0; i < argc; ++i)
+		params[i] = args[i];
+
+	goto top;
 	DISPATCH();
 
-	DO(CALLI_D);
+	DO(CALLI);
 	struct ejit_func *f = i.p;
-	retval_f = ejit_run(f, argc, args, true, NULL).f;
+	retval = ejit_run(f, argc, args, NULL);
 	argc = 0;
 	DISPATCH();
 
 	DO(ESCAPEI_I);
 	ejit_escape_i_t f = i.p;
-	retval = f(argc, args);
+	retval.i = f(argc, args);
 	argc = 0;
 	DISPATCH();
 
 	DO(ESCAPEI_L);
 	ejit_escape_l_t f = i.p;
-	retval = f(argc, args);
+	retval.i = f(argc, args);
 	argc = 0;
 	DISPATCH();
 
 	DO(ESCAPEI_F);
 	ejit_escape_f_t f = i.p;
-	retval_f = f(argc, args);
+	retval.f = f(argc, args);
 	argc = 0;
 	DISPATCH();
 
 	DO(ESCAPEI_D);
 	ejit_escape_d_t f = i.p;
-	retval_f = f(argc, args);
+	retval.f = f(argc, args);
 	argc = 0;
 	DISPATCH();
 
-- 
cgit v1.2.3


From 531d307d310881e69efc8ae8c8119f5f5799e0f9 Mon Sep 17 00:00:00 2001
From: Kimplul <kimi.h.kuparinen@gmail.com>
Date: Thu, 10 Apr 2025 22:22:33 +0300
Subject: improve tests a bit

+ Runs tests on all arches that I have easy access to
---
 Makefile              | 125 ++++++++++++++++++++++++++++++++++++++++++++++----
 scripts/gen-tests     |   2 +
 src/compile/compile.c |  53 +++++++++++++++------
 src/interp.c          |   2 +-
 tests/makefile        |  10 ----
 5 files changed, 157 insertions(+), 35 deletions(-)

(limited to 'scripts')

diff --git a/Makefile b/Makefile
index 7a482b7..548e5b8 100644
--- a/Makefile
+++ b/Makefile
@@ -12,16 +12,121 @@ check: all
 	@./scripts/gen-tests $$(echo tests/*.c)
 	$(MAKE) -f tests/makefile check
 
-# this kicks all unrecognised targets to the client script.
-# note that trying to compile individual files, e.g.
-#
-#	make kernel.elf
-#
-# will not work, you would need
-#
-#	make -f scripts/makefile kernel.elf
-#
-# instead
+# supported by jit
+.PHONY: check_linux_amd64
+check_linux_amd64:
+	$(MAKE) clean
+	QEMU_LD_PREFIX=/usr/x86_64-linux-gnu \
+	$(MAKE) ARCH=amd64 CROSS_COMPILE=x86_64-linux-gnu- check
+
+.PHONY: check_linux_x86
+check_linux_x86:
+	$(MAKE) clean
+	QEMU_LD_PREFIX=/usr/i686-linux-gnu \
+	$(MAKE) ARCH=x86 CROSS_COMPILE=i686-linux-gnu- check
+
+.PHONY: check_linux_aarch64
+check_linux_aarch64:
+	$(MAKE) clean
+	QEMU_LD_PREFIX=/usr/aarch64-linux-gnu \
+	$(MAKE) ARCH=aarch64 CROSS_COMPILE=aarch64-linux-gnu- check
+
+.PHONY: check_linux_armhf
+check_linux_armhf:
+	$(MAKE) clean
+	QEMU_LD_PREFIX=/usr/arm-linux-gnueabihf \
+	$(MAKE) ARCH=armhf CROSS_COMPILE=arm-linux-gnueabihf- check
+
+.PHONY: check_linux_powerpc64le
+check_linux_powerpc64le:
+	$(MAKE) clean
+	QEMU_LD_PREFIX=/usr/powerpc64le-linux-gnu \
+	$(MAKE) ARCH=powerpc64le CROSS_COMPILE=powerpc64le-linux-gnu- check
+
+# note: older than revision 6
+.PHONY: check_linux_mips64el
+check_linux_mips64el:
+	$(MAKE) clean
+	QEMU_LD_PREFIX=/usr/mips64el-linux-gnuabi64 \
+	$(MAKE) ARCH=mips64el CROSS_COMPILE=mips64el-linux-gnuabi64- check
+
+.PHONY: check_linux_mipsel
+check_linux_mipsel:
+	$(MAKE) clean
+	QEMU_LD_PREFIX=/usr/mipsel-linux-gnu \
+	$(MAKE) ARCH=mipsel CROSS_COMPILE=mipsel-linux-gnu- check
+
+# not supported by jit atm
+.PHONY: check_linux_powerpc64
+check_linux_powerpc64:
+	$(MAKE) clean
+	QEMU_LD_PREFIX=/usr/powerpc64-linux-gnu \
+	$(MAKE) ARCH=powerpc64 CROSS_COMPILE=powerpc64-linux-gnu- check
+
+.PHONY: check_linux_powerpc
+check_linux_powerpc:
+	$(MAKE) clean
+	QEMU_LD_PREFIX=/usr/powerpc-linux-gnu \
+	$(MAKE) ARCH=powerpc CROSS_COMPILE=powerpc-linux-gnu- check
+
+.PHONY: check_linux_sparc64
+check_linux_sparc64:
+	$(MAKE) clean
+	QEMU_LD_PREFIX=/usr/sparc64-linux-gnu \
+	$(MAKE) ARCH=sparc64 CROSS_COMPILE=sparc64-linux-gnu- check
+
+.PHONY: check_linux_riscv64
+check_linux_riscv64:
+	$(MAKE) clean
+	QEMU_LD_PREFIX=/usr/riscv64-linux-gnu \
+	$(MAKE) ARCH=riscv64 CROSS_COMPILE=riscv64-linux-gnu- check
+
+.PHONY: check_linux_s390x
+check_linux_s390x:
+	$(MAKE) clean
+	QEMU_LD_PREFIX=/usr/s390x-linux-gnu \
+	$(MAKE) ARCH=s390x CROSS_COMPILE=s390x-linux-gnu- check
+
+.PHONY: check_linux_alpha
+check_linux_alpha:
+	$(MAKE) clean
+	QEMU_LD_PREFIX=/usr/alpha-linux-gnu \
+	$(MAKE) ARCH=alpha CROSS_COMPILE=alpha-linux-gnu- check
+
+.PHONY: check_linux_hppa
+check_linux_hppa:
+	$(MAKE) clean
+	QEMU_LD_PREFIX=/usr/hppa-linux-gnu \
+	$(MAKE) ARCH=hppa CROSS_COMPILE=hppa-linux-gnu- check
+
+.PHONY: check_linux_m68k
+check_linux_m68k:
+	$(MAKE) clean
+	QEMU_LD_PREFIX=/usr/m68k-linux-gnu \
+	$(MAKE) ARCH=m68k CROSS_COMPILE=m68k-linux-gnu- check
+
+.PHONY: check_linux
+check_linux:
+	$(MAKE) check_linux_amd64
+	$(MAKE) check_linux_x86
+	$(MAKE) check_linux_aarch64
+	$(MAKE) check_linux_armhf
+	$(MAKE) check_linux_powerpc64le
+	$(MAKE) check_linux_mips64el
+	$(MAKE) check_linux_mipsel
+	$(MAKE) check_linux_powerpc64
+	$(MAKE) check_linux_powerpc
+	$(MAKE) check_linux_sparc64
+	$(MAKE) check_linux_riscv64
+	$(MAKE) check_linux_s390x
+	$(MAKE) check_linux_alpha
+	$(MAKE) check_linux_hppa
+	$(MAKE) check_linux_m68k
+	# compiler+emulator not available or broken
+	#$(MAKE) check_linux_hppa64
+	#$(MAKE) check_linux_arc
+	#$(MAKE) check_linux_sh4
+
 .DEFAULT: setup
 	$(MAKE) -f scripts/makefile $<
 
diff --git a/scripts/gen-tests b/scripts/gen-tests
index deac247..47ff9c9 100755
--- a/scripts/gen-tests
+++ b/scripts/gen-tests
@@ -12,4 +12,6 @@ do
 	echo "-include ${dep}"					>> tests.mk
 	echo "${exe}: ${s} libejit.a"				>> tests.mk
 	echo "	\$(COMPILE_TEST) ${s} libejit.a -o ${exe} -lm"	>> tests.mk
+	echo "	./${exe}  \t# bytecode"				>> tests.mk
+	echo "	./${exe} 1\t# jit"				>> tests.mk
 done
diff --git a/src/compile/compile.c b/src/compile/compile.c
index bfcb12d..5432bc1 100644
--- a/src/compile/compile.c
+++ b/src/compile/compile.c
@@ -52,6 +52,19 @@ static void assert_helper(const char *msg)
 	assert(false && msg);
 }
 
+static bool gpr_free(size_t argc, jit_operand_t args[argc], jit_gpr_t r)
+{
+	for (size_t i = 0; i < argc; ++i) {
+		if (args[i].kind != JIT_OPERAND_KIND_GPR)
+			continue;
+
+		if (jit_gpr_regno(args[i].loc.gpr.gpr) == jit_gpr_regno(r))
+			return false;
+	}
+
+	return true;
+}
+
 static void free_arena(void *arena, size_t size)
 {
 	munmap(arena, size);
@@ -2489,7 +2502,7 @@ static size_t compile_fn_body(struct ejit_func *f, jit_state_t *j, void *arena,
 				 * argument stack address */
 				jit_operand_gpr(JIT_OPERAND_ABI_POINTER, JIT_R0)
 			};
-			compile_imm_call(j, &src, &dst, (void *)(uintptr_t)i.o, 2, args);
+			compile_imm_call(j, &src, &dst, (void *)i.p, 2, args);
 			restore_caller_save_regs(f, j);
 
 			operands_reset(&src);
@@ -2502,7 +2515,7 @@ static size_t compile_fn_body(struct ejit_func *f, jit_state_t *j, void *arena,
 			/* a bit of copy-paste between this and the next func,
 			 * hmm */
 			assert(operands_len(&direct) <= 2);
-			struct ejit_func *f = (struct ejit_func *)(uintptr_t)i.o;
+			struct ejit_func *f = (struct ejit_func *)i.p;
 			assert(f->direct_call);
 
 			jit_operand_t regs[2] = {
@@ -2560,32 +2573,44 @@ static size_t compile_fn_body(struct ejit_func *f, jit_state_t *j, void *arena,
 						(jit_imm_t)"trying to tail call interpreted function"));
 			jit_patch_here(j, direct_reloc);
 #endif
+			size_t argc = operands_len(&direct);
 
+			/* r0 = target, r1 = arg1, r2 = arg2 */
 			jit_ldxi(j, JIT_R0, r, offsetof(struct ejit_func, direct_call));
-			jit_operand_t regs[2] = {
+			jit_operand_t regs[3] = {
 				jit_operand_gpr(JIT_OPERAND_ABI_WORD, JIT_R1),
 				jit_operand_gpr(JIT_OPERAND_ABI_WORD, JIT_R2)
 			};
-			jit_move_operands(j, regs, direct.buf, operands_len(&direct));
+			jit_move_operands(j, regs, direct.buf, argc);
 
 			/* with args safely in registers, reset stack/state
 			 * while avoiding overwriting the call target */
-			jit_gpr_t tmp = get_callr_temp(j);
-			jit_movr(j, tmp, JIT_R0);
-
 			int frame_size = j->frame_size;
 			jit_shrink_stack(j, stack);
 			jit_leave_jit_abi(j, gprs, fprs, frame);
 
-			/* now move args into place */
-			jit_operand_t args[2] = {};
-			foreach_vec(oi, direct) {
+			/* now move args into place, making sure we avoid our
+			 * target register  */
+			jit_operand_t args[3] = {};
+			for (size_t oi = 0; oi < argc; ++oi) {
 				args[oi] = *operands_at(&direct, oi);
 			}
 
-			jit_locate_args(j, operands_len(&direct), args);
-			jit_move_operands(j, args, regs, operands_len(&direct));
-			jit_jmpr(j, tmp);
+			jit_locate_args(j, argc, args);
+
+			/* we know that at least one gpr must be free */
+			jit_gpr_t target = gpr_free(argc, args, JIT_R0) ? JIT_R0
+				         : gpr_free(argc, args, JIT_R1) ? JIT_R1
+					 : gpr_free(argc, args, JIT_R2) ? JIT_R2
+					 : (abort(), JIT_R0);
+
+			/* move our target in JIT_R0 to whatever the free
+			 * register is to avoid it being clobbered when we move
+			 * the actual arguments */
+			args[argc] = jit_operand_gpr(JIT_OPERAND_ABI_POINTER, target);
+			regs[argc] = jit_operand_gpr(JIT_OPERAND_ABI_POINTER, JIT_R0);
+			jit_move_operands(j, args, regs, argc + 1);
+			jit_jmpr(j, target);
 			j->frame_size = frame_size;
 
 			operands_reset(&src);
@@ -2641,7 +2666,7 @@ static size_t compile_fn_body(struct ejit_func *f, jit_state_t *j, void *arena,
 		case EJIT_OP_CALLI: {
 			save_caller_save_regs(f, j);
 
-			struct ejit_func *f = (struct ejit_func *)(uintptr_t)i.o;
+			struct ejit_func *f = (struct ejit_func *)i.p;
 #if __WORDSIZE != 64
 			assert(f->rtype != EJIT_INT64 && f->rtype != EJIT_UINT64);
 #endif
diff --git a/src/interp.c b/src/interp.c
index 6f94f98..894be30 100644
--- a/src/interp.c
+++ b/src/interp.c
@@ -1063,7 +1063,7 @@ top:
 	DISPATCH();
 
 	DO(TAILI);
-	f = (struct ejit_func *)(uintptr_t)i.o;
+	f = (struct ejit_func *)i.p;
 
 	assert(!f->direct_call && "trying to interpret compiled fun");
 
diff --git a/tests/makefile b/tests/makefile
index 081170f..53115de 100644
--- a/tests/makefile
+++ b/tests/makefile
@@ -32,14 +32,4 @@ COMPILE_TEST		:= $(COMPILER) $(WARNFLAGS) $(OPTFLAGS) $(LTOFLAGS) \
 
 .PHONY: check
 check: $(TESTS)
-	@echo "Running bytecode tests..."
-	@set -e; for test in $(TESTS); do \
-		echo "Testing: $$test"; \
-		./$$test; \
-	done
-	@echo "Running jit tests..."
-	@set -e; for test in $(TESTS); do \
-		echo "Testing: $$test"; \
-		./$$test 1; \
-	done
 	@echo "Success!"
-- 
cgit v1.2.3