#include <ejit/ejit.h>
#include "../../deps/lightening/lightening/lightening.c"
#include "../common.h"

/* instantiate the ../vec.h container for jit_operand_t elements; VEC_NAME
 * presumably selects the 'operands' prefix of the generated type and helpers
 * (compare struct addrs / addrs_at() used further down) */
#define VEC_TYPE jit_operand_t
#define VEC_NAME operands
#include "../vec.h"

/* a lightening relocation together with the index it refers to */
struct reloc_helper {
	/* relocation handle as handed out by lightening */
	jit_reloc_t r;
	/* NOTE(review): presumably the index of the target whose address ends
	 * up in the addrs vector -- confirm against the users of struct relocs */
	size_t to;
};

/* instantiate the ../vec.h container for struct reloc_helper elements under
 * the name 'relocs' */
#define VEC_TYPE struct reloc_helper
#define VEC_NAME relocs
#include "../vec.h"

/* instantiate the ../vec.h container for jit_addr_t elements under the name
 * 'addrs'; compile_label() stores addresses into it through addrs_at() */
#define VEC_TYPE jit_addr_t
#define VEC_NAME addrs
#include "../vec.h"

/* skip assertions since we know they must be valid due to type checking earlier
 *
 * forwards f and its arguments to ejit_run() and returns the .i member of the
 * result as a long */
static long checked_run_i(struct ejit_func *f, size_t argc, struct ejit_arg args[argc])
{
	long ret = ejit_run(f, argc, args, true, NULL).i;
	return ret;
}

/* forwards f and its arguments to ejit_run() and returns the .i member of the
 * result as an int64_t */
static int64_t checked_run_l(struct ejit_func *f, size_t argc, struct ejit_arg args[argc])
{
	int64_t ret = ejit_run(f, argc, args, true, NULL).i;
	return ret;
}

/* forwards f and its arguments to ejit_run() and returns the .f member of the
 * result converted to float */
static float checked_run_f(struct ejit_func *f, size_t argc, struct ejit_arg args[argc])
{
	float ret = ejit_run(f, argc, args, true, NULL).f;
	return ret;
}

/* forwards f and its arguments to ejit_run() and returns the .f member of the
 * result as a double */
static double checked_run_d(struct ejit_func *f, size_t argc, struct ejit_arg args[argc])
{
	double ret = ejit_run(f, argc, args, true, NULL).f;
	return ret;
}

/* map size bytes of private anonymous memory. The mapping is always readable
 * and writable, and additionally executable unless im_scawed is set. The
 * mmap() result is returned untouched, so failure is reported the way mmap()
 * reports it (MAP_FAILED), not as NULL. */
static void *alloc_arena(size_t size, bool im_scawed)
{
	int prot = PROT_READ | PROT_WRITE;
	if (!im_scawed)
		prot |= PROT_EXEC;

	return mmap(NULL, size, prot, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
}

/* unmap size bytes starting at arena; the munmap() result is not checked */
static void free_arena(void *arena, size_t size)
{
	(void)munmap(arena, size);
}

/* value slots are mapped to physical registers such that the first
 * available callee-save register is used. When those run out, skip R0/R1/R2 as
 * they are reserved for transferring values to/from the stack, continue with
 * caller-save. When those run out, put slots on stack.
 *
 * The reasoning here is that callee-save are stored to the stack once, and
 * after that are 'free', whereas caller-save registers must be stored/restored
 * before/after each call. With some more advanced liveness analysis we might be
 * able to avoid storing most caller-save registers, but the simple scheme I
 * have here just pushes all regs so try to avoid that as much as possible.
 */
/* number of physical gprs that value slots can be mapped to: all callee-save
 * (V) registers plus the caller-save (R) registers, minus the three (R0-R2)
 * that are held back as temporaries.
 *
 * Declared with (void) so that this is a real prototype; an empty parameter
 * list leaves the parameters unspecified before C23. */
static size_t physgpr_count(void)
{
	return jit_v_num() + jit_r_num() - 3;
}

/* how many gpr slots are on the stack, i.e. by how much the number of gpr
 * slots in f exceeds the number of usable physical gprs (0 if it doesn't) */
static size_t gprloc_stack_count(struct ejit_func *f)
{
	size_t slots = gpr_stats_len(&f->gpr);
	size_t regs = physgpr_count();
	if (slots < regs)
		return 0;

	return slots - regs;
}

/* get physical register for ordered gpr slot: the V registers come first,
 * followed by the R registers starting from R3 */
static jit_gpr_t physgpr_at(size_t r)
{
	size_t callee_save = jit_v_num();
	if (r >= callee_save) {
		/* avoid R0 - R2 as they're reserved for tmp use */
		return jit_r(r - callee_save + 3);
	}

	return jit_v(r);
}

/* how many caller-save gprs (counted from R3 up) f has slots mapped to: none
 * while the slots fit into the V registers, all of them (minus R0-R2) once
 * slots spill to the stack */
static size_t caller_save_gprs(struct ejit_func *f)
{
	size_t slots = gpr_stats_len(&f->gpr);
	size_t callee_save = jit_v_num();

	if (slots >= physgpr_count())
		return jit_r_num() - 3;

	return slots > callee_save ? slots - callee_save : 0;
}

/* number of physical fprs that value slots can be mapped to: all VF registers
 * plus the F registers, minus the three (F0-F2) that the getloc/getfpr helpers
 * hand out as temporaries.
 *
 * Declared with (void) so that this is a real prototype; an empty parameter
 * list leaves the parameters unspecified before C23. */
static size_t physfpr_count(void)
{
	return jit_vf_num() + jit_f_num() - 3;
}

/* get physical register for ordered fpr slot: the VF registers come first,
 * followed by the F registers starting from F3 (F0 - F2 are skipped) */
static jit_fpr_t physfpr_at(size_t r)
{
	size_t callee_save = jit_vf_num();
	if (r >= callee_save)
		return jit_f(r - callee_save + 3);

	return jit_vf(r);
}

/* how many fpr slots are on the stack, i.e. by how much the number of fpr
 * slots in f exceeds the number of usable physical fprs (0 if it doesn't) */
static size_t fprloc_stack_count(struct ejit_func *f)
{
	size_t slots = fpr_stats_len(&f->fpr);
	size_t regs = physfpr_count();
	if (slots < regs)
		return 0;

	return slots - regs;
}

/* how many F registers (counted from F3 up) f has slots mapped to: none while
 * the slots fit into the VF registers, all of them (minus F0-F2) once slots
 * spill to the stack */
static size_t caller_save_fprs(struct ejit_func *f)
{
	size_t slots = fpr_stats_len(&f->fpr);
	size_t callee_save = jit_vf_num();

	if (slots >= physfpr_count())
		return jit_f_num() - 3;

	return slots > callee_save ? slots - callee_save : 0;
}

/* bytes of stack needed by f: one jit_uword_t per spilled gpr slot and per
 * saved caller-save gpr, one jit_float64_t per spilled fpr slot and per saved
 * caller-save fpr */
static size_t stack_size(struct ejit_func *f)
{
	size_t gpr_cells = gprloc_stack_count(f) + caller_save_gprs(f);
	size_t fpr_cells = fprloc_stack_count(f) + caller_save_fprs(f);

	return gpr_cells * sizeof(jit_uword_t)
	       + fpr_cells * sizeof(jit_float64_t);
}

/* stack offset of spilled gpr slot l; the first slot that doesn't fit into a
 * physical register sits at offset 0 */
static jit_off_t stack_loc(size_t l)
{
	size_t first_spilled = physgpr_count();
	assert(l >= first_spilled);

	return (l - first_spilled) * sizeof(jit_uword_t);
}

/* stack offset of spilled fpr slot l; the fpr area starts right after the
 * spilled gpr slots of f and uses jit_float64_t sized cells */
static jit_off_t stack_loc_f(struct ejit_func *f, size_t l)
{
	size_t first_spilled = physfpr_count();
	assert(l >= first_spilled);

	size_t fpr_area = gprloc_stack_count(f) * sizeof(jit_uword_t);
	return fpr_area + (l - first_spilled) * sizeof(jit_float64_t);
}

/* stack offset of the i'th caller-save gpr save cell; the save area follows
 * the spilled gpr and fpr slots of f */
static jit_off_t stack_loc_save_gpr(struct ejit_func *f, size_t i)
{
	size_t save_area = gprloc_stack_count(f) * sizeof(jit_uword_t)
		+ fprloc_stack_count(f) * sizeof(jit_float64_t);

	return save_area + i * sizeof(jit_uword_t);
}

/* stack offset of the i'th caller-save fpr save cell; these cells come last,
 * after the spilled gpr/fpr slots and the caller-save gpr save cells of f */
static jit_off_t stack_loc_save_fpr(struct ejit_func *f, size_t i)
{
	size_t spilled = gprloc_stack_count(f) * sizeof(jit_uword_t)
		+ fprloc_stack_count(f) * sizeof(jit_float64_t);
	size_t saved_gprs = caller_save_gprs(f) * sizeof(jit_uword_t);

	return spilled + saved_gprs + i * sizeof(jit_float64_t);
}

/* for now, just push all caller-save registers. Theoretically, we could fairly
 * easily keep track of ranges where registers are alive and skip ones that are
 * dead here, but I'm not sure how useful that would be without some form of
 * SSA, which is maybe pushing how complex I want this to become.
 *
 * Emits one store per caller-save gpr/fpr that f uses (R3 and F3 upwards) into
 * the save cells given by stack_loc_save_gpr()/stack_loc_save_fpr(). */
static void save_caller_save_regs(struct ejit_func *f, jit_state_t *j)
{
	size_t gprs = caller_save_gprs(f);
	size_t fprs = caller_save_fprs(f);

	for (size_t n = 0; n < gprs; ++n)
		jit_stxi(j, stack_loc_save_gpr(f, n), JIT_SP, jit_r(n + 3));

	for (size_t n = 0; n < fprs; ++n)
		jit_stxi_d(j, stack_loc_save_fpr(f, n), JIT_SP, jit_f(n + 3));
}

/* counterpart of save_caller_save_regs(): emits one load per caller-save
 * gpr/fpr that f uses (R3 and F3 upwards) from its save cell on the stack */
static void restore_caller_save_regs(struct ejit_func *f, jit_state_t *j)
{
	size_t gprs = caller_save_gprs(f);
	size_t fprs = caller_save_fprs(f);

	for (size_t n = 0; n < gprs; ++n)
		jit_ldxi(j, jit_r(n + 3), JIT_SP, stack_loc_save_gpr(f, n));

	for (size_t n = 0; n < fprs; ++n)
		jit_ldxi_d(j, jit_f(n + 3), JIT_SP, stack_loc_save_fpr(f, n));
}


/* get ordered slot register. If slot l is directly mapped to a physical
 * register, return it, otherwise emit a load from the stack into one of
 * R0-R2 (selected by i) and return that temporary */
static jit_gpr_t getloc(struct ejit_func *f, jit_state_t *j, size_t l, size_t i)
{
	assert(l < gpr_stats_len(&f->gpr));
	assert(i <= 2);

	size_t slot = gpr_stats_at(&f->gpr, l)->rno;
	if (slot >= physgpr_count()) {
		jit_gpr_t tmp = jit_r(i);
		jit_ldxi(j, tmp, JIT_SP, stack_loc(slot));
		return tmp;
	}

	return physgpr_at(slot);
}

/* float flavour of getloc(): return the physical fpr of slot l, or emit a
 * jit_ldxi_f from its stack cell into one of F0-F2 (selected by i) and return
 * that temporary */
static jit_fpr_t getloc_f(struct ejit_func *f, jit_state_t *j, size_t l,
                          size_t i)
{
	assert(l < fpr_stats_len(&f->fpr));
	assert(i <= 2);

	size_t slot = fpr_stats_at(&f->fpr, l)->fno;
	if (slot >= physfpr_count()) {
		jit_fpr_t tmp = jit_f(i);
		jit_ldxi_f(j, tmp, JIT_SP, stack_loc_f(f, slot));
		return tmp;
	}

	return physfpr_at(slot);
}

/* double flavour of getloc(): return the physical fpr of slot l, or emit a
 * jit_ldxi_d from its stack cell into one of F0-F2 (selected by i) and return
 * that temporary */
static jit_fpr_t getloc_d(struct ejit_func *f, jit_state_t *j, size_t l,
                          size_t i)
{
	assert(l < fpr_stats_len(&f->fpr));
	assert(i <= 2);

	size_t slot = fpr_stats_at(&f->fpr, l)->fno;
	if (slot >= physfpr_count()) {
		/* note that stack_loc_f assumes double, so floats technically
		 * take up more space than needed but at least we don't get any
		 * alignment issues */
		jit_fpr_t tmp = jit_f(i);
		jit_ldxi_d(j, tmp, JIT_SP, stack_loc_f(f, slot));
		return tmp;
	}

	return physfpr_at(slot);
}

/* get physical register for slot l. If l is already in a physical register,
 * return it, otherwise one of R0-R2 as given by i. Nothing is fetched from the
 * stack and no code is emitted, mainly used for preparing a destination
 * register. */
static jit_gpr_t getgpr(struct ejit_func *f, size_t l, size_t i)
{
	assert(l < gpr_stats_len(&f->gpr));
	assert(i <= 2);

	size_t slot = gpr_stats_at(&f->gpr, l)->rno;
	if (slot >= physgpr_count())
		return jit_r(i);

	return physgpr_at(slot);
}

/* fpr flavour of getgpr(): the physical fpr of slot l if it has one,
 * otherwise one of F0-F2 as given by i. No code is emitted. */
static jit_fpr_t getfpr(struct ejit_func *f, size_t l, size_t i)
{
	assert(l < fpr_stats_len(&f->fpr));
	assert(i <= 2);

	size_t slot = fpr_stats_at(&f->fpr, l)->fno;
	if (slot >= physfpr_count())
		return jit_f(i);

	return physfpr_at(slot);
}

/* make sure the value in r ends up in gpr slot l. For a spilled slot, emit a
 * store of r to the slot's stack cell; for a slot with a physical register
 * nothing is emitted and r is only checked to be that register. */
static void putloc(struct ejit_func *f, jit_state_t *j, size_t l, jit_gpr_t r)
{
	assert(l < gpr_stats_len(&f->gpr));

	size_t slot = gpr_stats_at(&f->gpr, l)->rno;
	if (slot >= physgpr_count()) {
		jit_stxi(j, stack_loc(slot), JIT_SP, r);
		return;
	}

	assert(physgpr_at(slot).regno == r.regno);
}

/* float flavour of putloc(): for a spilled fpr slot l, emit a jit_stxi_f of r
 * to the slot's stack cell; otherwise only check that r is the slot's
 * physical register */
static void putloc_f(struct ejit_func *f, jit_state_t *j, size_t l, jit_fpr_t r)
{
	assert(l < fpr_stats_len(&f->fpr));

	size_t slot = fpr_stats_at(&f->fpr, l)->fno;
	if (slot >= physfpr_count()) {
		jit_stxi_f(j, stack_loc_f(f, slot), JIT_SP, r);
		return;
	}

	assert(physfpr_at(slot).regno == r.regno);
}

/* double flavour of putloc(): for a spilled fpr slot l, emit a jit_stxi_d of r
 * to the slot's stack cell; otherwise only check that r is the slot's
 * physical register */
static void putloc_d(struct ejit_func *f, jit_state_t *j, size_t l, jit_fpr_t r)
{
	assert(l < fpr_stats_len(&f->fpr));

	size_t slot = fpr_stats_at(&f->fpr, l)->fno;
	if (slot >= physfpr_count()) {
		jit_stxi_d(j, stack_loc_f(f, slot), JIT_SP, r);
		return;
	}

	assert(physfpr_at(slot).regno == r.regno);
}

/* record the current code address of j as entry ii of addrs */
static void compile_label(jit_state_t *j, size_t ii, struct addrs *addrs)
{
	jit_addr_t *entry = addrs_at(addrs, ii);
	*entry = jit_address(j);
}

/* emit jit_movi of the immediate i.o into slot i.r0; a spilled slot is staged
 * in R0 and written back by putloc() */
static void compile_movi(struct ejit_func *f, jit_state_t *j,
                         struct ejit_insn i)
{
	jit_gpr_t dst = getgpr(f, i.r0, 0);

	jit_movi(j, dst, i.o);
	putloc(f, j, i.r0, dst);
}

/* emit jit_movi_f of the immediate i.f into fpr slot i.r0; a spilled slot is
 * staged in F0 and written back by putloc_f() */
static void compile_movi_f(struct ejit_func *f, jit_state_t *j,
                           struct ejit_insn i)
{
	jit_fpr_t dst = getfpr(f, i.r0, 0);

	jit_movi_f(j, dst, i.f);
	putloc_f(f, j, i.r0, dst);
}

/* emit jit_movi_d of the immediate i.d into fpr slot i.r0; a spilled slot is
 * staged in F0 and written back by putloc_d() */
static void compile_movi_d(struct ejit_func *f, jit_state_t *j,
                           struct ejit_insn i)
{
	jit_fpr_t dst = getfpr(f, i.r0, 0);

	jit_movi_d(j, dst, i.d);
	putloc_d(f, j, i.r0, dst);
}

/* emit jit_movr from slot i.r1 to slot i.r0; spilled slots go through R1
 * (source) and R0 (destination) */
static void compile_movr(struct ejit_func *f, jit_state_t *j,
                         struct ejit_insn i)
{
	jit_gpr_t dst = getgpr(f, i.r0, 0);
	jit_gpr_t src = getloc(f, j, i.r1, 1);

	jit_movr(j, dst, src);
	putloc(f, j, i.r0, dst);
}

/* emit jit_movr_f from fpr slot i.r1 to fpr slot i.r0; spilled slots go
 * through F1 (source) and F0 (destination) */
static void compile_movr_f(struct ejit_func *f, jit_state_t *j,
                           struct ejit_insn i)
{
	jit_fpr_t dst = getfpr(f, i.r0, 0);
	jit_fpr_t src = getloc_f(f, j, i.r1, 1);

	jit_movr_f(j, dst, src);
	putloc_f(f, j, i.r0, dst);
}

/* emit jit_movr_d from fpr slot i.r1 to fpr slot i.r0; spilled slots go
 * through F1 (source) and F0 (destination) */
static void compile_movr_d(struct ejit_func *f, jit_state_t *j,
                           struct ejit_insn i)
{
	jit_fpr_t dst = getfpr(f, i.r0, 0);
	jit_fpr_t src = getloc_d(f, j, i.r1, 1);

	jit_movr_d(j, dst, src);
	putloc_d(f, j, i.r0, dst);
}

/* emit jit_addr on slots i.r1 and i.r2 with the result going to slot i.r0;
 * spilled operands are loaded into R1/R2, a spilled result passes through R0 */
static void compile_addr(struct ejit_func *f, jit_state_t *j,
                         struct ejit_insn i)
{
	jit_gpr_t out = getgpr(f, i.r0, 0);
	jit_gpr_t lhs = getloc(f, j, i.r1, 1);
	jit_gpr_t rhs = getloc(f, j, i.r2, 2);

	jit_addr(j, out, lhs, rhs);
	putloc(f, j, i.r0, out);
}

/* emit jit_addi on slot i.r1 and the immediate i.o with the result going to
 * slot i.r0 */
static void compile_addi(struct ejit_func *f, jit_state_t *j,
                         struct ejit_insn i)
{
	jit_gpr_t out = getgpr(f, i.r0, 0);
	jit_gpr_t lhs = getloc(f, j, i.r1, 1);

	jit_addi(j, out, lhs, i.o);
	putloc(f, j, i.r0, out);
}

/* emit jit_subr on slots i.r1 and i.r2 with the result going to slot i.r0 */
static void compile_subr(struct ejit_func *f, jit_state_t *j,
                         struct ejit_insn i)
{
	jit_gpr_t out = getgpr(f, i.r0, 0);
	jit_gpr_t lhs = getloc(f, j, i.r1, 1);
	jit_gpr_t rhs = getloc(f, j, i.r2, 2);

	jit_subr(j, out, lhs, rhs);
	putloc(f, j, i.r0, out);
}

/* emit jit_subi on slot i.r1 and the immediate i.o with the result going to
 * slot i.r0 */
static void compile_subi(struct ejit_func *f, jit_state_t *j,
                         struct ejit_insn i)
{
	jit_gpr_t out = getgpr(f, i.r0, 0);
	jit_gpr_t lhs = getloc(f, j, i.r1, 1);

	jit_subi(j, out, lhs, i.o);
	putloc(f, j, i.r0, out);
}

/* emit jit_subr_f on fpr slots i.r1 and i.r2 with the result going to fpr
 * slot i.r0 */
static void compile_subr_f(struct ejit_func *f, jit_state_t *j,
                           struct ejit_insn i)
{
	jit_fpr_t dst = getfpr(f, i.r0, 0);
	jit_fpr_t lhs = getloc_f(f, j, i.r1, 1);
	jit_fpr_t rhs = getloc_f(f, j, i.r2, 2);

	jit_subr_f(j, dst, lhs, rhs);
	putloc_f(f, j, i.r0, dst);
}

/* emit jit_subr_d on fpr slots i.r1 and i.r2 with the result going to fpr
 * slot i.r0 */
static void compile_subr_d(struct ejit_func *f, jit_state_t *j,
                           struct ejit_insn i)
{
	jit_fpr_t dst = getfpr(f, i.r0, 0);
	jit_fpr_t lhs = getloc_d(f, j, i.r1, 1);
	jit_fpr_t rhs = getloc_d(f, j, i.r2, 2);

	jit_subr_d(j, dst, lhs, rhs);
	putloc_d(f, j, i.r0, dst);
}

/* emit jit_mulr on slots i.r1 and i.r2 with the result going to slot i.r0 */
static void compile_mulr(struct ejit_func *f, jit_state_t *j,
                         struct ejit_insn i)
{
	jit_gpr_t dst = getgpr(f, i.r0, 0);
	jit_gpr_t lhs = getloc(f, j, i.r1, 1);
	jit_gpr_t rhs = getloc(f, j, i.r2, 2);

	jit_mulr(j, dst, lhs, rhs);
	putloc(f, j, i.r0, dst);
}

/* emit jit_mulr_f on fpr slots i.r1 and i.r2 with the result going to fpr
 * slot i.r0 */
static void compile_mulr_f(struct ejit_func *f, jit_state_t *j,
                           struct ejit_insn i)
{
	jit_fpr_t dst = getfpr(f, i.r0, 0);
	jit_fpr_t lhs = getloc_f(f, j, i.r1, 1);
	jit_fpr_t rhs = getloc_f(f, j, i.r2, 2);

	jit_mulr_f(j, dst, lhs, rhs);
	putloc_f(f, j, i.r0, dst);
}

/* emit jit_mulr_d on fpr slots i.r1 and i.r2 with the result going to fpr
 * slot i.r0 */
static void compile_mulr_d(struct ejit_func *f, jit_state_t *j,
                           struct ejit_insn i)
{
	jit_fpr_t dst = getfpr(f, i.r0, 0);
	jit_fpr_t lhs = getloc_d(f, j, i.r1, 1);
	jit_fpr_t rhs = getloc_d(f, j, i.r2, 2);

	jit_mulr_d(j, dst, lhs, rhs);
	putloc_d(f, j, i.r0, dst);
}

/* emit jit_andi on slot i.r1 and the immediate i.o with the result going to
 * slot i.r0 */
static void compile_andi(struct ejit_func *f, jit_state_t *j,
                         struct ejit_insn i)
{
	jit_gpr_t dst = getgpr(f, i.r0, 0);
	jit_gpr_t src = getloc(f, j, i.r1, 1);

	jit_andi(j, dst, src, i.o);
	putloc(f, j, i.r0, dst);
}

/* emit jit_andr on slots i.r1 and i.r2 with the result going to slot i.r0 */
static void compile_andr(struct ejit_func *f, jit_state_t *j,
                         struct ejit_insn i)
{
	jit_gpr_t dst = getgpr(f, i.r0, 0);
	jit_gpr_t lhs = getloc(f, j, i.r1, 1);
	jit_gpr_t rhs = getloc(f, j, i.r2, 2);

	jit_andr(j, dst, lhs, rhs);
	putloc(f, j, i.r0, dst);
}

/* emit jit_ori on slot i.r1 and the immediate i.o with the result going to
 * slot i.r0 */
static void compile_ori(struct ejit_func *f, jit_state_t *j,
                        struct ejit_insn i)
{
	jit_gpr_t dst = getgpr(f, i.r0, 0);
	jit_gpr_t src = getloc(f, j, i.r1, 1);

	jit_ori(j, dst, src, i.o);
	putloc(f, j, i.r0, dst);
}

/* emit jit_orr on slots i.r1 and i.r2 with the result going to slot i.r0 */
static void compile_orr(struct ejit_func *f, jit_state_t *j,
                        struct ejit_insn i)
{
	jit_gpr_t dst = getgpr(f, i.r0, 0);
	jit_gpr_t lhs = getloc(f, j, i.r1, 1);
	jit_gpr_t rhs = getloc(f, j, i.r2, 2);

	jit_orr(j, dst, lhs, rhs);
	putloc(f, j, i.r0, dst);
}

/* emit jit_xori on slot i.r1 and the immediate i.o with the result going to
 * slot i.r0 */
static void compile_xori(struct ejit_func *f, jit_state_t *j,
                         struct ejit_insn i)
{
	jit_gpr_t dst = getgpr(f, i.r0, 0);
	jit_gpr_t src = getloc(f, j, i.r1, 1);

	jit_xori(j, dst, src, i.o);
	putloc(f, j, i.r0, dst);
}

/* emit jit_xorr on slots i.r1 and i.r2 with the result going to slot i.r0 */
static void compile_xorr(struct ejit_func *f, jit_state_t *j,
                         struct ejit_insn i)
{
	jit_gpr_t dst = getgpr(f, i.r0, 0);
	jit_gpr_t lhs = getloc(f, j, i.r1, 1);
	jit_gpr_t rhs = getloc(f, j, i.r2, 2);

	jit_xorr(j, dst, lhs, rhs);
	putloc(f, j, i.r0, dst);
}

/* emit jit_divr on slots i.r1 and i.r2 with the result going to slot i.r0 */
static void compile_divr(struct ejit_func *f, jit_state_t *j,
                         struct ejit_insn i)
{
	jit_gpr_t dst = getgpr(f, i.r0, 0);
	jit_gpr_t lhs = getloc(f, j, i.r1, 1);
	jit_gpr_t rhs = getloc(f, j, i.r2, 2);

	jit_divr(j, dst, lhs, rhs);
	putloc(f, j, i.r0, dst);
}

/* emit jit_divr_u on slots i.r1 and i.r2 with the result going to slot i.r0 */
static void compile_divr_u(struct ejit_func *f, jit_state_t *j,
                           struct ejit_insn i)
{
	jit_gpr_t dst = getgpr(f, i.r0, 0);
	jit_gpr_t lhs = getloc(f, j, i.r1, 1);
	jit_gpr_t rhs = getloc(f, j, i.r2, 2);

	jit_divr_u(j, dst, lhs, rhs);
	putloc(f, j, i.r0, dst);
}

/* emit jit_divr_d on fpr slots i.r1 and i.r2 with the result going to fpr
 * slot i.r0 */
static void compile_divr_d(struct ejit_func *f, jit_state_t *j,
                           struct ejit_insn i)
{
	jit_fpr_t dst = getfpr(f, i.r0, 0);
	jit_fpr_t lhs = getloc_d(f, j, i.r1, 1);
	jit_fpr_t rhs = getloc_d(f, j, i.r2, 2);

	jit_divr_d(j, dst, lhs, rhs);
	putloc_d(f, j, i.r0, dst);
}

/* emit jit_divr_f on fpr slots i.r1 and i.r2 with the result going to fpr
 * slot i.r0 */
static void compile_divr_f(struct ejit_func *f, jit_state_t *j,
                           struct ejit_insn i)
{
	jit_fpr_t dst = getfpr(f, i.r0, 0);
	jit_fpr_t lhs = getloc_f(f, j, i.r1, 1);
	jit_fpr_t rhs = getloc_f(f, j, i.r2, 2);

	jit_divr_f(j, dst, lhs, rhs);
	putloc_f(f, j, i.r0, dst);
}

/* emit jit_remr on slots i.r1 and i.r2 with the result going to slot i.r0 */
static void compile_remr(struct ejit_func *f, jit_state_t *j,
                         struct ejit_insn i)
{
	jit_gpr_t dst = getgpr(f, i.r0, 0);
	jit_gpr_t lhs = getloc(f, j, i.r1, 1);
	jit_gpr_t rhs = getloc(f, j, i.r2, 2);

	jit_remr(j, dst, lhs, rhs);
	putloc(f, j, i.r0, dst);
}

/* emit jit_remr_u on slots i.r1 and i.r2 with the result going to slot i.r0 */
static void compile_remr_u(struct ejit_func *f, jit_state_t *j,
                           struct ejit_insn i)
{
	jit_gpr_t dst = getgpr(f, i.r0, 0);
	jit_gpr_t lhs = getloc(f, j, i.r1, 1);
	jit_gpr_t rhs = getloc(f, j, i.r2, 2);

	jit_remr_u(j, dst, lhs, rhs);
	putloc(f, j, i.r0, dst);
}

/* emit jit_absr_f on fpr slot i.r1 with the result going to fpr slot i.r0 */
static void compile_absr_f(struct ejit_func *f, jit_state_t *j,
                           struct ejit_insn i)
{
	jit_fpr_t dst = getfpr(f, i.r0, 0);
	jit_fpr_t src = getloc_f(f, j, i.r1, 1);

	jit_absr_f(j, dst, src);
	putloc_f(f, j, i.r0, dst);
}

/* emit jit_absr_d on fpr slot i.r1 with the result going to fpr slot i.r0 */
static void compile_absr_d(struct ejit_func *f, jit_state_t *j,
                           struct ejit_insn i)
{
	jit_fpr_t dst = getfpr(f, i.r0, 0);
	jit_fpr_t src = getloc_d(f, j, i.r1, 1);

	jit_absr_d(j, dst, src);
	putloc_d(f, j, i.r0, dst);
}

/* emit jit_addr_f on fpr slots i.r1 and i.r2 with the result going to fpr
 * slot i.r0 */
static void compile_addr_f(struct ejit_func *f, jit_state_t *j,
                           struct ejit_insn i)
{
	jit_fpr_t dst = getfpr(f, i.r0, 0);
	jit_fpr_t lhs = getloc_f(f, j, i.r1, 1);
	jit_fpr_t rhs = getloc_f(f, j, i.r2, 2);

	jit_addr_f(j, dst, lhs, rhs);
	putloc_f(f, j, i.r0, dst);
}

/* emit jit_addr_d on fpr slots i.r1 and i.r2 with the result going to fpr
 * slot i.r0 */
static void compile_addr_d(struct ejit_func *f, jit_state_t *j,
                           struct ejit_insn i)
{
	jit_fpr_t dst = getfpr(f, i.r0, 0);
	jit_fpr_t lhs = getloc_d(f, j, i.r1, 1);
	jit_fpr_t rhs = getloc_d(f, j, i.r2, 2);

	jit_addr_d(j, dst, lhs, rhs);
	putloc_d(f, j, i.r0, dst);
}

/* emit jit_lshi on slot i.r1 and the immediate i.o with the result going to
 * slot i.r0 */
static void compile_lshi(struct ejit_func *f, jit_state_t *j,
                         struct ejit_insn i)
{
	jit_gpr_t dst = getgpr(f, i.r0, 0);
	jit_gpr_t src = getloc(f, j, i.r1, 1);

	jit_lshi(j, dst, src, i.o);
	putloc(f, j, i.r0, dst);
}

/* emit jit_lshr on slots i.r1 and i.r2 with the result going to slot i.r0 */
static void compile_lshr(struct ejit_func *f, jit_state_t *j,
                         struct ejit_insn i)
{
	jit_gpr_t dst = getgpr(f, i.r0, 0);
	jit_gpr_t lhs = getloc(f, j, i.r1, 1);
	jit_gpr_t rhs = getloc(f, j, i.r2, 2);

	jit_lshr(j, dst, lhs, rhs);
	putloc(f, j, i.r0, dst);
}

/* emit jit_rshi on slot i.r1 and the immediate i.o with the result going to
 * slot i.r0 */
static void compile_rshi(struct ejit_func *f, jit_state_t *j,
                         struct ejit_insn i)
{
	jit_gpr_t dst = getgpr(f, i.r0, 0);
	jit_gpr_t src = getloc(f, j, i.r1, 1);

	jit_rshi(j, dst, src, i.o);
	putloc(f, j, i.r0, dst);
}

/* emit jit_rshi_u on slot i.r1 and the immediate i.o with the result going to
 * slot i.r0 */
static void compile_rshi_u(struct ejit_func *f, jit_state_t *j,
                           struct ejit_insn i)
{
	jit_gpr_t dst = getgpr(f, i.r0, 0);
	jit_gpr_t src = getloc(f, j, i.r1, 1);

	jit_rshi_u(j, dst, src, i.o);
	putloc(f, j, i.r0, dst);
}

/* emit jit_rshr on slots i.r1 and i.r2 with the result going to slot i.r0 */
static void compile_rshr(struct ejit_func *f, jit_state_t *j,
                         struct ejit_insn i)
{
	jit_gpr_t dst = getgpr(f, i.r0, 0);
	jit_gpr_t lhs = getloc(f, j, i.r1, 1);
	jit_gpr_t rhs = getloc(f, j, i.r2, 2);

	jit_rshr(j, dst, lhs, rhs);
	putloc(f, j, i.r0, dst);
}

/* emit jit_rshr_u on slots i.r1 and i.r2 with the result going to slot i.r0 */
static void compile_rshr_u(struct ejit_func *f, jit_state_t *j,
                           struct ejit_insn i)
{
	jit_gpr_t dst = getgpr(f, i.r0, 0);
	jit_gpr_t lhs = getloc(f, j, i.r1, 1);
	jit_gpr_t rhs = getloc(f, j, i.r2, 2);

	jit_rshr_u(j, dst, lhs, rhs);
	putloc(f, j, i.r0, dst);
}

/* emit jit_sti_c of slot i.r0 to the address i.p; a spilled slot is first
 * loaded into R0 */
static void compile_sti8(struct ejit_func *f, jit_state_t *j,
                         struct ejit_insn i)
{
	jit_gpr_t val = getloc(f, j, i.r0, 0);

	jit_sti_c(j, i.p, val);
}

/* emit jit_sti_s of slot i.r0 to the address i.p; a spilled slot is first
 * loaded into R0 */
static void compile_sti16(struct ejit_func *f, jit_state_t *j,
                          struct ejit_insn i)
{
	jit_gpr_t val = getloc(f, j, i.r0, 0);

	jit_sti_s(j, i.p, val);
}

/* emit jit_sti_i of slot i.r0 to the address i.p; a spilled slot is first
 * loaded into R0 */
static void compile_sti32(struct ejit_func *f, jit_state_t *j,
                          struct ejit_insn i)
{
	jit_gpr_t val = getloc(f, j, i.r0, 0);

	jit_sti_i(j, i.p, val);
}

/* emit jit_sti_l of slot i.r0 to the address i.p. Only available when
 * __WORDSIZE is 64, otherwise reaching this function trips an assert. */
static void compile_sti64(struct ejit_func *f, jit_state_t *j,
                          struct ejit_insn i)
{
#if __WORDSIZE == 64
	jit_gpr_t val = getloc(f, j, i.r0, 0);

	jit_sti_l(j, i.p, val);
#else
	/* nothing can be emitted here, just keep the parameters 'used' */
	(void)f;
	(void)j;
	(void)i;
	assert(0 && "trying to compile sti64 on 32bit arch");
#endif
}

/* emit jit_sti_f of fpr slot i.r0 to the address i.p; a spilled slot is first
 * loaded into F0 */
static void compile_stif(struct ejit_func *f, jit_state_t *j,
                         struct ejit_insn i)
{
	jit_fpr_t val = getloc_f(f, j, i.r0, 0);

	jit_sti_f(j, i.p, val);
}

/* emit jit_sti_d of fpr slot i.r0 to the address i.p; a spilled slot is first
 * loaded into F0 */
static void compile_stid(struct ejit_func *f, jit_state_t *j,
                         struct ejit_insn i)
{
	jit_fpr_t val = getloc_d(f, j, i.r0, 0);

	jit_sti_d(j, i.p, val);
}

/* emit jit_stxi_c storing slot i.r0, with slot i.r1 and the immediate i.o as
 * the address operands */
static void compile_stxi8(struct ejit_func *f, jit_state_t *j,
                          struct ejit_insn i)
{
	jit_gpr_t val = getloc(f, j, i.r0, 0);
	jit_gpr_t base = getloc(f, j, i.r1, 1);

	jit_stxi_c(j, i.o, base, val);
}

/* emit jit_stxi_s storing slot i.r0, with slot i.r1 and the immediate i.o as
 * the address operands */
static void compile_stxi16(struct ejit_func *f, jit_state_t *j,
                           struct ejit_insn i)
{
	jit_gpr_t val = getloc(f, j, i.r0, 0);
	jit_gpr_t base = getloc(f, j, i.r1, 1);

	jit_stxi_s(j, i.o, base, val);
}

/* emit jit_stxi_i storing slot i.r0, with slot i.r1 and the immediate i.o as
 * the address operands */
static void compile_stxi32(struct ejit_func *f, jit_state_t *j,
                           struct ejit_insn i)
{
	jit_gpr_t val = getloc(f, j, i.r0, 0);
	jit_gpr_t base = getloc(f, j, i.r1, 1);

	jit_stxi_i(j, i.o, base, val);
}

/* emit jit_stxi_l storing slot i.r0, with slot i.r1 and the immediate i.o as
 * the address operands. Only available when __WORDSIZE is 64, otherwise
 * reaching this function trips an assert. */
static void compile_stxi64(struct ejit_func *f, jit_state_t *j,
                           struct ejit_insn i)
{
#if __WORDSIZE == 64
	jit_gpr_t val = getloc(f, j, i.r0, 0);
	jit_gpr_t base = getloc(f, j, i.r1, 1);

	jit_stxi_l(j, i.o, base, val);
#else
	/* nothing can be emitted here, just keep the parameters 'used' */
	(void)f;
	(void)j;
	(void)i;
	assert(0 && "trying to compile stxi64 on 32bit arch");
#endif
}

/* emit jit_stxi_f storing fpr slot i.r0, with gpr slot i.r1 and the immediate
 * i.o as the address operands */
static void compile_stxif(struct ejit_func *f, jit_state_t *j,
                          struct ejit_insn i)
{
	jit_fpr_t val = getloc_f(f, j, i.r0, 0);
	jit_gpr_t base = getloc(f, j, i.r1, 1);

	jit_stxi_f(j, i.o, base, val);
}

/* emit jit_stxi_d storing fpr slot i.r0, with gpr slot i.r1 and the immediate
 * i.o as the address operands */
static void compile_stxid(struct ejit_func *f, jit_state_t *j,
                          struct ejit_insn i)
{
	jit_fpr_t val = getloc_d(f, j, i.r0, 0);
	jit_gpr_t base = getloc(f, j, i.r1, 1);

	jit_stxi_d(j, i.o, base, val);
}

/* emit jit_stxr_c storing slot i.r0, with slots i.r1 and i.r2 as the address
 * operands; spilled slots are loaded into R0/R1/R2 respectively */
static void compile_stxr8(struct ejit_func *f, jit_state_t *j,
                          struct ejit_insn i)
{
	jit_gpr_t val = getloc(f, j, i.r0, 0);
	jit_gpr_t base = getloc(f, j, i.r1, 1);
	jit_gpr_t index = getloc(f, j, i.r2, 2);

	jit_stxr_c(j, index, base, val);
}

/* emit jit_stxr_s storing slot i.r0, with slots i.r1 and i.r2 as the address
 * operands; spilled slots are loaded into R0/R1/R2 respectively */
static void compile_stxr16(struct ejit_func *f, jit_state_t *j,
                           struct ejit_insn i)
{
	jit_gpr_t val = getloc(f, j, i.r0, 0);
	jit_gpr_t base = getloc(f, j, i.r1, 1);
	jit_gpr_t index = getloc(f, j, i.r2, 2);

	jit_stxr_s(j, index, base, val);
}

/* emit jit_stxr_i storing slot i.r0, with slots i.r1 and i.r2 as the address
 * operands; spilled slots are loaded into R0/R1/R2 respectively */
static void compile_stxr32(struct ejit_func *f, jit_state_t *j,
                           struct ejit_insn i)
{
	jit_gpr_t val = getloc(f, j, i.r0, 0);
	jit_gpr_t base = getloc(f, j, i.r1, 1);
	jit_gpr_t index = getloc(f, j, i.r2, 2);

	jit_stxr_i(j, index, base, val);
}

/* emit jit_stxr_l storing slot i.r0, with slots i.r1 and i.r2 as the address
 * operands. Only available when __WORDSIZE is 64, otherwise reaching this
 * function trips an assert. */
static void compile_stxr64(struct ejit_func *f, jit_state_t *j,
                           struct ejit_insn i)
{
#if __WORDSIZE == 64
	jit_gpr_t val = getloc(f, j, i.r0, 0);
	jit_gpr_t base = getloc(f, j, i.r1, 1);
	jit_gpr_t index = getloc(f, j, i.r2, 2);

	jit_stxr_l(j, index, base, val);
#else
	/* nothing can be emitted here, just keep the parameters 'used' */
	(void)f;
	(void)j;
	(void)i;
	assert(0 && "trying to compile stxr64 on 32bit arch");
#endif
}

/* emit jit_stxr_f storing fpr slot i.r0, with gpr slots i.r1 and i.r2 as the
 * address operands */
static void compile_stxrf(struct ejit_func *f, jit_state_t *j,
                          struct ejit_insn i)
{
	jit_fpr_t val = getloc_f(f, j, i.r0, 0);
	jit_gpr_t base = getloc(f, j, i.r1, 1);
	jit_gpr_t index = getloc(f, j, i.r2, 2);

	jit_stxr_f(j, index, base, val);
}

/* emit jit_stxr_d storing fpr slot i.r0, with gpr slots i.r1 and i.r2 as the
 * address operands */
static void compile_stxrd(struct ejit_func *f, jit_state_t *j,
                          struct ejit_insn i)
{
	jit_fpr_t val = getloc_d(f, j, i.r0, 0);
	jit_gpr_t base = getloc(f, j, i.r1, 1);
	jit_gpr_t index = getloc(f, j, i.r2, 2);

	jit_stxr_d(j, index, base, val);
}

/* emit jit_ldi_uc from the address i.p into slot i.r0; a spilled slot is
 * staged in R0 and written back by putloc() */
static void compile_ldiu8(struct ejit_func *f, jit_state_t *j,
                          struct ejit_insn i)
{
	jit_gpr_t dst = getgpr(f, i.r0, 0);

	jit_ldi_uc(j, dst, i.p);
	putloc(f, j, i.r0, dst);
}

/* emit jit_ldi_us from the address i.p into slot i.r0; a spilled slot is
 * staged in R0 and written back by putloc() */
static void compile_ldiu16(struct ejit_func *f, jit_state_t *j,
                           struct ejit_insn i)
{
	jit_gpr_t dst = getgpr(f, i.r0, 0);

	jit_ldi_us(j, dst, i.p);
	putloc(f, j, i.r0, dst);
}

/* emit jit_ldi_ui from the address i.p into slot i.r0. Only available when
 * __WORDSIZE is 64, otherwise reaching this function trips an assert. */
static void compile_ldiu32(struct ejit_func *f, jit_state_t *j,
                           struct ejit_insn i)
{
#if __WORDSIZE == 64
	jit_gpr_t dst = getgpr(f, i.r0, 0);

	jit_ldi_ui(j, dst, i.p);
	putloc(f, j, i.r0, dst);
#else
	/* nothing can be emitted here, just keep the parameters 'used' */
	(void)f;
	(void)j;
	(void)i;
	assert(0 && "trying to compile ldiu32 on 32bit arch");
#endif
}

/* emit jit_ldi_l from the address i.p into slot i.r0 (the same lightening
 * call that compile_ldi64() uses). Only available when __WORDSIZE is 64,
 * otherwise reaching this function trips an assert. */
static void compile_ldiu64(struct ejit_func *f, jit_state_t *j,
                           struct ejit_insn i)
{
#if __WORDSIZE == 64
	jit_gpr_t dst = getgpr(f, i.r0, 0);

	jit_ldi_l(j, dst, i.p);
	putloc(f, j, i.r0, dst);
#else
	/* nothing can be emitted here, just keep the parameters 'used' */
	(void)f;
	(void)j;
	(void)i;
	assert(0 && "trying to compile ldiu64 on 32bit arch");
#endif
}

/* emit jit_ldi_f from the address i.p into fpr slot i.r0; a spilled slot is
 * staged in F0 and written back by putloc_f() */
static void compile_ldif(struct ejit_func *f, jit_state_t *j,
                         struct ejit_insn i)
{
	jit_fpr_t dst = getfpr(f, i.r0, 0);

	jit_ldi_f(j, dst, i.p);
	putloc_f(f, j, i.r0, dst);
}

/* emit jit_ldi_d from the address i.p into fpr slot i.r0; a spilled slot is
 * staged in F0 and written back by putloc_d() */
static void compile_ldid(struct ejit_func *f, jit_state_t *j,
                         struct ejit_insn i)
{
	jit_fpr_t dst = getfpr(f, i.r0, 0);

	jit_ldi_d(j, dst, i.p);
	putloc_d(f, j, i.r0, dst);
}

/* emit jit_ldi_c from the address i.p into slot i.r0; a spilled slot is
 * staged in R0 and written back by putloc() */
static void compile_ldi8(struct ejit_func *f, jit_state_t *j,
                         struct ejit_insn i)
{
	jit_gpr_t dst = getgpr(f, i.r0, 0);

	jit_ldi_c(j, dst, i.p);
	putloc(f, j, i.r0, dst);
}

/* emit jit_ldi_s from the address i.p into slot i.r0; a spilled slot is
 * staged in R0 and written back by putloc() */
static void compile_ldi16(struct ejit_func *f, jit_state_t *j,
                          struct ejit_insn i)
{
	jit_gpr_t dst = getgpr(f, i.r0, 0);

	jit_ldi_s(j, dst, i.p);
	putloc(f, j, i.r0, dst);
}

/* emit jit_ldi_i from the address i.p into slot i.r0; a spilled slot is
 * staged in R0 and written back by putloc() */
static void compile_ldi32(struct ejit_func *f, jit_state_t *j,
                          struct ejit_insn i)
{
	jit_gpr_t dst = getgpr(f, i.r0, 0);

	jit_ldi_i(j, dst, i.p);
	putloc(f, j, i.r0, dst);
}

/* emit jit_ldi_l from the address i.p into slot i.r0. Only available when
 * __WORDSIZE is 64, otherwise reaching this function trips an assert. */
static void compile_ldi64(struct ejit_func *f, jit_state_t *j,
                          struct ejit_insn i)
{
#if __WORDSIZE == 64
	jit_gpr_t dst = getgpr(f, i.r0, 0);

	jit_ldi_l(j, dst, i.p);
	putloc(f, j, i.r0, dst);
#else
	/* nothing can be emitted here, just keep the parameters 'used' */
	(void)f;
	(void)j;
	(void)i;
	assert(0 && "trying to compile ldi64 on 32bit arch");
#endif
}

/* emit jit_ldxi_uc into slot i.r0, with slot i.r1 and the immediate i.o as
 * the address operands */
static void compile_ldxiu8(struct ejit_func *f, jit_state_t *j,
                           struct ejit_insn i)
{
	jit_gpr_t dst = getgpr(f, i.r0, 0);
	jit_gpr_t base = getloc(f, j, i.r1, 1);

	jit_ldxi_uc(j, dst, base, i.o);
	putloc(f, j, i.r0, dst);
}

/* emit jit_ldxi_us into slot i.r0, with slot i.r1 and the immediate i.o as
 * the address operands */
static void compile_ldxiu16(struct ejit_func *f, jit_state_t *j,
                            struct ejit_insn i)
{
	jit_gpr_t dst = getgpr(f, i.r0, 0);
	jit_gpr_t base = getloc(f, j, i.r1, 1);

	jit_ldxi_us(j, dst, base, i.o);
	putloc(f, j, i.r0, dst);
}

/* emit jit_ldxi_ui into slot i.r0, with slot i.r1 and the immediate i.o as
 * the address operands. Only available when __WORDSIZE is 64, otherwise
 * reaching this function trips an assert. */
static void compile_ldxiu32(struct ejit_func *f, jit_state_t *j,
                            struct ejit_insn i)
{
#if __WORDSIZE == 64
	jit_gpr_t dst = getgpr(f, i.r0, 0);
	jit_gpr_t base = getloc(f, j, i.r1, 1);

	jit_ldxi_ui(j, dst, base, i.o);
	putloc(f, j, i.r0, dst);
#else
	/* nothing can be emitted here, just keep the parameters 'used' */
	(void)f;
	(void)j;
	(void)i;
	assert(0 && "trying to compile ldxiu32 on 32bit arch");
#endif
}

/* emit jit_ldxi_l into slot i.r0, with slot i.r1 and the immediate i.o as the
 * address operands (the same lightening call that compile_ldxi64() uses).
 * Only available when __WORDSIZE is 64, otherwise reaching this function
 * trips an assert. */
static void compile_ldxiu64(struct ejit_func *f, jit_state_t *j,
                            struct ejit_insn i)
{
#if __WORDSIZE == 64
	jit_gpr_t dst = getgpr(f, i.r0, 0);
	jit_gpr_t base = getloc(f, j, i.r1, 1);

	jit_ldxi_l(j, dst, base, i.o);
	putloc(f, j, i.r0, dst);
#else
	/* nothing can be emitted here, just keep the parameters 'used' */
	(void)f;
	(void)j;
	(void)i;
	assert(0 && "trying to compile ldxiu64 on 32bit arch");
#endif
}

/* emit jit_ldxi_f into fpr slot i.r0, with gpr slot i.r1 and the immediate
 * i.o as the address operands */
static void compile_ldxif(struct ejit_func *f, jit_state_t *j,
                          struct ejit_insn i)
{
	jit_fpr_t dst = getfpr(f, i.r0, 0);
	jit_gpr_t base = getloc(f, j, i.r1, 1);

	jit_ldxi_f(j, dst, base, i.o);
	putloc_f(f, j, i.r0, dst);
}

/* emit jit_ldxi_d into fpr slot i.r0, with gpr slot i.r1 and the immediate
 * i.o as the address operands */
static void compile_ldxid(struct ejit_func *f, jit_state_t *j,
                          struct ejit_insn i)
{
	jit_fpr_t dst = getfpr(f, i.r0, 0);
	jit_gpr_t base = getloc(f, j, i.r1, 1);

	jit_ldxi_d(j, dst, base, i.o);
	putloc_d(f, j, i.r0, dst);
}

/* emit jit_ldxi_c into slot i.r0, with slot i.r1 and the immediate i.o as the
 * address operands */
static void compile_ldxi8(struct ejit_func *f, jit_state_t *j,
                          struct ejit_insn i)
{
	jit_gpr_t dst = getgpr(f, i.r0, 0);
	jit_gpr_t base = getloc(f, j, i.r1, 1);

	jit_ldxi_c(j, dst, base, i.o);
	putloc(f, j, i.r0, dst);
}

/* emit jit_ldxi_s into slot i.r0, with slot i.r1 and the immediate i.o as the
 * address operands */
static void compile_ldxi16(struct ejit_func *f, jit_state_t *j,
                           struct ejit_insn i)
{
	jit_gpr_t dst = getgpr(f, i.r0, 0);
	jit_gpr_t base = getloc(f, j, i.r1, 1);

	jit_ldxi_s(j, dst, base, i.o);
	putloc(f, j, i.r0, dst);
}

/* emit jit_ldxi_i into slot i.r0, with slot i.r1 and the immediate i.o as the
 * address operands */
static void compile_ldxi32(struct ejit_func *f, jit_state_t *j,
                           struct ejit_insn i)
{
	jit_gpr_t dst = getgpr(f, i.r0, 0);
	jit_gpr_t base = getloc(f, j, i.r1, 1);

	jit_ldxi_i(j, dst, base, i.o);
	putloc(f, j, i.r0, dst);
}

/* emit jit_ldxi_l into slot i.r0, with slot i.r1 and the immediate i.o as the
 * address operands. Only available when __WORDSIZE is 64, otherwise reaching
 * this function trips an assert. */
static void compile_ldxi64(struct ejit_func *f, jit_state_t *j,
                           struct ejit_insn i)
{
#if __WORDSIZE == 64
	jit_gpr_t dst = getgpr(f, i.r0, 0);
	jit_gpr_t base = getloc(f, j, i.r1, 1);

	jit_ldxi_l(j, dst, base, i.o);
	putloc(f, j, i.r0, dst);
#else
	/* nothing can be emitted here, just keep the parameters 'used' */
	(void)f;
	(void)j;
	(void)i;
	assert(0 && "trying to compile ldxi64 on 32bit arch");
#endif
}

/* emit jit_ldxr_uc into slot i.r0, with slots i.r1 and i.r2 as the address
 * operands */
static void compile_ldxru8(struct ejit_func *f, jit_state_t *j,
                           struct ejit_insn i)
{
	jit_gpr_t dst = getgpr(f, i.r0, 0);
	jit_gpr_t base = getloc(f, j, i.r1, 1);
	jit_gpr_t index = getloc(f, j, i.r2, 2);

	jit_ldxr_uc(j, dst, base, index);
	putloc(f, j, i.r0, dst);
}

/* emit jit_ldxr_us into slot i.r0, with slots i.r1 and i.r2 as the address
 * operands */
static void compile_ldxru16(struct ejit_func *f, jit_state_t *j,
                            struct ejit_insn i)
{
	jit_gpr_t dst = getgpr(f, i.r0, 0);
	jit_gpr_t base = getloc(f, j, i.r1, 1);
	jit_gpr_t index = getloc(f, j, i.r2, 2);

	jit_ldxr_us(j, dst, base, index);
	putloc(f, j, i.r0, dst);
}

/* emit jit_ldxr_ui into slot i.r0, with slots i.r1 and i.r2 as the address
 * operands. Only available when __WORDSIZE is 64, otherwise reaching this
 * function trips an assert. */
static void compile_ldxru32(struct ejit_func *f, jit_state_t *j,
                            struct ejit_insn i)
{
#if __WORDSIZE == 64
	jit_gpr_t dst = getgpr(f, i.r0, 0);
	jit_gpr_t base = getloc(f, j, i.r1, 1);
	jit_gpr_t index = getloc(f, j, i.r2, 2);

	jit_ldxr_ui(j, dst, base, index);
	putloc(f, j, i.r0, dst);
#else
	/* nothing can be emitted here, just keep the parameters 'used' */
	(void)f;
	(void)j;
	(void)i;
	assert(0 && "trying to compile ldxru32 on 32bit arch");
#endif
}

/* emit jit_ldxr_l into slot i.r0, with slots i.r1 and i.r2 as the address
 * operands (the same lightening call that compile_ldxr64() uses). Only
 * available when __WORDSIZE is 64, otherwise reaching this function trips an
 * assert. */
static void compile_ldxru64(struct ejit_func *f, jit_state_t *j,
                            struct ejit_insn i)
{
#if __WORDSIZE == 64
	jit_gpr_t dst = getgpr(f, i.r0, 0);
	jit_gpr_t base = getloc(f, j, i.r1, 1);
	jit_gpr_t index = getloc(f, j, i.r2, 2);

	jit_ldxr_l(j, dst, base, index);
	putloc(f, j, i.r0, dst);
#else
	/* nothing can be emitted here, just keep the parameters 'used' */
	(void)f;
	(void)j;
	(void)i;
	assert(0 && "trying to compile ldxru64 on 32bit arch");
#endif
}

/* emit jit_ldxr_c into slot i.r0, with slots i.r1 and i.r2 as the address
 * operands */
static void compile_ldxr8(struct ejit_func *f, jit_state_t *j,
                          struct ejit_insn i)
{
	jit_gpr_t dst = getgpr(f, i.r0, 0);
	jit_gpr_t base = getloc(f, j, i.r1, 1);
	jit_gpr_t index = getloc(f, j, i.r2, 2);

	jit_ldxr_c(j, dst, base, index);
	putloc(f, j, i.r0, dst);
}

/* emit jit_ldxr_s into slot i.r0, with slots i.r1 and i.r2 as the address
 * operands */
static void compile_ldxr16(struct ejit_func *f, jit_state_t *j,
                           struct ejit_insn i)
{
	jit_gpr_t dst = getgpr(f, i.r0, 0);
	jit_gpr_t base = getloc(f, j, i.r1, 1);
	jit_gpr_t index = getloc(f, j, i.r2, 2);

	jit_ldxr_s(j, dst, base, index);
	putloc(f, j, i.r0, dst);
}

/* emit jit_ldxr_i into slot i.r0, with slots i.r1 and i.r2 as the address
 * operands */
static void compile_ldxr32(struct ejit_func *f, jit_state_t *j,
                           struct ejit_insn i)
{
	jit_gpr_t dst = getgpr(f, i.r0, 0);
	jit_gpr_t base = getloc(f, j, i.r1, 1);
	jit_gpr_t index = getloc(f, j, i.r2, 2);

	jit_ldxr_i(j, dst, base, index);
	putloc(f, j, i.r0, dst);
}

/* emit jit_ldxr_l into slot i.r0, with slots i.r1 and i.r2 as the address
 * operands. Only available when __WORDSIZE is 64, otherwise reaching this
 * function trips an assert. */
static void compile_ldxr64(struct ejit_func *f, jit_state_t *j,
                           struct ejit_insn i)
{
#if __WORDSIZE == 64
	jit_gpr_t dst = getgpr(f, i.r0, 0);
	jit_gpr_t base = getloc(f, j, i.r1, 1);
	jit_gpr_t index = getloc(f, j, i.r2, 2);

	jit_ldxr_l(j, dst, base, index);
	putloc(f, j, i.r0, dst);
#else
	/* nothing can be emitted here, just keep the parameters 'used' */
	(void)f;
	(void)j;
	(void)i;
	assert(0 && "trying to compile ldxr64 on 32bit arch");
#endif
}

/* emit jit_ldxr_f into fpr slot i.r0, with gpr slots i.r1 and i.r2 as the
 * address operands */
static void compile_ldxrf(struct ejit_func *f, jit_state_t *j,
                          struct ejit_insn i)
{
	jit_fpr_t dst = getfpr(f, i.r0, 0);
	jit_gpr_t base = getloc(f, j, i.r1, 1);
	jit_gpr_t index = getloc(f, j, i.r2, 2);

	jit_ldxr_f(j, dst, base, index);
	putloc_f(f, j, i.r0, dst);
}

/* emit jit_ldxr_d into fpr slot i.r0, with gpr slots i.r1 and i.r2 as the
 * address operands */
static void compile_ldxrd(struct ejit_func *f, jit_state_t *j,
                          struct ejit_insn i)
{
	jit_fpr_t dst = getfpr(f, i.r0, 0);
	jit_gpr_t base = getloc(f, j, i.r1, 1);
	jit_gpr_t index = getloc(f, j, i.r2, 2);

	jit_ldxr_d(j, dst, base, index);
	putloc_d(f, j, i.r0, dst);
}

/* emit jit_comr on slot i.r1 with the result going to slot i.r0 */
static void compile_comr(struct ejit_func *f, jit_state_t *j,
                         struct ejit_insn i)
{
	jit_gpr_t dst = getgpr(f, i.r0, 0);
	jit_gpr_t src = getloc(f, j, i.r1, 1);

	jit_comr(j, dst, src);
	putloc(f, j, i.r0, dst);
}

/* emit jit_negr on slot i.r1 with the result going to slot i.r0 */
static void compile_negr(struct ejit_func *f, jit_state_t *j,
                         struct ejit_insn i)
{
	jit_gpr_t dst = getgpr(f, i.r0, 0);
	jit_gpr_t src = getloc(f, j, i.r1, 1);

	jit_negr(j, dst, src);
	putloc(f, j, i.r0, dst);
}

/* emit jit_negr_f on fpr slot i.r1 with the result going to fpr slot i.r0 */
static void compile_negr_f(struct ejit_func *f, jit_state_t *j,
                           struct ejit_insn i)
{
	jit_fpr_t dst = getfpr(f, i.r0, 0);
	jit_fpr_t src = getloc_f(f, j, i.r1, 1);

	jit_negr_f(j, dst, src);
	putloc_f(f, j, i.r0, dst);
}

/* emit jit_negr_d on fpr slot i.r1 with the result going to fpr slot i.r0 */
static void compile_negr_d(struct ejit_func *f, jit_state_t *j,
                           struct ejit_insn i)
{
	jit_fpr_t dst = getfpr(f, i.r0, 0);
	jit_fpr_t src = getloc_d(f, j, i.r1, 1);

	jit_negr_d(j, dst, src);
	putloc_d(f, j, i.r0, dst);
}

/* emit jit_extr_c on slot i.r1 with the result going to slot i.r0 */
static void compile_extr8(struct ejit_func *f, jit_state_t *j,
                          struct ejit_insn i)
{
	jit_gpr_t dst = getgpr(f, i.r0, 0);
	jit_gpr_t src = getloc(f, j, i.r1, 1);

	jit_extr_c(j, dst, src);
	putloc(f, j, i.r0, dst);
}

/* Emits jit_extr_s from slot i.r1 into slot i.r0. */
static void compile_extr16(struct ejit_func *f, jit_state_t *j,
                           struct ejit_insn i)
{
	jit_gpr_t dst = getgpr(f, i.r0, 0);
	jit_gpr_t src = getloc(f, j, i.r1, 1);

	jit_extr_s(j, dst, src);
	putloc(f, j, i.r0, dst);
}

/* Emits jit_extr_i from slot i.r1 into slot i.r0. Only available when
 * __WORDSIZE is 64; on other targets reaching this is an assertion failure. */
static void compile_extr32(struct ejit_func *f, jit_state_t *j,
                           struct ejit_insn i)
{
#if __WORDSIZE == 64
	jit_gpr_t dst = getgpr(f, i.r0, 0);
	jit_gpr_t src = getloc(f, j, i.r1, 1);

	jit_extr_i(j, dst, src);
	putloc(f, j, i.r0, dst);
#else
	/* nothing to emit, silence unused parameter warnings */
	(void)f;
	(void)j;
	(void)i;
	assert(0 && "trying to compile extr32 on 32bit arch");
#endif
}

/* Emits jit_extr_uc from slot i.r1 into slot i.r0. */
static void compile_extru8(struct ejit_func *f, jit_state_t *j,
                           struct ejit_insn i)
{
	jit_gpr_t dst = getgpr(f, i.r0, 0);
	jit_gpr_t src = getloc(f, j, i.r1, 1);

	jit_extr_uc(j, dst, src);
	putloc(f, j, i.r0, dst);
}

/* Emits jit_extr_us from slot i.r1 into slot i.r0. */
static void compile_extru16(struct ejit_func *f, jit_state_t *j,
                            struct ejit_insn i)
{
	jit_gpr_t dst = getgpr(f, i.r0, 0);
	jit_gpr_t src = getloc(f, j, i.r1, 1);

	jit_extr_us(j, dst, src);
	putloc(f, j, i.r0, dst);
}

/* Emits jit_extr_ui from slot i.r1 into slot i.r0. Only available when
 * __WORDSIZE is 64; on other targets reaching this is an assertion failure. */
static void compile_extru32(struct ejit_func *f, jit_state_t *j,
                            struct ejit_insn i)
{
#if __WORDSIZE == 64
	jit_gpr_t dst = getgpr(f, i.r0, 0);
	jit_gpr_t src = getloc(f, j, i.r1, 1);

	jit_extr_ui(j, dst, src);
	putloc(f, j, i.r0, dst);
#else
	/* nothing to emit, silence unused parameter warnings */
	(void)f;
	(void)j;
	(void)i;
	assert(0 && "trying to compile extru32 on 32bit arch");
#endif
}

/* Emits jit_extr_f: general-purpose slot i.r1 is the source, float slot i.r0
 * receives the result. */
static void compile_extrf(struct ejit_func *f, jit_state_t *j,
                          struct ejit_insn i)
{
	jit_fpr_t dst = getfpr(f, i.r0, 0);
	jit_gpr_t src = getloc(f, j, i.r1, 1);

	jit_extr_f(j, dst, src);
	putloc_f(f, j, i.r0, dst);
}

/* Emits jit_extr_d: general-purpose slot i.r1 is the source, double slot i.r0
 * receives the result. */
static void compile_extrd(struct ejit_func *f, jit_state_t *j,
                          struct ejit_insn i)
{
	jit_fpr_t dst = getfpr(f, i.r0, 0);
	jit_gpr_t src = getloc(f, j, i.r1, 1);

	jit_extr_d(j, dst, src);
	putloc_d(f, j, i.r0, dst);
}

/* Emits jit_truncr_d_l: double slot i.r1 is the source, general-purpose slot
 * i.r0 receives the result. Only available when __WORDSIZE is 64; on other
 * targets reaching this is an assertion failure. */
static void compile_truncr_d_64(struct ejit_func *f, jit_state_t *j,
                                struct ejit_insn i)
{
#if __WORDSIZE == 64
	jit_gpr_t dst = getgpr(f, i.r0, 0);
	jit_fpr_t src = getloc_d(f, j, i.r1, 1);

	jit_truncr_d_l(j, dst, src);
	putloc(f, j, i.r0, dst);
#else
	/* nothing to emit, silence unused parameter warnings */
	(void)f;
	(void)j;
	(void)i;
	assert(0 && "trying to compile truncr_d_64 on 32bit arch");
#endif
}

/* Truncates double slot i.r1 into general-purpose slot i.r0.
 *
 * When __WORDSIZE is 64 this reuses compile_truncr_d_64 (jit_truncr_d_l),
 * otherwise it emits jit_truncr_d_i. */
static void compile_truncr_d_32(struct ejit_func *f, jit_state_t *j,
                                struct ejit_insn i)
{
#if __WORDSIZE == 64
	/* plain call: returning a void expression from a void function is not
	 * valid ISO C */
	compile_truncr_d_64(f, j, i);
#else
	jit_gpr_t r0 = getgpr(f, i.r0, 0);
	jit_fpr_t r1 = getloc_d(f, j, i.r1, 1);
	jit_truncr_d_i(j, r0, r1);
	putloc(f, j, i.r0, r0);
#endif
}


/* Emits jit_truncr_f_l: float slot i.r1 is the source, general-purpose slot
 * i.r0 receives the result. Only available when __WORDSIZE is 64; on other
 * targets reaching this is an assertion failure. */
static void compile_truncr_f_64(struct ejit_func *f, jit_state_t *j,
                                struct ejit_insn i)
{
#if __WORDSIZE == 64
	jit_gpr_t dst = getgpr(f, i.r0, 0);
	jit_fpr_t src = getloc_f(f, j, i.r1, 1);

	jit_truncr_f_l(j, dst, src);
	putloc(f, j, i.r0, dst);
#else
	/* nothing to emit, silence unused parameter warnings */
	(void)f;
	(void)j;
	(void)i;
	assert(0 && "trying to compile truncr_f_64 on 32bit arch");
#endif
}

/* Truncates float slot i.r1 into general-purpose slot i.r0.
 *
 * When __WORDSIZE is 64 this reuses compile_truncr_f_64 (jit_truncr_f_l),
 * otherwise it emits jit_truncr_f_i. */
static void compile_truncr_f_32(struct ejit_func *f, jit_state_t *j,
                                struct ejit_insn i)
{
#if __WORDSIZE == 64
	/* plain call: returning a void expression from a void function is not
	 * valid ISO C */
	compile_truncr_f_64(f, j, i);
#else
	jit_gpr_t r0 = getgpr(f, i.r0, 0);
	jit_fpr_t r1 = getloc_f(f, j, i.r1, 1);
	jit_truncr_f_i(j, r0, r1);
	putloc(f, j, i.r0, r0);
#endif
}

/* Emits jit_sqrtr_f from float slot i.r1 into float slot i.r0. */
static void compile_sqrtr_f(struct ejit_func *f, jit_state_t *j,
		struct ejit_insn i)
{
	jit_fpr_t r0 = getfpr(f, i.r0, 0);
	jit_fpr_t r1 = getloc_f(f, j, i.r1, 1);
	jit_sqrtr_f(j, r0, r1);
	/* the result is a float, so it has to be written back with the float
	 * variant like every other _f operation, not putloc_d */
	putloc_f(f, j, i.r0, r0);
}

/* Emits jit_sqrtr_d from double slot i.r1 into double slot i.r0. */
static void compile_sqrtr_d(struct ejit_func *f, jit_state_t *j,
		struct ejit_insn i)
{
	jit_fpr_t dst = getfpr(f, i.r0, 0);
	jit_fpr_t src = getloc_d(f, j, i.r1, 1);

	jit_sqrtr_d(j, dst, src);
	putloc_d(f, j, i.r0, dst);
}

/* Materializes an integer comparison of slots i.r1 and i.r2 as 0 or 1 in slot
 * i.r0.
 *
 * bcomp emits the conditional branch implementing the comparison: the result
 * is 1 when that branch is taken and 0 when it falls through. When both
 * operands are the same slot no branch is emitted and the result is the
 * constant `same`. */
static void compile_reg_cmp(struct ejit_func *f, jit_state_t *j,
                            struct ejit_insn i,
                            jit_reloc_t (*bcomp)(jit_state_t *, jit_gpr_t,
                                                 jit_gpr_t), long same)
{
	jit_gpr_t dst = getgpr(f, i.r0, 0);

	if (i.r1 != i.r2) {
		jit_gpr_t lhs = getloc(f, j, i.r1, 1);
		jit_gpr_t rhs = getloc(f, j, i.r2, 2);
		jit_reloc_t taken = bcomp(j, lhs, rhs);

		/* fallthrough: condition did not hold */
		jit_movi(j, dst, 0);
		jit_reloc_t skip = jit_jmp(j);

		/* branch target: condition held */
		jit_patch_there(j, taken, jit_address(j));
		jit_movi(j, dst, 1);

		/* both paths meet here */
		jit_patch_there(j, skip, jit_address(j));
	} else {
		/* a slot compared against itself has a known answer */
		jit_movi(j, dst, same);
	}

	putloc(f, j, i.r0, dst);
}

/* Materializes a comparison of double slots i.r1 and i.r2 as 0 or 1 in
 * general-purpose slot i.r0.
 *
 * bcomp emits the conditional branch implementing the comparison: the result
 * is 1 when that branch is taken and 0 when it falls through.
 *
 * Unlike the integer variant there is no shortcut for identical operand
 * slots: NaN compares to itself as false, so the outcome of r1 == r1 is not
 * known at compile time. */
static void compile_reg_d_cmp(struct ejit_func *f, jit_state_t *j,
                              struct ejit_insn i,
                              jit_reloc_t (*bcomp)(jit_state_t *,
                                                   jit_fpr_t,
                                                   jit_fpr_t)
                              )
{
	jit_gpr_t dst = getgpr(f, i.r0, 0);
	jit_fpr_t lhs = getloc_d(f, j, i.r1, 1);
	jit_fpr_t rhs = getloc_d(f, j, i.r2, 2);
	jit_reloc_t taken = bcomp(j, lhs, rhs);

	/* fallthrough: condition did not hold */
	jit_movi(j, dst, 0);
	jit_reloc_t skip = jit_jmp(j);

	/* branch target: condition held */
	jit_patch_there(j, taken, jit_address(j));
	jit_movi(j, dst, 1);

	/* both paths meet here, store the result */
	jit_patch_there(j, skip, jit_address(j));
	putloc(f, j, i.r0, dst);
}

/* Materializes a comparison of float slots i.r1 and i.r2 as 0 or 1 in
 * general-purpose slot i.r0.
 *
 * bcomp emits the conditional branch implementing the comparison: the result
 * is 1 when that branch is taken and 0 when it falls through. */
static void compile_reg_f_cmp(struct ejit_func *f, jit_state_t *j,
                              struct ejit_insn i,
                              jit_reloc_t (*bcomp)(jit_state_t *,
                                                   jit_fpr_t,
                                                   jit_fpr_t)
                              )
{
	jit_gpr_t dst = getgpr(f, i.r0, 0);
	jit_fpr_t lhs = getloc_f(f, j, i.r1, 1);
	jit_fpr_t rhs = getloc_f(f, j, i.r2, 2);
	jit_reloc_t taken = bcomp(j, lhs, rhs);

	/* fallthrough: condition did not hold */
	jit_movi(j, dst, 0);
	jit_reloc_t skip = jit_jmp(j);

	/* branch target: condition held */
	jit_patch_there(j, taken, jit_address(j));
	jit_movi(j, dst, 1);

	/* both paths meet here, store the result */
	jit_patch_there(j, skip, jit_address(j));
	putloc(f, j, i.r0, dst);
}

/* Integer comparison built on jit_beqr. */
static void compile_eqr(struct ejit_func *f, jit_state_t *j, struct ejit_insn i)
{
	/* result when both operands are the same slot */
	const long same = 1;
	compile_reg_cmp(f, j, i, jit_beqr, same);
}

/* Double comparison built on jit_beqr_d. */
static void compile_eqr_d(struct ejit_func *f, jit_state_t *j,
                          struct ejit_insn i)
{
	jit_reloc_t (*branch)(jit_state_t *, jit_fpr_t, jit_fpr_t) = jit_beqr_d;
	compile_reg_d_cmp(f, j, i, branch);
}

/* Float comparison built on jit_beqr_f. */
static void compile_eqr_f(struct ejit_func *f, jit_state_t *j,
                          struct ejit_insn i)
{
	jit_reloc_t (*branch)(jit_state_t *, jit_fpr_t, jit_fpr_t) = jit_beqr_f;
	compile_reg_f_cmp(f, j, i, branch);
}

/* Integer comparison built on jit_bner. */
static void compile_ner(struct ejit_func *f, jit_state_t *j, struct ejit_insn i)
{
	/* result when both operands are the same slot */
	const long same = 0;
	compile_reg_cmp(f, j, i, jit_bner, same);
}

/* Double comparison built on jit_bner_d. */
static void compile_ner_d(struct ejit_func *f, jit_state_t *j,
                          struct ejit_insn i)
{
	jit_reloc_t (*branch)(jit_state_t *, jit_fpr_t, jit_fpr_t) = jit_bner_d;
	compile_reg_d_cmp(f, j, i, branch);
}

/* Float comparison built on jit_bner_f. */
static void compile_ner_f(struct ejit_func *f, jit_state_t *j,
                          struct ejit_insn i)
{
	jit_reloc_t (*branch)(jit_state_t *, jit_fpr_t, jit_fpr_t) = jit_bner_f;
	compile_reg_f_cmp(f, j, i, branch);
}

/* Integer comparison built on jit_bger. */
static void compile_ger(struct ejit_func *f, jit_state_t *j, struct ejit_insn i)
{
	/* result when both operands are the same slot */
	const long same = 1;
	compile_reg_cmp(f, j, i, jit_bger, same);
}

/* Integer comparison built on jit_bger_u. */
static void compile_ger_u(struct ejit_func *f, jit_state_t *j,
                          struct ejit_insn i)
{
	/* result when both operands are the same slot */
	const long same = 1;
	compile_reg_cmp(f, j, i, jit_bger_u, same);
}

/* Double comparison built on jit_bger_d. */
static void compile_ger_d(struct ejit_func *f, jit_state_t *j,
                          struct ejit_insn i)
{
	jit_reloc_t (*branch)(jit_state_t *, jit_fpr_t, jit_fpr_t) = jit_bger_d;
	compile_reg_d_cmp(f, j, i, branch);
}

/* Float comparison built on jit_bger_f. */
static void compile_ger_f(struct ejit_func *f, jit_state_t *j,
                          struct ejit_insn i)
{
	jit_reloc_t (*branch)(jit_state_t *, jit_fpr_t, jit_fpr_t) = jit_bger_f;
	compile_reg_f_cmp(f, j, i, branch);
}

/* Integer comparison built on jit_bgtr. */
static void compile_gtr(struct ejit_func *f, jit_state_t *j, struct ejit_insn i)
{
	/* result when both operands are the same slot */
	const long same = 0;
	compile_reg_cmp(f, j, i, jit_bgtr, same);
}

/* Integer comparison built on jit_bgtr_u. */
static void compile_gtr_u(struct ejit_func *f, jit_state_t *j,
                          struct ejit_insn i)
{
	/* result when both operands are the same slot */
	const long same = 0;
	compile_reg_cmp(f, j, i, jit_bgtr_u, same);
}

/* Double comparison built on jit_bgtr_d. */
static void compile_gtr_d(struct ejit_func *f, jit_state_t *j,
                          struct ejit_insn i)
{
	jit_reloc_t (*branch)(jit_state_t *, jit_fpr_t, jit_fpr_t) = jit_bgtr_d;
	compile_reg_d_cmp(f, j, i, branch);
}

/* Float comparison built on jit_bgtr_f. */
static void compile_gtr_f(struct ejit_func *f, jit_state_t *j,
                          struct ejit_insn i)
{
	jit_reloc_t (*branch)(jit_state_t *, jit_fpr_t, jit_fpr_t) = jit_bgtr_f;
	compile_reg_f_cmp(f, j, i, branch);
}

/* Emits jit_bmci on slot i.r1 with immediate i.o and queues the relocation so
 * it can be patched to the address of instruction index i.r0. */
static void compile_bmci(struct ejit_func *f, jit_state_t *j,
                         struct ejit_insn i, struct relocs *relocs)
{
	jit_gpr_t lhs = getloc(f, j, i.r1, 0);
	struct reloc_helper pending = {
		.r = jit_bmci(j, lhs, i.o),
		.to = i.r0,
	};
	relocs_append(relocs, pending);
}

/* Emits jit_bmcr on slots i.r1 and i.r2 and queues the relocation so it can be
 * patched to the address of instruction index i.r0. */
static void compile_bmcr(struct ejit_func *f, jit_state_t *j,
                         struct ejit_insn i, struct relocs *relocs)
{
	jit_gpr_t lhs = getloc(f, j, i.r1, 0);
	jit_gpr_t rhs = getloc(f, j, i.r2, 1);
	struct reloc_helper pending = {
		.r = jit_bmcr(j, lhs, rhs),
		.to = i.r0,
	};
	relocs_append(relocs, pending);
}

/* Emits jit_bmsi on slot i.r1 with immediate i.o and queues the relocation so
 * it can be patched to the address of instruction index i.r0. */
static void compile_bmsi(struct ejit_func *f, jit_state_t *j,
                         struct ejit_insn i, struct relocs *relocs)
{
	jit_gpr_t lhs = getloc(f, j, i.r1, 0);
	struct reloc_helper pending = {
		.r = jit_bmsi(j, lhs, i.o),
		.to = i.r0,
	};
	relocs_append(relocs, pending);
}

/* Emits jit_bmsr on slots i.r1 and i.r2 and queues the relocation so it can be
 * patched to the address of instruction index i.r0. */
static void compile_bmsr(struct ejit_func *f, jit_state_t *j,
                         struct ejit_insn i, struct relocs *relocs)
{
	jit_gpr_t lhs = getloc(f, j, i.r1, 0);
	jit_gpr_t rhs = getloc(f, j, i.r2, 1);
	struct reloc_helper pending = {
		.r = jit_bmsr(j, lhs, rhs),
		.to = i.r0,
	};
	relocs_append(relocs, pending);
}

/* Emits jit_beqi on slot i.r1 with immediate i.o and queues the relocation so
 * it can be patched to the address of instruction index i.r0. */
static void compile_beqi(struct ejit_func *f, jit_state_t *j,
                         struct ejit_insn i, struct relocs *relocs)
{
	jit_gpr_t lhs = getloc(f, j, i.r1, 0);
	struct reloc_helper pending = {
		.r = jit_beqi(j, lhs, i.o),
		.to = i.r0,
	};
	relocs_append(relocs, pending);
}

/* Emits jit_beqr on slots i.r1 and i.r2 and queues the relocation so it can be
 * patched to the address of instruction index i.r0. */
static void compile_beqr(struct ejit_func *f, jit_state_t *j,
                         struct ejit_insn i, struct relocs *relocs)
{
	jit_gpr_t lhs = getloc(f, j, i.r1, 0);
	jit_gpr_t rhs = getloc(f, j, i.r2, 1);
	struct reloc_helper pending = {
		.r = jit_beqr(j, lhs, rhs),
		.to = i.r0,
	};
	relocs_append(relocs, pending);
}

/* Emits jit_beqr_f on float slots i.r1 and i.r2 and queues the relocation so
 * it can be patched to the address of instruction index i.r0. */
static void compile_beqr_f(struct ejit_func *f, jit_state_t *j,
                           struct ejit_insn i, struct relocs *relocs)
{
	jit_fpr_t lhs = getloc_f(f, j, i.r1, 0);
	jit_fpr_t rhs = getloc_f(f, j, i.r2, 1);
	struct reloc_helper pending = {
		.r = jit_beqr_f(j, lhs, rhs),
		.to = i.r0,
	};
	relocs_append(relocs, pending);
}

/* Emits jit_beqr_d on double slots i.r1 and i.r2 and queues the relocation so
 * it can be patched to the address of instruction index i.r0. */
static void compile_beqr_d(struct ejit_func *f, jit_state_t *j,
                           struct ejit_insn i, struct relocs *relocs)
{
	jit_fpr_t lhs = getloc_d(f, j, i.r1, 0);
	jit_fpr_t rhs = getloc_d(f, j, i.r2, 1);
	struct reloc_helper pending = {
		.r = jit_beqr_d(j, lhs, rhs),
		.to = i.r0,
	};
	relocs_append(relocs, pending);
}

/* Emits jit_bnei on slot i.r1 with immediate i.o and queues the relocation so
 * it can be patched to the address of instruction index i.r0. */
static void compile_bnei(struct ejit_func *f, jit_state_t *j,
                         struct ejit_insn i, struct relocs *relocs)
{
	jit_gpr_t lhs = getloc(f, j, i.r1, 0);
	struct reloc_helper pending = {
		.r = jit_bnei(j, lhs, i.o),
		.to = i.r0,
	};
	relocs_append(relocs, pending);
}

/* Emits jit_bner on slots i.r1 and i.r2 and queues the relocation so it can be
 * patched to the address of instruction index i.r0. */
static void compile_bner(struct ejit_func *f, jit_state_t *j,
                         struct ejit_insn i, struct relocs *relocs)
{
	jit_gpr_t lhs = getloc(f, j, i.r1, 0);
	jit_gpr_t rhs = getloc(f, j, i.r2, 1);
	struct reloc_helper pending = {
		.r = jit_bner(j, lhs, rhs),
		.to = i.r0,
	};
	relocs_append(relocs, pending);
}

/* Emits jit_bner_f on float slots i.r1 and i.r2 and queues the relocation so
 * it can be patched to the address of instruction index i.r0. */
static void compile_bner_f(struct ejit_func *f, jit_state_t *j,
                           struct ejit_insn i, struct relocs *relocs)
{
	jit_fpr_t lhs = getloc_f(f, j, i.r1, 0);
	jit_fpr_t rhs = getloc_f(f, j, i.r2, 1);
	struct reloc_helper pending = {
		.r = jit_bner_f(j, lhs, rhs),
		.to = i.r0,
	};
	relocs_append(relocs, pending);
}

/* Emits jit_bner_d on double slots i.r1 and i.r2 and queues the relocation so
 * it can be patched to the address of instruction index i.r0. */
static void compile_bner_d(struct ejit_func *f, jit_state_t *j,
                           struct ejit_insn i, struct relocs *relocs)
{
	jit_fpr_t lhs = getloc_d(f, j, i.r1, 0);
	jit_fpr_t rhs = getloc_d(f, j, i.r2, 1);
	struct reloc_helper pending = {
		.r = jit_bner_d(j, lhs, rhs),
		.to = i.r0,
	};
	relocs_append(relocs, pending);
}

/* Emits jit_bger on slots i.r1 and i.r2 and queues the relocation so it can be
 * patched to the address of instruction index i.r0. */
static void compile_bger(struct ejit_func *f, jit_state_t *j,
                         struct ejit_insn i, struct relocs *relocs)
{
	jit_gpr_t lhs = getloc(f, j, i.r1, 0);
	jit_gpr_t rhs = getloc(f, j, i.r2, 1);
	struct reloc_helper pending = {
		.r = jit_bger(j, lhs, rhs),
		.to = i.r0,
	};
	relocs_append(relocs, pending);
}

/* Emits jit_bger_u on slots i.r1 and i.r2 and queues the relocation so it can
 * be patched to the address of instruction index i.r0. */
static void compile_bger_u(struct ejit_func *f, jit_state_t *j,
                           struct ejit_insn i, struct relocs *relocs)
{
	jit_gpr_t lhs = getloc(f, j, i.r1, 0);
	jit_gpr_t rhs = getloc(f, j, i.r2, 1);
	struct reloc_helper pending = {
		.r = jit_bger_u(j, lhs, rhs),
		.to = i.r0,
	};
	relocs_append(relocs, pending);
}

/* Emits jit_bgei on slot i.r1 with immediate i.o and queues the relocation so
 * it can be patched to the address of instruction index i.r0. */
static void compile_bgei(struct ejit_func *f, jit_state_t *j,
                         struct ejit_insn i, struct relocs *relocs)
{
	jit_gpr_t lhs = getloc(f, j, i.r1, 0);
	struct reloc_helper pending = {
		.r = jit_bgei(j, lhs, i.o),
		.to = i.r0,
	};
	relocs_append(relocs, pending);
}

/* Emits jit_bgei_u on slot i.r1 with immediate i.o and queues the relocation
 * so it can be patched to the address of instruction index i.r0. */
static void compile_bgei_u(struct ejit_func *f, jit_state_t *j,
                           struct ejit_insn i, struct relocs *relocs)
{
	jit_gpr_t lhs = getloc(f, j, i.r1, 0);
	struct reloc_helper pending = {
		.r = jit_bgei_u(j, lhs, i.o),
		.to = i.r0,
	};
	relocs_append(relocs, pending);
}

/* Emits jit_bger_f on float slots i.r1 and i.r2 and queues the relocation so
 * it can be patched to the address of instruction index i.r0. */
static void compile_bger_f(struct ejit_func *f, jit_state_t *j,
                           struct ejit_insn i, struct relocs *relocs)
{
	jit_fpr_t lhs = getloc_f(f, j, i.r1, 0);
	jit_fpr_t rhs = getloc_f(f, j, i.r2, 1);
	struct reloc_helper pending = {
		.r = jit_bger_f(j, lhs, rhs),
		.to = i.r0,
	};
	relocs_append(relocs, pending);
}

/* Emits jit_bger_d on double slots i.r1 and i.r2 and queues the relocation so
 * it can be patched to the address of instruction index i.r0. */
static void compile_bger_d(struct ejit_func *f, jit_state_t *j,
                           struct ejit_insn i, struct relocs *relocs)
{
	jit_fpr_t lhs = getloc_d(f, j, i.r1, 0);
	jit_fpr_t rhs = getloc_d(f, j, i.r2, 1);
	struct reloc_helper pending = {
		.r = jit_bger_d(j, lhs, rhs),
		.to = i.r0,
	};
	relocs_append(relocs, pending);
}

/* Emits jit_bgtr on slots i.r1 and i.r2 and queues the relocation so it can be
 * patched to the address of instruction index i.r0. */
static void compile_bgtr(struct ejit_func *f, jit_state_t *j,
                         struct ejit_insn i, struct relocs *relocs)
{
	jit_gpr_t lhs = getloc(f, j, i.r1, 0);
	jit_gpr_t rhs = getloc(f, j, i.r2, 1);
	struct reloc_helper pending = {
		.r = jit_bgtr(j, lhs, rhs),
		.to = i.r0,
	};
	relocs_append(relocs, pending);
}

/* Emits jit_bgtr_u on slots i.r1 and i.r2 and queues the relocation so it can
 * be patched to the address of instruction index i.r0. */
static void compile_bgtr_u(struct ejit_func *f, jit_state_t *j,
                           struct ejit_insn i, struct relocs *relocs)
{
	jit_gpr_t lhs = getloc(f, j, i.r1, 0);
	jit_gpr_t rhs = getloc(f, j, i.r2, 1);
	struct reloc_helper pending = {
		.r = jit_bgtr_u(j, lhs, rhs),
		.to = i.r0,
	};
	relocs_append(relocs, pending);
}

/* Emits jit_bgti on slot i.r1 with immediate i.o and queues the relocation so
 * it can be patched to the address of instruction index i.r0. */
static void compile_bgti(struct ejit_func *f, jit_state_t *j,
                         struct ejit_insn i, struct relocs *relocs)
{
	jit_gpr_t lhs = getloc(f, j, i.r1, 0);
	struct reloc_helper pending = {
		.r = jit_bgti(j, lhs, i.o),
		.to = i.r0,
	};
	relocs_append(relocs, pending);
}

/* Emits jit_bgti_u on slot i.r1 with immediate i.o and queues the relocation
 * so it can be patched to the address of instruction index i.r0. */
static void compile_bgti_u(struct ejit_func *f, jit_state_t *j,
                           struct ejit_insn i, struct relocs *relocs)
{
	jit_gpr_t lhs = getloc(f, j, i.r1, 0);
	struct reloc_helper pending = {
		.r = jit_bgti_u(j, lhs, i.o),
		.to = i.r0,
	};
	relocs_append(relocs, pending);
}

/* Emits jit_bgtr_f on float slots i.r1 and i.r2 and queues the relocation so
 * it can be patched to the address of instruction index i.r0. */
static void compile_bgtr_f(struct ejit_func *f, jit_state_t *j,
                           struct ejit_insn i, struct relocs *relocs)
{
	jit_fpr_t lhs = getloc_f(f, j, i.r1, 0);
	jit_fpr_t rhs = getloc_f(f, j, i.r2, 1);
	struct reloc_helper pending = {
		.r = jit_bgtr_f(j, lhs, rhs),
		.to = i.r0,
	};
	relocs_append(relocs, pending);
}

/* Emits jit_bgtr_d on double slots i.r1 and i.r2 and queues the relocation so
 * it can be patched to the address of instruction index i.r0. */
static void compile_bgtr_d(struct ejit_func *f, jit_state_t *j,
                           struct ejit_insn i, struct relocs *relocs)
{
	jit_fpr_t lhs = getloc_d(f, j, i.r1, 0);
	jit_fpr_t rhs = getloc_d(f, j, i.r2, 1);
	struct reloc_helper pending = {
		.r = jit_bgtr_d(j, lhs, rhs),
		.to = i.r0,
	};
	relocs_append(relocs, pending);
}

/* Emits jit_blei on slot i.r1 with immediate i.o and queues the relocation so
 * it can be patched to the address of instruction index i.r0. */
static void compile_blei(struct ejit_func *f, jit_state_t *j,
                         struct ejit_insn i, struct relocs *relocs)
{
	jit_gpr_t lhs = getloc(f, j, i.r1, 0);
	struct reloc_helper pending = {
		.r = jit_blei(j, lhs, i.o),
		.to = i.r0,
	};
	relocs_append(relocs, pending);
}

/* Emits jit_blei_u on slot i.r1 with immediate i.o and queues the relocation
 * so it can be patched to the address of instruction index i.r0. */
static void compile_blei_u(struct ejit_func *f, jit_state_t *j,
                           struct ejit_insn i, struct relocs *relocs)
{
	jit_gpr_t lhs = getloc(f, j, i.r1, 0);
	struct reloc_helper pending = {
		.r = jit_blei_u(j, lhs, i.o),
		.to = i.r0,
	};
	relocs_append(relocs, pending);
}

/* Emits jit_blti on slot i.r1 with immediate i.o and queues the relocation so
 * it can be patched to the address of instruction index i.r0. */
static void compile_blti(struct ejit_func *f, jit_state_t *j,
                         struct ejit_insn i, struct relocs *relocs)
{
	jit_gpr_t lhs = getloc(f, j, i.r1, 0);
	struct reloc_helper pending = {
		.r = jit_blti(j, lhs, i.o),
		.to = i.r0,
	};
	relocs_append(relocs, pending);
}

/* Emits jit_blti_u on slot i.r1 with immediate i.o and queues the relocation
 * so it can be patched to the address of instruction index i.r0. */
static void compile_blti_u(struct ejit_func *f, jit_state_t *j,
                           struct ejit_insn i, struct relocs *relocs)
{
	jit_gpr_t lhs = getloc(f, j, i.r1, 0);
	struct reloc_helper pending = {
		.r = jit_blti_u(j, lhs, i.o),
		.to = i.r0,
	};
	relocs_append(relocs, pending);
}

/* Emits an unconditional jit_jmp and queues the relocation so it can be
 * patched to the address of instruction index i.r0. f is unused. */
static void compile_jmp(struct ejit_func *f, jit_state_t *j, struct ejit_insn i,
                        struct relocs *relocs)
{
	(void)(f);

	struct reloc_helper pending = {
		.r = jit_jmp(j),
		.to = i.r0,
	};
	relocs_append(relocs, pending);
}

/* Emits jit_retval into the register for slot i.r0 and stores it with putloc. */
static void compile_retval(struct ejit_func *f, jit_state_t *j,
                           struct ejit_insn i)
{
	jit_gpr_t dst = getgpr(f, i.r0, 0);

	jit_retval(j, dst);
	putloc(f, j, i.r0, dst);
}

/* Emits jit_retval_f into the register for float slot i.r0 and stores it with
 * putloc_f. */
static void compile_retval_f(struct ejit_func *f, jit_state_t *j,
                             struct ejit_insn i)
{
	jit_fpr_t dst = getfpr(f, i.r0, 0);

	jit_retval_f(j, dst);
	putloc_f(f, j, i.r0, dst);
}

/* Emits jit_retval_d into the register for double slot i.r0 and stores it with
 * putloc_d. */
static void compile_retval_d(struct ejit_func *f, jit_state_t *j,
                             struct ejit_insn i)
{
	jit_fpr_t dst = getfpr(f, i.r0, 0);

	jit_retval_d(j, dst);
	putloc_d(f, j, i.r0, dst);
}

/* Maps an ejit value type to the matching JIT_OPERAND_ABI_* constant.
 * Aborts on any type that is not listed. */
static enum jit_operand_abi jit_abi_from(enum ejit_type t)
{
	switch (t) {
	case EJIT_INT8:
		return JIT_OPERAND_ABI_INT8;
	case EJIT_INT16:
		return JIT_OPERAND_ABI_INT16;
	case EJIT_INT32:
		return JIT_OPERAND_ABI_INT32;
	case EJIT_INT64:
		return JIT_OPERAND_ABI_INT64;

	case EJIT_UINT8:
		return JIT_OPERAND_ABI_UINT8;
	case EJIT_UINT16:
		return JIT_OPERAND_ABI_UINT16;
	case EJIT_UINT32:
		return JIT_OPERAND_ABI_UINT32;
	case EJIT_UINT64:
		return JIT_OPERAND_ABI_UINT64;

	case EJIT_POINTER:
		return JIT_OPERAND_ABI_POINTER;

	case EJIT_FLOAT:
		return JIT_OPERAND_ABI_FLOAT;
	case EJIT_DOUBLE:
		return JIT_OPERAND_ABI_DOUBLE;

	default:
		break;
	}

	/* unhandled type */
	abort();
}

/* Returns the byte offset, within struct ejit_arg, of the member that holds a
 * value of type t. Aborts on any type that is not listed. */
static size_t arg_offsetof(enum ejit_type t)
{
	switch (t) {
	case EJIT_INT8:
		return offsetof(struct ejit_arg, i8);
	case EJIT_INT16:
		return offsetof(struct ejit_arg, i16);
	case EJIT_INT32:
		return offsetof(struct ejit_arg, i32);
	case EJIT_INT64:
		return offsetof(struct ejit_arg, i64);

	case EJIT_UINT8:
		return offsetof(struct ejit_arg, u8);
	case EJIT_UINT16:
		return offsetof(struct ejit_arg, u16);
	case EJIT_UINT32:
		return offsetof(struct ejit_arg, u32);
	case EJIT_UINT64:
		return offsetof(struct ejit_arg, u64);

	case EJIT_POINTER:
		return offsetof(struct ejit_arg, p);

	case EJIT_FLOAT:
		return offsetof(struct ejit_arg, f);
	case EJIT_DOUBLE:
		return offsetof(struct ejit_arg, d);

	default:
		break;
	}

	/* unhandled type */
	abort();
}

/* Byte offset of an argument's value inside an array of struct ejit_arg:
 * i.r0 selects the array element and i.r1 is the type whose member is
 * addressed. */
static jit_off_t arg_offset(struct ejit_insn i)
{
	size_t element = sizeof(struct ejit_arg) * i.r0;
	size_t member = arg_offsetof(i.r1);
	return element + member;
}

/* Byte offset of the `type` member of element i.r0 inside an array of
 * struct ejit_arg. */
static jit_off_t type_offset(struct ejit_insn i)
{
	size_t element = sizeof(struct ejit_arg) * i.r0;
	size_t member = offsetof(struct ejit_arg, type);
	return element + member;
}

/* Adds `fixup` to the offset of every memory operand in `operands`; operands
 * of any other kind are left as they are. */
static void fixup_operands(struct operands *operands, size_t fixup)
{
	foreach_vec(idx, *operands) {
		jit_operand_t *op = operands_at(operands, idx);
		if (op->kind == JIT_OPERAND_KIND_MEM)
			op->loc.mem.offset += fixup;
	}
}

/* Emits a call to `addr` after moving the operands in `src` to the locations
 * in `dst`.
 *
 * The stack is grown by one struct ejit_arg per (type, value) operand pair
 * before the moves and shrunk by the same amount after the call. JIT_R0 holds
 * the stack pointer at the time of the call. */
static void compile_imm_call(jit_state_t *j, struct operands *src, struct operands *dst,
                             void *addr, size_t argc, jit_operand_t args[argc])
{
	/* operands come in (type, value) pairs */
	size_t pairs = operands_len(src) / 2;
	size_t grown = jit_align_stack(j, pairs * sizeof(struct ejit_arg));

	/* memory sources are shifted by however much the stack just grew,
	 * destinations must NOT be adjusted */
	fixup_operands(src, grown);

	/* two operands per pair, move every one of them */
	jit_move_operands(j, dst->buf, src->buf, pairs * 2);

	jit_movr(j, JIT_R0, JIT_SP);
	jit_calli(j, addr, argc, args);

	jit_shrink_stack(j, grown);
}

/* Adds a header that converts from our external interface (params on stack) to
 * an internal format that's effectively just the underlying ABI, which makes
 * calls between functions we know are compiled a bit faster.
 *
 * The header loads its two incoming arguments into JIT_R0 and JIT_R1, builds
 * one memory operand (relative to JIT_R1) per EJIT_OP_PARAM/EJIT_OP_PARAM_F
 * instruction and, on EJIT_OP_START, calls the code that immediately follows
 * the header with those operands. f->direct_call is set to that same address.
 * Any other opcode before EJIT_OP_START aborts. */
static void compile_trampoline(struct ejit_func *f, jit_state_t *j)
{
	/* very important: argc (JIT_R0) is not really used for anything, but
	 * JIT_R1 contains the argument stack! */
	size_t frame = jit_enter_jit_abi(j, 0, 0, 0);
	jit_load_args_2(j,
	                jit_operand_gpr(JIT_OPERAND_ABI_WORD, JIT_R0),
	                jit_operand_gpr(JIT_OPERAND_ABI_POINTER, JIT_R1));

	struct operands args = operands_create(0);

	foreach_vec(ii, f->insns) {
		struct ejit_insn i = *insns_at(&f->insns, ii);
		switch (i.op) {
		/* both parameter kinds are read from the argument stack in the
		 * same way, only the ABI type (i.r1) differs */
		case EJIT_OP_PARAM:
		case EJIT_OP_PARAM_F: {
			jit_operand_t p = jit_operand_mem(jit_abi_from(i.r1), JIT_R1, arg_offset(i));
			operands_append(&args, p);
			break;
		}

		case EJIT_OP_START: {
			/* callr should be smart enough to avoid JIT_R0 if some
			 * other register wants to write to it */
			jit_reloc_t r = jit_mov_addr(j, JIT_R0);
			jit_callr(j, JIT_R0, operands_len(&args), args.buf);
			jit_leave_jit_abi(j, 0, 0, frame);
			jit_ret(j); /* should just forward the return value */

			/* the code right after the header is both the direct
			 * entry point and the target loaded into JIT_R0 */
			f->direct_call = jit_address_to_function_pointer(jit_address(j));
			jit_patch_here(j, r);

			operands_destroy(&args);
			return;
		}

		default: abort();
		}
	}

	/* ran out of instructions without seeing EJIT_OP_START, don't leak the
	 * operand vector */
	operands_destroy(&args);
}

/* Patches the most recently queued relocation right away if it targets an
 * instruction index below `ii`, using the address recorded in `addrs`, and
 * removes it from `relocs`. Relocations targeting index `ii` or later are
 * left queued. `relocs` must not be empty. */
static void resolve_top_reloc(jit_state_t *j, struct relocs *relocs, struct addrs *addrs, size_t ii)
{
	assert(relocs_len(relocs) != 0);

	struct reloc_helper *top = relocs_back(relocs);
	if (top->to >= ii)
		return;

	jit_addr_t target = *addrs_at(addrs, top->to);
	assert(target);

	jit_patch_there(j, top->r, target);
	relocs_pop(relocs);
}

/* Patches every queued relocation that targets instruction index `ii` to the
 * address recorded for `ii` in `addrs`, and removes those relocations from
 * `relocs`. The order of the remaining entries is not preserved. */
static void resolve_relocs(jit_state_t *j, struct relocs *relocs, struct addrs *addrs, size_t ii)
{
	size_t ri = 0;
	while (ri < relocs_len(relocs)) {
		struct reloc_helper h = *relocs_at(relocs, ri);
		if (h.to != ii) {
			ri++;
			continue;
		}

		jit_addr_t a = *addrs_at(addrs, ii);
		jit_reloc_t r = h.r;

		assert(a);
		jit_patch_there(j, r, a);

		/* swap-remove: the last entry takes over this slot. ri is
		 * deliberately not advanced so that the entry that was just
		 * moved in gets checked as well, otherwise a second relocation
		 * to the same label could be left unpatched. */
		size_t last = relocs_len(relocs) - 1;
		if (ri != last)
			*relocs_at(relocs, ri) = *relocs_back(relocs);

		relocs_shrink(relocs, last);
	}
}

static size_t compile_fn_body(struct ejit_func *f, jit_state_t *j, void *arena,
                              size_t size)
{
	jit_begin(j, arena, size);
	compile_trampoline(f, j);

	size_t gprs = gpr_stats_len(&f->gpr) >= jit_v_num()
		? jit_v_num()
		: gpr_stats_len(&f->gpr);

	size_t fprs = fpr_stats_len(&f->fpr) >= jit_vf_num()
		? jit_vf_num()
		: fpr_stats_len(&f->fpr);

	size_t frame = jit_enter_jit_abi(j, gprs, fprs, 0);
	size_t stack = jit_align_stack(j, stack_size(f));

	struct operands src = operands_create();
	struct operands dst = operands_create();
	struct operands direct = operands_create();

	struct relocs relocs = relocs_create();
	struct addrs addrs = addrs_create();
	addrs_reserve(&addrs, insns_len(&f->insns));

	void *call = NULL;

	size_t label = 0;
	foreach_vec(ii, f->insns) {
		/* if we've hit a label, add it to our vector of label addresses */
		if (label < labels_len(&f->labels)) {
			if (*labels_at(&f->labels, label) == ii) {
				compile_label(j, ii, &addrs);
				resolve_relocs(j, &relocs, &addrs, ii);
				label++;
			}
		}

		struct ejit_insn i = *insns_at(&f->insns, ii);
		switch (i.op) {
		case EJIT_OP_MOVR: compile_movr(f, j, i); break;
		case EJIT_OP_MOVR_F: compile_movr_f(f, j, i); break;
		case EJIT_OP_MOVR_D: compile_movr_d(f, j, i); break;

		case EJIT_OP_MOVI: compile_movi(f, j, i); break;
		case EJIT_OP_MOVI_F: compile_movi_f(f, j, i); break;
		case EJIT_OP_MOVI_D: compile_movi_d(f, j, i); break;

		case EJIT_OP_ADDR: compile_addr(f, j, i); break;
		case EJIT_OP_ADDI: compile_addi(f, j, i); break;
		case EJIT_OP_ADDR_F: compile_addr_f(f, j, i); break;
		case EJIT_OP_ADDR_D: compile_addr_d(f, j, i); break;

		case EJIT_OP_SUBR: compile_subr(f, j, i); break;
		case EJIT_OP_SUBI: compile_subi(f, j, i); break;
		case EJIT_OP_SUBR_F: compile_subr_f(f, j, i); break;
		case EJIT_OP_SUBR_D: compile_subr_d(f, j, i); break;

		case EJIT_OP_MULR: compile_mulr(f, j, i); break;
		case EJIT_OP_MULR_F: compile_mulr_f(f, j, i); break;
		case EJIT_OP_MULR_D: compile_mulr_d(f, j, i); break;

		case EJIT_OP_ANDI: compile_andi(f, j, i); break;
		case EJIT_OP_ANDR: compile_andr(f, j, i); break;

		case EJIT_OP_ORI: compile_ori(f, j, i); break;
		case EJIT_OP_ORR: compile_orr(f, j, i); break;

		case EJIT_OP_XORI: compile_xori(f, j, i); break;
		case EJIT_OP_XORR: compile_xorr(f, j, i); break;

		case EJIT_OP_DIVR: compile_divr(f, j, i); break;
		case EJIT_OP_DIVR_U: compile_divr_u(f, j, i); break;
		case EJIT_OP_DIVR_F: compile_divr_f(f, j, i); break;
		case EJIT_OP_DIVR_D: compile_divr_d(f, j, i); break;

		case EJIT_OP_REMR: compile_remr(f, j, i); break;
		case EJIT_OP_REMR_U: compile_remr_u(f, j, i); break;

		case EJIT_OP_ABSR_F: compile_absr_f(f, j, i); break;
		case EJIT_OP_ABSR_D: compile_absr_d(f, j, i); break;

		case EJIT_OP_LSHI: compile_lshi(f, j, i); break;
		case EJIT_OP_LSHR: compile_lshr(f, j, i); break;
		case EJIT_OP_RSHI: compile_rshi(f, j, i); break;
		case EJIT_OP_RSHI_U: compile_rshi_u(f, j, i); break;
		case EJIT_OP_RSHR: compile_rshr(f, j, i); break;
		case EJIT_OP_RSHR_U: compile_rshr_u(f, j, i); break;

		case EJIT_OP_STI8: compile_sti8(f, j, i); break;
		case EJIT_OP_STI16: compile_sti16(f, j, i); break;
		case EJIT_OP_STI32: compile_sti32(f, j, i); break;
		case EJIT_OP_STI64: compile_sti64(f, j, i); break;
		case EJIT_OP_STIF: compile_stif(f, j, i); break;
		case EJIT_OP_STID: compile_stid(f, j, i); break;

		case EJIT_OP_STXI8: compile_stxi8(f, j, i); break;
		case EJIT_OP_STXI16: compile_stxi16(f, j, i); break;
		case EJIT_OP_STXI32: compile_stxi32(f, j, i); break;
		case EJIT_OP_STXI64: compile_stxi64(f, j, i); break;
		case EJIT_OP_STXIF: compile_stxif(f, j, i); break;
		case EJIT_OP_STXID: compile_stxid(f, j, i); break;

		case EJIT_OP_STXR8: compile_stxr8(f, j, i); break;
		case EJIT_OP_STXR16: compile_stxr16(f, j, i); break;
		case EJIT_OP_STXR32: compile_stxr32(f, j, i); break;
		case EJIT_OP_STXR64: compile_stxr64(f, j, i); break;
		case EJIT_OP_STXRF: compile_stxrf(f, j, i); break;
		case EJIT_OP_STXRD: compile_stxrd(f, j, i); break;

		case EJIT_OP_LDI8: compile_ldi8(f, j, i); break;
		case EJIT_OP_LDI16: compile_ldi16(f, j, i); break;
		case EJIT_OP_LDI32: compile_ldi32(f, j, i); break;
		case EJIT_OP_LDI64: compile_ldi64(f, j, i); break;
		case EJIT_OP_LDIU8: compile_ldiu8(f, j, i); break;
		case EJIT_OP_LDIU16: compile_ldiu16(f, j, i); break;
		case EJIT_OP_LDIU32: compile_ldiu32(f, j, i); break;
		case EJIT_OP_LDIU64: compile_ldiu64(f, j, i); break;
		case EJIT_OP_LDIF: compile_ldif(f, j, i); break;
		case EJIT_OP_LDID: compile_ldid(f, j, i); break;

		case EJIT_OP_LDXI8: compile_ldxi8(f, j, i); break;
		case EJIT_OP_LDXI16: compile_ldxi16(f, j, i); break;
		case EJIT_OP_LDXI32: compile_ldxi32(f, j, i); break;
		case EJIT_OP_LDXI64: compile_ldxi64(f, j, i); break;
		case EJIT_OP_LDXIU8: compile_ldxiu8(f, j, i); break;
		case EJIT_OP_LDXIU16: compile_ldxiu16(f, j, i); break;
		case EJIT_OP_LDXIU32: compile_ldxiu32(f, j, i); break;
		case EJIT_OP_LDXIU64: compile_ldxiu64(f, j, i); break;
		case EJIT_OP_LDXIF: compile_ldxif(f, j, i); break;
		case EJIT_OP_LDXID: compile_ldxid(f, j, i); break;

		case EJIT_OP_LDXR8: compile_ldxr8(f, j, i); break;
		case EJIT_OP_LDXR16: compile_ldxr16(f, j, i); break;
		case EJIT_OP_LDXR32: compile_ldxr32(f, j, i); break;
		case EJIT_OP_LDXR64: compile_ldxr64(f, j, i); break;
		case EJIT_OP_LDXRU8: compile_ldxru8(f, j, i); break;
		case EJIT_OP_LDXRU16: compile_ldxru16(f, j, i); break;
		case EJIT_OP_LDXRU32: compile_ldxru32(f, j, i); break;
		case EJIT_OP_LDXRU64: compile_ldxru64(f, j, i); break;
		case EJIT_OP_LDXRF: compile_ldxrf(f, j, i); break;
		case EJIT_OP_LDXRD: compile_ldxrd(f, j, i); break;

		case EJIT_OP_COMR: compile_comr(f, j, i); break;

		case EJIT_OP_NEGR: compile_negr(f, j, i); break;
		case EJIT_OP_NEGR_F: compile_negr_f(f, j, i); break;
		case EJIT_OP_NEGR_D: compile_negr_d(f, j, i); break;

		case EJIT_OP_EXTR8: compile_extr8(f, j, i); break;
		case EJIT_OP_EXTR16: compile_extr16(f, j, i); break;
		case EJIT_OP_EXTR32: compile_extr32(f, j, i); break;
		case EJIT_OP_EXTRU8: compile_extru8(f, j, i); break;
		case EJIT_OP_EXTRU16: compile_extru16(f, j, i); break;
		case EJIT_OP_EXTRU32: compile_extru32(f, j, i); break;
		case EJIT_OP_EXTRF: compile_extrf(f, j, i); break;
		case EJIT_OP_EXTRD: compile_extrd(f, j, i); break;

		case EJIT_OP_TRUNCR_D_32: compile_truncr_d_32(f, j, i); break;
		case EJIT_OP_TRUNCR_D_64: compile_truncr_d_64(f, j, i); break;
		case EJIT_OP_TRUNCR_F_32: compile_truncr_f_32(f, j, i); break;
		case EJIT_OP_TRUNCR_F_64: compile_truncr_f_64(f, j, i); break;

		case EJIT_OP_SQRTR_F: compile_sqrtr_f(f, j, i); break;
		case EJIT_OP_SQRTR_D: compile_sqrtr_d(f, j, i); break;

		case EJIT_OP_EQR: compile_eqr(f, j, i); break;
		case EJIT_OP_EQR_F: compile_eqr_f(f, j, i); break;
		case EJIT_OP_EQR_D: compile_eqr_d(f, j, i); break;

		case EJIT_OP_NER: compile_ner(f, j, i); break;
		case EJIT_OP_NER_F: compile_ner_f(f, j, i); break;
		case EJIT_OP_NER_D: compile_ner_d(f, j, i); break;

		case EJIT_OP_GER: compile_ger(f, j, i); break;
		case EJIT_OP_GER_U: compile_ger_u(f, j, i); break;
		case EJIT_OP_GER_F: compile_ger_f(f, j, i); break;
		case EJIT_OP_GER_D: compile_ger_d(f, j, i); break;

		case EJIT_OP_GTR: compile_gtr(f, j, i); break;
		case EJIT_OP_GTR_U: compile_gtr_u(f, j, i); break;
		case EJIT_OP_GTR_F: compile_gtr_f(f, j, i); break;
		case EJIT_OP_GTR_D: compile_gtr_d(f, j, i); break;

		case EJIT_OP_BMCI: {
			compile_bmci(f, j, i, &relocs);
			resolve_top_reloc(j, &relocs, &addrs, ii);
			break;
		}

		case EJIT_OP_BMCR: {
			compile_bmcr(f, j, i, &relocs);
			resolve_top_reloc(j, &relocs, &addrs, ii);
			break;
		}

		case EJIT_OP_BMSI: {
			compile_bmsi(f, j, i, &relocs);
			resolve_top_reloc(j, &relocs, &addrs, ii);
			break;
		}

		case EJIT_OP_BMSR: {
			compile_bmsr(f, j, i, &relocs);
			resolve_top_reloc(j, &relocs, &addrs, ii);
			break;
		}

		case EJIT_OP_BEQR: {
			compile_beqr(f, j, i, &relocs);
			resolve_top_reloc(j, &relocs, &addrs, ii);
			break;
		}

		case EJIT_OP_BEQI: {
			compile_beqi(f, j, i, &relocs);
			resolve_top_reloc(j, &relocs, &addrs, ii);
			break;
		}

		case EJIT_OP_BEQR_F: {
			compile_beqr_f(f, j, i, &relocs);
			resolve_top_reloc(j, &relocs, &addrs, ii);
			break;
		}

		case EJIT_OP_BEQR_D: {
			compile_beqr_d(f, j, i, &relocs);
			resolve_top_reloc(j, &relocs, &addrs, ii);
			break;
		}

		case EJIT_OP_BNER: {
			compile_bner(f, j, i, &relocs);
			resolve_top_reloc(j, &relocs, &addrs, ii);
			break;
		}

		case EJIT_OP_BNEI: {
			compile_bnei(f, j, i, &relocs);
			resolve_top_reloc(j, &relocs, &addrs, ii);
			break;
		}

		case EJIT_OP_BNER_F: {
			compile_bner_f(f, j, i, &relocs);
			resolve_top_reloc(j, &relocs, &addrs, ii);
			break;
		}

		case EJIT_OP_BNER_D: {
			compile_bner_d(f, j, i, &relocs);
			resolve_top_reloc(j, &relocs, &addrs, ii);
			break;
		}

		case EJIT_OP_BGER: {
			compile_bger(f, j, i, &relocs);
			resolve_top_reloc(j, &relocs, &addrs, ii);
			break;
		}

		case EJIT_OP_BGER_U: {
			compile_bger_u(f, j, i, &relocs);
			resolve_top_reloc(j, &relocs, &addrs, ii);
			break;
		}

		case EJIT_OP_BGEI: {
			compile_bgei(f, j, i, &relocs);
			resolve_top_reloc(j, &relocs, &addrs, ii);
			break;
		}

		case EJIT_OP_BGEI_U: {
			compile_bgei_u(f, j, i, &relocs);
			resolve_top_reloc(j, &relocs, &addrs, ii);
			break;
		}

		case EJIT_OP_BGER_F: {
			compile_bger_f(f, j, i, &relocs);
			resolve_top_reloc(j, &relocs, &addrs, ii);
			break;
		}

		case EJIT_OP_BGER_D: {
			compile_bger_d(f, j, i, &relocs);
			resolve_top_reloc(j, &relocs, &addrs, ii);
			break;
		}

		case EJIT_OP_BGTR: {
			compile_bgtr(f, j, i, &relocs);
			resolve_top_reloc(j, &relocs, &addrs, ii);
			break;
		}

		case EJIT_OP_BGTR_U: {
			compile_bgtr_u(f, j, i, &relocs);
			resolve_top_reloc(j, &relocs, &addrs, ii);
			break;
		}

		case EJIT_OP_BGTI: {
			compile_bgti(f, j, i, &relocs);
			resolve_top_reloc(j, &relocs, &addrs, ii);
			break;
		}

		case EJIT_OP_BGTI_U: {
			compile_bgti_u(f, j, i, &relocs);
			resolve_top_reloc(j, &relocs, &addrs, ii);
			break;
		}

		case EJIT_OP_BGTR_F: {
			compile_bgtr_f(f, j, i, &relocs);
			resolve_top_reloc(j, &relocs, &addrs, ii);
			break;
		}

		case EJIT_OP_BGTR_D: {
			compile_bgtr_d(f, j, i, &relocs);
			resolve_top_reloc(j, &relocs, &addrs, ii);
			break;
		}

		case EJIT_OP_BLEI: {
			compile_blei(f, j, i, &relocs);
			resolve_top_reloc(j, &relocs, &addrs, ii);
			break;
		}

		case EJIT_OP_BLEI_U: {
			compile_blei_u(f, j, i, &relocs);
			resolve_top_reloc(j, &relocs, &addrs, ii);
			break;
		}

		case EJIT_OP_BLTI: {
			compile_blti(f, j, i, &relocs);
			resolve_top_reloc(j, &relocs, &addrs, ii);
			break;
		}

		case EJIT_OP_BLTI_U: {
			compile_blti_u(f, j, i, &relocs);
			resolve_top_reloc(j, &relocs, &addrs, ii);
			break;
		}

		case EJIT_OP_JMP: {
			compile_jmp(f, j, i, &relocs);
			resolve_top_reloc(j, &relocs, &addrs, ii);
			break;
		}

		case EJIT_OP_ARG_I: {
			jit_operand_t type = jit_operand_imm(JIT_OPERAND_ABI_WORD, i.r1);
			jit_operand_t arg = jit_operand_imm(jit_abi_from(i.r1), i.o);
			operands_append(&src, type);
			operands_append(&src, arg);
			operands_append(&direct, arg);

			jit_operand_t to[2] = {
				jit_operand_mem(JIT_OPERAND_ABI_WORD, JIT_SP,
				                type_offset(i)),
				jit_operand_mem(jit_abi_from(i.r1), JIT_SP,
				                arg_offset(i))
			};

			operands_append(&dst, to[0]);
			operands_append(&dst, to[1]);
			break;
		}

		case EJIT_OP_ARG_FI: {
			assert(false && "immediate floats (currently?) not supported");
			abort();
			break;
		}

		case EJIT_OP_ARG: {
			size_t r2 = gpr_stats_at(&f->gpr, i.r2)->rno;
			jit_operand_t type = jit_operand_imm(JIT_OPERAND_ABI_WORD, i.r1);

			jit_operand_t arg;
			if (r2 < physgpr_count()) {
				/* regular register */
				arg = jit_operand_gpr(jit_abi_from(i.r1),
				                      physgpr_at(r2));
			}
			else {
				/* stack location, note that we'll fix up the SP
				 * offset before doing the actual call */
				arg = jit_operand_mem(jit_abi_from(i.r1),
				                      JIT_SP, stack_loc(r2));
			}

			operands_append(&src, type);
			operands_append(&src, arg);
			operands_append(&direct, arg);

			jit_operand_t to[2] = {
				jit_operand_mem(JIT_OPERAND_ABI_WORD, JIT_SP,
				                type_offset(i)),
				jit_operand_mem(jit_abi_from(i.r1), JIT_SP,
				                arg_offset(i))
			};

			operands_append(&dst, to[0]);
			operands_append(&dst, to[1]);
			break;
		}

		case EJIT_OP_ARG_F: {
			size_t f2 = fpr_stats_at(&f->fpr, i.r2)->fno;
			jit_operand_t type = jit_operand_imm(JIT_OPERAND_ABI_WORD, i.r1);

			jit_operand_t arg;
			if (f2 < physfpr_count()) {
				/* regular register */
				arg = jit_operand_fpr(jit_abi_from(i.r1),
				                      physfpr_at(f2));
			}
			else {
				/* stack location, note that we'll fix up the SP
				 * offset before doing the actual call */
				arg = jit_operand_mem(jit_abi_from(i.r1),
				                      JIT_SP,
				                      stack_loc_f(f, f2));
			}

			operands_append(&src, type);
			operands_append(&src, arg);
			operands_append(&direct, arg);

			jit_operand_t to[2] = {
				jit_operand_mem(JIT_OPERAND_ABI_WORD, JIT_SP,
				                type_offset(i)),
				jit_operand_mem(jit_abi_from(i.r1), JIT_SP,
				                arg_offset(i))
			};

			operands_append(&dst, to[0]);
			operands_append(&dst, to[1]);
			break;
		}

		case EJIT_OP_ESCAPEI_L:
#if __WORDSIZE == 64
			  /* fallthrough */
#else
			  assert(0 && "trying to compile escapei_l on 32bit arch");
			  break;
#endif

		case EJIT_OP_ESCAPEI_D:
		case EJIT_OP_ESCAPEI_F:
		case EJIT_OP_ESCAPEI_I: {
			save_caller_save_regs(f, j);

			jit_operand_t args[2] = {
				jit_operand_imm(JIT_OPERAND_ABI_WORD,
				                operands_len(&src) / 2),
				/* compile_imm_call populates JIT_R0 with the
				 * argument stack address */
				jit_operand_gpr(JIT_OPERAND_ABI_POINTER, JIT_R0)
			};
			compile_imm_call(j, &src, &dst, (void *)(uintptr_t)i.o, 2, args);
			restore_caller_save_regs(f, j);

			operands_reset(&src);
			operands_reset(&dst);
			operands_reset(&direct);
			break;
		}

		case EJIT_OP_CALLI_L:
#if __WORDSIZE == 64
			 call = checked_run_l; goto calli;
#else
			  assert(0 && "trying to compile calli_l on 32bit arch");
			  break;
#endif

		case EJIT_OP_CALLI_F: { call = checked_run_f; goto calli; }
		case EJIT_OP_CALLI_D: { call = checked_run_d; goto calli; }
		case EJIT_OP_CALLI_I: { call = checked_run_i; goto calli;
calli:
			save_caller_save_regs(f, j);

			struct ejit_func *f = (struct ejit_func *)(uintptr_t)i.o;
			if (f && f->direct_call) {
				jit_calli(j, f->direct_call, operands_len(&direct), direct.buf);
				restore_caller_save_regs(f, j);

				operands_reset(&src);
				operands_reset(&dst);
				operands_reset(&direct);
				break;
			}

			jit_operand_t args[3] = {
				jit_operand_imm(JIT_OPERAND_ABI_POINTER, i.o),
				jit_operand_imm(JIT_OPERAND_ABI_WORD,
				                operands_len(&src) / 2),
				/* compile_imm_call populates JIT_R0 with the
				 * argument stack address */
				jit_operand_gpr(JIT_OPERAND_ABI_POINTER, JIT_R0)
			};
			compile_imm_call(j, &src, &dst, call, 3, args);
			restore_caller_save_regs(f, j);

			operands_reset(&src);
			operands_reset(&dst);
			operands_reset(&direct);
			break;
		}

		case EJIT_OP_RETVAL: compile_retval(f, j, i); break;
		case EJIT_OP_RETVAL_F: compile_retval_f(f, j, i); break;
		case EJIT_OP_RETVAL_D: compile_retval_d(f, j, i); break;

		case EJIT_OP_RETR: {
			jit_gpr_t r = getloc(f, j, i.r1, 0);
			/* R0 won't get overwritten by jit_leave_jit_abi */
			jit_movr(j, JIT_R0, r);
			jit_shrink_stack(j, stack);
			jit_leave_jit_abi(j, gprs, fprs, frame);
			jit_retr(j, JIT_R0);
			break;
		}

		case EJIT_OP_RETR_F: {
			jit_fpr_t r = getloc_f(f, j, i.r1, 0);
			jit_movr_f(j, JIT_F0, r);
			jit_shrink_stack(j, stack);
			jit_leave_jit_abi(j, gprs, fprs, frame);
			jit_retr_f(j, JIT_F0);
			break;
		}

		case EJIT_OP_RETR_D: {
			jit_fpr_t r = getloc_d(f, j, i.r1, 0);
			jit_movr_d(j, JIT_F0, r);
			jit_shrink_stack(j, stack);
			jit_leave_jit_abi(j, gprs, fprs, frame);
			jit_retr_d(j, JIT_F0);
			break;
		}

		case EJIT_OP_RETI: {
			jit_shrink_stack(j, stack);
			jit_leave_jit_abi(j, gprs, fprs, frame);
			jit_reti(j, i.o);
			break;
		}

		case EJIT_OP_END: {
			/* 'void' return */
			jit_shrink_stack(j, stack);
			jit_leave_jit_abi(j, gprs, fprs, frame);
			jit_reti(j, 0);
			break;
		}

		case EJIT_OP_PARAM_F: {
			size_t f2 = fpr_stats_at(&f->fpr, i.r2)->fno;

			jit_operand_t to;
			if (f2 < physfpr_count()) {
				/* regular register */
				to = jit_operand_fpr(jit_abi_from(i.r1),
				                     physfpr_at(f2));
			}
			else {
				/* stack location */
				to = jit_operand_mem(jit_abi_from(i.r1), JIT_SP,
				                     stack_loc_f(f, f2));
			}

			operands_append(&dst, to);
			break;
		}

		case EJIT_OP_PARAM: {
			size_t r2 = gpr_stats_at(&f->gpr, i.r2)->rno;

			jit_operand_t to;
			if (r2 < physgpr_count()) {
				/* regular register */
				to = jit_operand_gpr(jit_abi_from(i.r1),
				                     physgpr_at(r2));
			}
			else {
				/* stack location */
				to = jit_operand_mem(jit_abi_from(i.r1), JIT_SP,
				                     stack_loc(r2));
			}

			operands_append(&dst, to);
			break;
		}

		case EJIT_OP_START: {
			/* parameters should be done by now */
			jit_load_args(j, operands_len(&dst), dst.buf);
			/* reuse for arguments */
			operands_reset(&dst);
			break;
		}

		default: abort();
		}
	}

	assert(relocs_len(&relocs) == 0);

	operands_destroy(&src);
	operands_destroy(&dst);
	operands_destroy(&direct);
	relocs_destroy(&relocs);
	addrs_destroy(&addrs);

	if ((f->extern_call = jit_end(j, &size)))
		return 0;

	return size;
}

/* One allocation slot used while assigning locations to virtual registers.
 * Several registers may share a slot as long as they are not placed in it at
 * the same time, see calculate_alive(). */
struct alive_slot {
	/* index of the register currently occupying the slot, or -1 when
	 * nothing occupies it (the oneshot slot, or a 'gravestone' left by
	 * a register whose lifetime has ended) */
	long r;
	/* sum of the priorities of every register placed in this slot so far */
	size_t cost;
	/* index the slot had in the vector when it was created, i.e. before
	 * the vector gets sorted by cost */
	size_t idx;
	/* filled in after sorting: the element at vector position p holds the
	 * sorted position of the slot whose creation index was p */
	size_t remap;
};

/* instantiate the vector of slots; the code below uses the resulting
 * `struct alive` together with the alive_*() helpers */
#define VEC_NAME alive
#define VEC_TYPE struct alive_slot
#include "../vec.h"

/* Comparator used with alive_sort(): orders slots by descending spill cost.
 * Returns -1 when `a` is more costly than `b`, 1 when it is less costly and
 * 0 when both cost the same. */
static int spill_cost_sort(struct alive_slot *a, struct alive_slot *b)
{
	size_t lhs = a->cost;
	size_t rhs = b->cost;

	if (lhs == rhs)
		return 0;

	return lhs > rhs ? -1 : 1;
}

/* slightly more parameters than I would like but I guess it's fine */
static void calculate_alive(struct alive *alive, size_t idx,
		size_t prio, size_t start, size_t end, size_t *rno,
		void *regs, int (*dead)(void *regs, size_t idx, size_t start))
{
	/* single-shot registers go in the special reserved slot */
	if (end <= start + 1) {
		*rno = 0;

		struct alive_slot *a = alive_at(alive, 0);
		a->cost += prio;
		return;
	}

	/* kill registers whose lifetime has ended */
	long max_cost_idx = -1;
	size_t max_cost = 0;
	long counter = 0;
	foreach_vec(ai, *alive) {
		/* skip oneshot */
		if (ai == 0)
			goto next;

		struct alive_slot *a = alive_at(alive, ai);
		if (a->r >= 0 && dead(regs, a->r, start))
			a->r = -1; /* gravestone */

		if (a->r < 0 && a->cost > max_cost) {
			max_cost = a->cost;
			max_cost_idx = counter;
		}

next:
		counter++;
	}

	/* there's a suitable slot for us */
	if (max_cost_idx >= 0) {
		*rno = max_cost_idx;

		struct alive_slot *a = alive_at(alive, max_cost_idx);
		a->cost += prio;
		a->r = idx;
		return;
	}

	*rno = alive_len(alive);
	struct alive_slot a = {
		.cost = prio,
		.r = idx,
		.idx = *rno
	};
	alive_append(alive, a);
}

static int gpr_dead(void *regs, size_t idx, size_t start)
{
	struct gpr_stats *gprs = regs;
	return gpr_stats_at(gprs, idx)->end <= start;
}

static void linear_gpr_alloc(struct ejit_func *f)
{
	foreach_vec(gi, f->gpr) {
		gpr_stats_at(&f->gpr, gi)->rno = gi;
	}
}

/* there's a fair bit of repetition between this and the gpr case, hmm */
static void assign_gprs(struct ejit_func *f)
{
	/* everything fits into registers, no need to start optimizing */
	if (gpr_stats_len(&f->gpr) <= physgpr_count())
		return linear_gpr_alloc(f);

	struct alive alive = alive_create(gpr_stats_len(&f->gpr));

	/* special oneshot register class */
	struct alive_slot a = {.r = -1, .cost = 0, .idx = 0};
	alive_append(&alive, a);

	foreach_vec(gi, f->gpr) {
		struct gpr_stat *gpr = gpr_stats_at(&f->gpr, gi);
		calculate_alive(&alive, gi,
				gpr->prio, gpr->start, gpr->end, &gpr->rno,
				&f->gpr, gpr_dead);
	}

	/* sort so that the highest spill cost register classes are at the front and
	 * as such more likely to be placed in registers */
	alive_sort(&alive, (vec_comp_t)spill_cost_sort);

	/* update remapping info */
	for(size_t i = 0; i < alive_len(&alive); ++i) {
		struct alive_slot *a = alive_at(&alive, i);
		alive_at(&alive, a->idx)->remap = i;
	}

	/* remap locations */
	for (size_t i = 0; i < gpr_stats_len(&f->gpr); ++i) {
		struct gpr_stat *gpr = gpr_stats_at(&f->gpr, i);
		struct alive_slot *a = alive_at(&alive, gpr->rno);
		gpr->rno = a->remap;
	}

	alive_destroy(&alive);
}

static int fpr_dead(void *regs, size_t idx, size_t start)
{
	struct fpr_stats *fprs = regs;
	return fpr_stats_at(fprs, idx)->end <= start;
}

static void linear_fpr_alloc(struct ejit_func *f)
{
	foreach_vec(fi, f->fpr) {
		fpr_stats_at(&f->fpr, fi)->fno = fi;
	}
}

static void assign_fprs(struct ejit_func *f)
{
	/* everything fits into registers, no need to start optimizing */
	if (fpr_stats_len(&f->fpr) <= physfpr_count())
		return linear_fpr_alloc(f);

	struct alive alive = alive_create(fpr_stats_len(&f->fpr));

	/* special oneshot register class */
	struct alive_slot a = {.r = -1, .cost = 0, .idx = 0};
	alive_append(&alive, a);

	foreach_vec(fi, f->fpr) {
		struct fpr_stat *fpr = fpr_stats_at(&f->fpr, fi);
		calculate_alive(&alive, fi,
				fpr->prio, fpr->start, fpr->end, &fpr->fno,
				&f->fpr, fpr_dead);
	}

	/* sort so that the highest spill cost register classes are at the front and
	 * as such more likely to be placed in registers */
	alive_sort(&alive, (vec_comp_t)spill_cost_sort);

	/* update remapping info */
	for(size_t i = 0; i < alive_len(&alive); ++i) {
		struct alive_slot *a = alive_at(&alive, i);
		alive_at(&alive, a->idx)->remap = i;
	}

	/* remap locations */
	for (size_t i = 0; i < fpr_stats_len(&f->fpr); ++i) {
		struct fpr_stat *fpr = fpr_stats_at(&f->fpr, i);
		struct alive_slot *a = alive_at(&alive, fpr->fno);
		fpr->fno = a->remap;
	}

	alive_destroy(&alive);
}

/* Round `a` up to the nearest multiple of `n`.
 *
 * Values that are already a multiple of `n` are returned unchanged. `n == 0`
 * has no meaningful alignment, in which case `a` is returned as-is instead of
 * dividing by zero.
 */
static size_t align_up(size_t a, size_t n)
{
	if (n == 0)
		return a;

	size_t rem = a % n;
	if (rem == 0)
		return a;

	/* add what is missing to reach the next multiple, not the remainder
	 * itself */
	return a + (n - rem);
}

/* Compile `f` into native code.
 *
 * `use_64` requests 64bit code, which is refused on targets where
 * __WORDSIZE is not 64. When `im_scawed` is set the arena is mapped without
 * execute permission while code is being written and is switched to
 * read + execute afterwards; otherwise it is mapped read/write/execute from
 * the start (see alloc_arena()).
 *
 * Returns true on success, in which case f->arena and f->size describe the
 * mapping holding the generated code. Returns false if compilation could not
 * be set up or memory could not be mapped/protected.
 */
bool ejit_compile(struct ejit_func *f, bool use_64, bool im_scawed)
{
	(void)use_64;
#if __WORDSIZE != 64
	/* can't compile 64bit code on 32bit systems, give up early */
	if (use_64)
		return false;
#endif
	if (!init_jit())
		return false;

	assign_gprs(f);
	assign_fprs(f);

	/* sysconf reports errors as -1, which must not be turned into a
	 * gigantic size_t */
	long queried_pagesize = sysconf(_SC_PAGE_SIZE);
	if (queried_pagesize <= 0)
		return false;

	/* the main overhead of compilation seems to be the syscall to mmap a
	 * new arena, I might look into allocating a big buffer at once and
	 * caching it to be reused later, might allow us to compile many small
	 * functions faster */
	jit_state_t *j = jit_new_state(NULL, NULL);
	/* an assert would vanish with NDEBUG, so fail explicitly instead */
	if (!j)
		return false;

	void *arena = NULL;
	size_t pagesize = (size_t)queried_pagesize;
	size_t size = pagesize;

	/* compile_fn_body() returns 0 once the code fit into the arena,
	 * otherwise use the size it reports to retry with a bigger arena */
	while (1) {
		arena = alloc_arena(size, im_scawed);
		if (arena == MAP_FAILED) {
			jit_destroy_state(j);
			return false;
		}

		size_t required_size = compile_fn_body(f, j, arena, size);
		if (required_size == 0)
			break;

		free_arena(arena, size);
		size = align_up(required_size + pagesize, pagesize);
	}

	jit_destroy_state(j);

	/* the cautious mapping was writable but not executable, flip it now
	 * that the code is complete */
	if (im_scawed && mprotect(arena, size, PROT_EXEC | PROT_READ)) {
		free_arena(arena, size);
		return false;
	}

	f->arena = arena;
	f->size = size;
	return true;
}