#include <math.h>
#include <stdio.h>
#include <sys/mman.h>

#include <posthaste/compile.h>
#include <posthaste/lower.h>
#include <posthaste/utils.h>
#include <posthaste/date.h>

#include <lightening/lightening.h>

/* we have a couple assumptions running through this code:
 * 1) For each branch/jump, there's always a corresponding label
 * 2) The global value array is in the virtual register JIT_V0
 * 3) At the start of a function, we immediately allocate f->max_sp slots, and
 * slot indexes are relative to JIT_SP
 *
 * Lightening is a very cool library that provides a kind of virtual assembler,
 * as in there's a number of virtual registers that map to physical registers
 * and each virtual instruction maps to some number of physical instructions.
 * This means that the user is required to handle register allocations, (virtual)
 * code generation and possible optimizations, but with some clever tricks we can
 * pretty much ignore these limitations completely.
 *
 * Each bytecode instruction gets compiled down to some number of virtual
 * Lightening instructions that perform the same task 'as if' the instruction was
 * being interpreted. This means each operation first loads its arguments from
 * memory to virtual registers, does whatever with them, and stores the result into
 * some memory location. This is pretty inefficient but very easy to implement, and
 * even a poor JIT is almost always faster than the best bytecode.
 *
 * As for register allocations, as long as we ensure that each bytecode instruction
 * effectively 'frees' the registers it used once it's been executed, we don't
 * need to worry about it. The values are safely stored in memory where the
 * input/output locs can fetch them later, and there are no register-register
 * dependencies between bytecode instructions.
 *
 * Each operation uses at most two caller-save registers to perform its operation.
 * The only callee-save register we use is JIT_V0 for the global array, so
 * we can ask Lightening to not save the other callee-save registers, saving a bit of
 * overhead when entering/exiting functions.
 *
 * There's no particularly good way to view the generated machine code, as it would
 * require adding something like binutils' BFD library as a dependency. You can
 * always just dump each code arena to a file and look at it later, but I
 * haven't implemented it.
 */

static void compile_fn(struct fn *f);

static void *alloc_arena(size_t size)
{
	return mmap(NULL, size,
	            PROT_EXEC | PROT_READ | PROT_WRITE,
	            MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
}

static void free_arena(void *arena, size_t size)
{
	munmap(arena, size);
}

static jit_operand_t formal_at(size_t i)
{
	return jit_operand_mem(JIT_OPERAND_ABI_INT64, JIT_SP,
	                       i * sizeof(int64_t));
}

/* load value from global array/stack location l and place it into virtual register r */
static void get(jit_state_t *j, jit_gpr_t r, struct loc l)
{
	if (l.l) {
		jit_ldxi_l(j, r, JIT_SP, l.o * sizeof(int64_t));
		return;
	}

	jit_ldxi_l(j, r, JIT_V0, l.o * sizeof(int64_t));
}

/* store value from virtual register r to global array/stack location l*/
static void put(jit_state_t *j, jit_gpr_t r, struct loc l)
{
	if (l.l) {
		jit_stxi_l(j, l.o * sizeof(int64_t), JIT_SP, r);
		return;
	}

	jit_stxi_l(j, l.o * sizeof(int64_t), JIT_V0, r);
}

static void compile_add(jit_state_t *j, struct insn i)
{
	get(j, JIT_R0, i.i0);
	get(j, JIT_R1, i.i1);
	jit_addr(j, JIT_R0, JIT_R0, JIT_R1);
	put(j, JIT_R0, i.o);
}

static void compile_sub(jit_state_t *j, struct insn i)
{
	get(j, JIT_R0, i.i0);
	get(j, JIT_R1, i.i1);
	jit_subr(j, JIT_R0, JIT_R0, JIT_R1);
	put(j, JIT_R0, i.o);
}

static void compile_mul(jit_state_t *j, struct insn i)
{
	get(j, JIT_R0, i.i0);
	get(j, JIT_R1, i.i1);
	jit_mulr(j, JIT_R0, JIT_R0, JIT_R1);
	put(j, JIT_R0, i.o);
}

static void compile_div(jit_state_t *j, struct insn i)
{
	get(j, JIT_R0, i.i0);
	get(j, JIT_R1, i.i1);
	jit_divr(j, JIT_R0, JIT_R0, JIT_R1);
	put(j, JIT_R0, i.o);
}

static void compile_move(jit_state_t *j, struct insn i)
{
	get(j, JIT_R0, i.i0);
	put(j, JIT_R0, i.o);
}

static void compile_const(jit_state_t *j, struct insn i)
{
	jit_movi(j, JIT_R0, i.v);
	put(j, JIT_R0, i.o);
}

static void compile_eq(jit_state_t *j, struct insn i)
{
	/* Lightening doesn't really have any register-register comparison
	 * instructions, so implement them as local branches */
	get(j, JIT_R0, i.i0);
	get(j, JIT_R1, i.i1);
	jit_reloc_t branch = jit_beqr(j, JIT_R0, JIT_R1);

	jit_movi(j, JIT_R0, 0);
	jit_reloc_t jump = jit_jmp(j);
	jit_patch_there(j, branch, jit_address(j));

	jit_movi(j, JIT_R0, 1);
	jit_patch_there(j, jump, jit_address(j));

	put(j, JIT_R0, i.o);
}

static void compile_lt(jit_state_t *j, struct insn i)
{
	get(j, JIT_R0, i.i0);
	get(j, JIT_R1, i.i1);
	jit_reloc_t branch = jit_bltr(j, JIT_R0, JIT_R1);

	jit_movi(j, JIT_R0, 0);
	jit_reloc_t jump = jit_jmp(j);
	jit_patch_there(j, branch, jit_address(j));

	jit_movi(j, JIT_R0, 1);
	jit_patch_there(j, jump, jit_address(j));

	put(j, JIT_R0, i.o);
}

/* helper function for compiling PRINT_DATE, Lightening doesn't handle variadics
 * so do the heavy lifting with GCC (or Clang or whatever) */
static void print_date(int64_t date)
{
	char str[11];
	date_to_string(str, (ph_date_t)date);
	printf("%s", str);
}

static void compile_print_date(jit_state_t *j, struct insn i)
{
	get(j, JIT_R0, i.i0);
	jit_calli_1(j, print_date,
	            jit_operand_gpr(JIT_OPERAND_ABI_INT64, JIT_R0));
}

static void print_int(int64_t i)
{
	printf("%lli", (long long)i);
}

static void compile_print_int(jit_state_t *j, struct insn i)
{
	get(j, JIT_R0, i.i0);
	jit_calli_1(j, print_int,
	            jit_operand_gpr(JIT_OPERAND_ABI_INT64, JIT_R0));
}

static void print_string(const char *s)
{
	printf("%s", s);
}

static void compile_print_string(jit_state_t *j, struct insn i)
{
	get(j, JIT_R0, i.i0);
	jit_calli_1(j, print_string,
	            jit_operand_gpr(JIT_OPERAND_ABI_POINTER, JIT_R0));
}

static void print_bool(int64_t b)
{
	printf("%s", b ? "true" : "false");
}

static void compile_print_bool(jit_state_t *j, struct insn i)
{
	get(j, JIT_R0, i.i0);
	jit_calli_1(j, print_bool,
	            jit_operand_gpr(JIT_OPERAND_ABI_INT64, JIT_R0));
}

static void compile_print_newline(jit_state_t *j, struct insn i)
{
	UNUSED(i);
	jit_calli_1(j, putchar, jit_operand_imm(JIT_OPERAND_ABI_INT8, '\n'));
}

static void compile_print_space(jit_state_t *j, struct insn i)
{
	UNUSED(i);
	jit_calli_1(j, putchar, jit_operand_imm(JIT_OPERAND_ABI_INT8, ' '));
}

static void compile_label(jit_state_t *j, size_t ii, struct vec *labels)
{
	/* add a mapping between instruction index (ii) and current address */
	vect_at(jit_addr_t, *labels, ii) = jit_address(j);
}

struct reloc_helper {
	jit_reloc_t r;
	size_t to;
};

static void compile_j(jit_state_t *j, struct insn i, struct vec *relocs)
{
	jit_reloc_t r = jit_jmp(j);
	struct reloc_helper h = {.r = r, .to = i.v};
	vect_append(struct reloc_helper, *relocs, &h);
}

static void compile_b(jit_state_t *j, struct insn i, struct vec *relocs)
{
	get(j, JIT_R0, i.i0);
	jit_reloc_t r = jit_bnei(j, JIT_R0, 0);
	struct reloc_helper h = {.r = r, .to = i.v};
	vect_append(struct reloc_helper, *relocs, &h);
}

static void compile_bz(jit_state_t *j, struct insn i, struct vec *relocs)
{
	get(j, JIT_R0, i.i0);
	jit_reloc_t r = jit_beqi(j, JIT_R0, 0);
	struct reloc_helper h = {.r = r, .to = i.v};
	vect_append(struct reloc_helper, *relocs, &h);
}

static void compile_arg(struct insn i, struct vec *params)
{
	jit_operand_t operand;
	struct loc l = i.i0;
	/* Lightening allows us to specify memory locations as arguments to
	 * function calls, and will automatically move the value from memory to
	 * the correct register according to the ABI */
	if (l.l) {
		operand = jit_operand_mem(JIT_OPERAND_ABI_INT64, JIT_SP,
		                          l.o * sizeof(int64_t));
	}
	else {
		operand = jit_operand_mem(JIT_OPERAND_ABI_INT64, JIT_V0,
		                          l.o * sizeof(int64_t));
	}

	vect_append(jit_operand_t, *params, &operand);
}

static void compile_call(jit_state_t *j, struct insn i, struct vec *params)
{
	jit_operand_t gp = jit_operand_gpr(JIT_OPERAND_ABI_INT64, JIT_V0);
	vect_append(jit_operand_t, *params, &gp);

	struct fn *f = find_fn(i.v);
	assert(f);

	/* if we haven't already compiled this function, do it now */
	if (!f->arena)
		compile_fn(f);

	jit_calli(j, f->arena, vec_len(params), params->buf);
	/* argument instructions are only ever directly before a call
	 * instruction, so there's no risk of messing up some other call's
	 * args and we can avoid allocating a new vector */
	vec_reset(params);
}

static void compile_retval(jit_state_t *j, struct insn i)
{
	jit_retval_l(j, JIT_R0);
	put(j, JIT_R0, i.o);
}

static void compile_neg(jit_state_t *j, struct insn i)
{
	get(j, JIT_R0, i.i0);
	jit_negr(j, JIT_R0, JIT_R0);
	put(j, JIT_R0, i.o);
}

static void compile_today(jit_state_t *j, struct insn i)
{
	jit_calli_0(j, current_date);
	jit_retval_l(j, JIT_R0);
	put(j, JIT_R0, i.o);
}

/* lots of these date handling instructions I've implemented as helper functions, mostly
 * just for my own sanity */
static ph_date_t date_add(int64_t i0, int64_t i1)
{
	struct tm time = tm_from_date((ph_date_t)i0);
	time.tm_mday += i1;
	mktime(&time);

	return date_from_tm(time);
}

static void compile_date_add(jit_state_t *j, struct insn i)
{
	get(j, JIT_R0, i.i0);
	get(j, JIT_R1, i.i1);
	jit_calli_2(j, date_add,
	            jit_operand_gpr(JIT_OPERAND_ABI_INT64, JIT_R0),
	            jit_operand_gpr(JIT_OPERAND_ABI_INT64, JIT_R1));

	jit_retval_l(j, JIT_R0);
	put(j, JIT_R0, i.o);
}

static ph_date_t date_sub(int64_t i0, int64_t i1)
{
	struct tm time = tm_from_date((ph_date_t)i0);
	time.tm_mday -= i1;
	mktime(&time);

	return date_from_tm(time);
}

static void compile_date_sub(jit_state_t *j, struct insn i)
{
	get(j, JIT_R0, i.i0);
	get(j, JIT_R1, i.i1);
	jit_calli_2(j, date_sub,
	            jit_operand_gpr(JIT_OPERAND_ABI_INT64, JIT_R0),
	            jit_operand_gpr(JIT_OPERAND_ABI_INT64, JIT_R1));

	jit_retval_l(j, JIT_R0);
	put(j, JIT_R0, i.o);
}

static int64_t date_diff(int64_t i0, int64_t i1)
{
	struct tm tm0 = tm_from_date((ph_date_t)i0);
	struct tm tm1 = tm_from_date((ph_date_t)i1);

	time_t t0 = mktime(&tm0);
	time_t t1 = mktime(&tm1);

	double seconds = difftime(t0, t1);
	return round(seconds / 86400);
}

static void compile_date_diff(jit_state_t *j, struct insn i)
{
	get(j, JIT_R0, i.i0);
	get(j, JIT_R1, i.i1);

	jit_calli_2(j, date_diff,
	            jit_operand_gpr(JIT_OPERAND_ABI_INT64, JIT_R0),
	            jit_operand_gpr(JIT_OPERAND_ABI_INT64, JIT_R1));

	jit_retval_l(j, JIT_R0);
	put(j, JIT_R0, i.o);
}

static int64_t load_year(int64_t i0)
{
	unsigned year = 0;
	date_split((ph_date_t)i0, &year, NULL, NULL);
	return year;
}

static void compile_load_year(jit_state_t *j, struct insn i)
{
	get(j, JIT_R0, i.i0);
	jit_calli_1(j, load_year,
	            jit_operand_gpr(JIT_OPERAND_ABI_INT64, JIT_R0));

	jit_retval_l(j, JIT_R0);
	put(j, JIT_R0, i.o);
}

static int64_t load_month(int64_t i0)
{
	unsigned month = 0;
	date_split((ph_date_t)i0, NULL, &month, NULL);
	return month;
}

static void compile_load_month(jit_state_t *j, struct insn i)
{
	get(j, JIT_R0, i.i0);
	jit_calli_1(j, load_month,
	            jit_operand_gpr(JIT_OPERAND_ABI_INT64, JIT_R0));

	jit_retval_l(j, JIT_R0);
	put(j, JIT_R0, i.o);
}

static int64_t load_day(int64_t i0)
{
	unsigned day = 0;
	date_split((ph_date_t)i0, NULL, NULL, &day);
	return day;
}

static void compile_load_day(jit_state_t *j, struct insn i)
{
	get(j, JIT_R0, i.i0);
	jit_calli_1(j, load_day,
	            jit_operand_gpr(JIT_OPERAND_ABI_INT64, JIT_R0));

	jit_retval_l(j, JIT_R0);
	put(j, JIT_R0, i.o);
}

static int64_t load_weekday(int64_t i0)
{
	struct tm time = tm_from_date((ph_date_t)i0);
	return (int64_t)time.tm_wday;
}

static void compile_load_weekday(jit_state_t *j, struct insn i)
{
	get(j, JIT_R0, i.i0);
	jit_calli_1(j, load_weekday,
	            jit_operand_gpr(JIT_OPERAND_ABI_INT64, JIT_R0));

	jit_retval_l(j, JIT_R0);
	put(j, JIT_R0, i.o);
}

static int64_t load_weeknum(int64_t i0)
{
	struct tm time = tm_from_date((ph_date_t)i0);
	return time.tm_yday / 7;
}

static void compile_load_weeknum(jit_state_t *j, struct insn i)
{
	get(j, JIT_R0, i.i0);
	jit_calli_1(j, load_weeknum,
	            jit_operand_gpr(JIT_OPERAND_ABI_INT64, JIT_R0));

	jit_retval_l(j, JIT_R0);
	put(j, JIT_R0, i.o);
}

static int64_t store_year(int64_t i0, int64_t i1)
{
	unsigned month = 0;
	unsigned day = 0;
	date_split((ph_date_t)i0, NULL, &month, &day);
	return date_from_numbers(i1, month, day);
}

static void compile_store_year(jit_state_t *j, struct insn i)
{
	get(j, JIT_R0, i.i0);
	get(j, JIT_R1, i.i1);
	jit_calli_2(j, store_year,
	            jit_operand_gpr(JIT_OPERAND_ABI_INT64, JIT_R0),
	            jit_operand_gpr(JIT_OPERAND_ABI_INT64, JIT_R1));

	jit_retval_l(j, JIT_R0);
	put(j, JIT_R0, i.o);
}

static int64_t store_month(int64_t i0, int64_t i1)
{
	unsigned year = 0;
	unsigned day = 0;
	date_split((ph_date_t)i0, &year, NULL, &day);
	return date_from_numbers(year, i1, day);
}

static void compile_store_month(jit_state_t *j, struct insn i)
{
	get(j, JIT_R0, i.i0);
	get(j, JIT_R1, i.i1);
	jit_calli_2(j, store_month,
	            jit_operand_gpr(JIT_OPERAND_ABI_INT64, JIT_R0),
	            jit_operand_gpr(JIT_OPERAND_ABI_INT64, JIT_R1));

	jit_retval_l(j, JIT_R0);
	put(j, JIT_R0, i.o);
}

static int64_t store_day(int64_t i0, int64_t i1)
{
	unsigned year = 0;
	unsigned month = 0;
	date_split((ph_date_t)i0, &year, &month, NULL);
	return date_from_numbers(year, month, i1);
}

static void compile_store_day(jit_state_t *j, struct insn i)
{
	get(j, JIT_R0, i.i0);
	get(j, JIT_R1, i.i1);
	jit_calli_2(j, store_day,
	            jit_operand_gpr(JIT_OPERAND_ABI_INT64, JIT_R0),
	            jit_operand_gpr(JIT_OPERAND_ABI_INT64, JIT_R1));

	jit_retval_l(j, JIT_R0);
	put(j, JIT_R0, i.o);
}

static size_t compile_fn_body(struct fn *f, jit_state_t *j)
{
	jit_begin(j, f->arena, f->size);
	size_t frame = jit_enter_jit_abi(j, 1, 0, 0);
	/* easy off-by-one to miss, damn */
	size_t stack = jit_align_stack(j, (f->max_sp + 1) * sizeof(int64_t));

	/* move parameters to where they belong */
	struct vec params = vec_create(sizeof(jit_operand_t));
	for (size_t i = 0; i < f->params; ++i) {
		jit_operand_t operand = formal_at(i);
		vec_append(&params, &operand);
	}

	jit_operand_t gp = jit_operand_gpr(JIT_OPERAND_ABI_INT64, JIT_V0);
	vec_append(&params, &gp);
	jit_load_args(j, f->params + 1, params.buf);

	/* reuse vector as argument vector */
	vec_reset(&params);

	struct vec relocs = vec_create(sizeof(struct reloc_helper));
	struct vec labels = vec_create(sizeof(jit_addr_t));
	vec_reserve(&labels, vec_len(&f->insns));

	foreach_vec(ii, f->insns) {
		struct insn i = vect_at(struct insn, f->insns, ii);
		switch (i.k) {
		case LABEL: compile_label(j, ii, &labels); break;
		case J: compile_j(j, i, &relocs); break;
		case B: compile_b(j, i, &relocs); break;
		case BZ: compile_bz(j, i, &relocs); break;
		case ADD: compile_add(j, i); break;
		case SUB: compile_sub(j, i); break;
		case MUL: compile_mul(j, i); break;
		case DIV: compile_div(j, i); break;
		case MOVE: compile_move(j, i); break;
		case CONST: compile_const(j, i); break;
		case EQ: compile_eq(j, i); break;
		case LT: compile_lt(j, i); break;

		case PRINT_DATE: compile_print_date(j, i); break;
		case PRINT_INT: compile_print_int(j, i); break;
		case PRINT_STRING: compile_print_string(j, i); break;
		case PRINT_BOOL: compile_print_bool(j, i); break;
		case PRINT_NEWLINE: compile_print_newline(j, i); break;
		case PRINT_SPACE: compile_print_space(j, i); break;

		case ARG: compile_arg(i, &params); break;
		case CALL: compile_call(j, i, &params); break;
		case RETVAL: compile_retval(j, i); break;

		case NEG: compile_neg(j, i); break;

		case TODAY: compile_today(j, i); break;

		case DATE_ADD: compile_date_add(j, i); break;
		case DATE_SUB: compile_date_sub(j, i); break;
		case DATE_DIFF: compile_date_diff(j, i); break;

		case LOAD_YEAR: compile_load_year(j, i); break;
		case LOAD_MONTH: compile_load_month(j, i); break;
		case LOAD_DAY: compile_load_day(j, i); break;
		case LOAD_WEEKDAY: compile_load_weekday(j, i); break;
		case LOAD_WEEKNUM: compile_load_weeknum(j, i); break;

		case STORE_YEAR: compile_store_year(j, i); break;
		case STORE_MONTH: compile_store_month(j, i); break;
		case STORE_DAY: compile_store_day(j, i); break;

		case RET: {
			get(j, JIT_R0, i.i0);
			jit_shrink_stack(j, stack);
			jit_leave_jit_abi(j, 1, 0, frame);
			jit_retr(j, JIT_R0);
			break;
		}

		case STOP: {
			jit_shrink_stack(j, stack);
			jit_leave_jit_abi(j, 1, 0, frame);
			jit_ret(j);
			break;
		}
		}
	}

	/* fix relocs */
	foreach_vec(ri, relocs) {
		struct reloc_helper h = vect_at(struct reloc_helper, relocs,
		                                ri);
		jit_addr_t a = vect_at(jit_addr_t, labels, h.to);
		jit_reloc_t r = h.r;

		assert(a);
		jit_patch_there(j, r, a);
	}

	vec_destroy(&relocs);
	vec_destroy(&labels);
	vec_destroy(&params);

	size_t size = 0;
	void *p = jit_end(j, &size);
	/* if we had enough room to finish compilation, return 0 to signify that
	 * we don't have to try to compile anything again */
	if (p)
		return 0;

	/* otherwise, return how many bytes lightening estimates that we would
	 * need to succesfully compile this function */
	return size;
}

static void compile_fn(struct fn *f)
{
	init_jit();
	jit_state_t *j = jit_new_state(NULL, NULL);
	assert(j);

	void *arena_base = NULL;
	size_t arena_size = 4096;
	/* technically there should probably be a limit to how many times we
	 * attempt compilation, but in practice we're unlikely to ever need it.
	 */
	while (1) {
		arena_base = alloc_arena(arena_size);
		assert(arena_base &&
		       "failed allocating executable arena, aborting");

		f->arena = arena_base;
		f->size = arena_size;

		size_t required_size = compile_fn_body(f, j);
		if (required_size == 0)
			break;

		free_arena(arena_base, arena_size);
		/* give at least one page more than necessary to be on the safe
		 * side */
		arena_size = required_size + 4096;
	}

	jit_destroy_state(j);
}

void compile()
{
	struct fn *f = find_fn(0);
	compile_fn(f);
}