lib94/lib94/warrior.cpp

761 lines
25 KiB
C++

#include <lib94/lib94.hpp>
#include <functional>
#include <cassert>
#include <fstream>
#include <cctype>
#include <memory>
#include <map>
//warrior compilation takes place in four stages:
// stage 1: preprocessing
// in this stage, comments and blank lines are extracted,
// and inline macros (equ's) are found and are processed.
// special comments are also found and processed (although
// assertion comments aren't actually checked until stage 3).
// stage 2: parsing
// in this step, the cleaned lines from stage 1 are parsed
// into opcodes, modifiers, addressing modes, and field expressions.
// the field expressions are a tree that can have labels and such
// in them. the expressions aren't evaluated until stage 4.
// labels are also found and stored at this stage.
// stage 3: assertion checking
// now that we have the values of the labels, we can check all
// of the assertions that were found in stage 1. note that all
// assertions run as though they were on the first line for label
// purposes, although this does not affect differences of labels.
// stage 4: field expression evaluation
// now the field expressions are evaluated, using the label information.
namespace lib94 {
//printable opcode names. instruction_to_string indexes this table with instr.op,
//so the order of the entries matters.
static const std::string opcode_strings[] = {
  "dat",
  "mov",
  "add",
  "sub",
  "mul",
  "div",
  "mod",
  "jmp",
  "jmz",
  "jmn",
  "djn",
  "seq",
  "sne",
  "slt",
  "spl",
  "nop"
};
//printable modifier names. instruction_to_string indexes this table with instr.mod.
static const std::string modifier_strings[] = {
  "a",
  "b",
  "ab",
  "ba",
  "f",
  "x",
  "i"
};
//one character per addressing mode. instruction_to_string indexes this
//with instr.amode and instr.bmode. the trailing nul keeps it a c string.
static const char mode_chars[] = {'#', '$', '*', '@', '{', '<', '}', '>', '\0'};
//renders an instruction as "op.mod Xa, Yb", where X and Y are the addressing
//mode characters and a and b are the two field numbers printed in decimal.
std::string instruction_to_string(const instruction &instr) {
  std::string text = opcode_strings[instr.op];
  text += '.';
  text += modifier_strings[instr.mod];
  text += ' ';
  text += mode_chars[instr.amode];
  text += std::to_string(instr.anumber);
  text += ", ";
  text += mode_chars[instr.bmode];
  text += std::to_string(instr.bnumber);
  return text;
}
[[noreturn]] static void throw_compiler_exception(unsigned source_line_number, std::string message) {
compiler_exception ex;
ex.source_line_number = source_line_number;
ex.message = message;
throw ex;
}
//the signed type that expressions are evaluated in before the results are
//reduced into core range.
using intermediate_t = long;
//maps each label name to the offset recorded for it in stage 2.
using label_offset_set = std::map<std::string, number_t>;
//this abstract class represents expression fields extracted in stage 2 and evaluated in stage 4.
class expr {
public:
  //expressions are owned and destroyed through std::unique_ptr<expr>, so the
  //destructor has to be virtual for the derived classes' members to be released.
  virtual ~expr() = default;
  //the source line this expression came from, used for error reporting.
  unsigned source_line_number;
  //the offset of the line that the expression appears on. label_expr subtracts
  //this from the label's offset so that labels evaluate relative to that line.
  number_t offset;
  //computes the value of the expression using the given label offsets.
  //implementations report failures by throwing a compiler_exception.
  virtual intermediate_t evaluate(const label_offset_set &label_offsets) const = 0;
};
//this abstract class represents assertions extracted in stage 1 and evaluated in stage 3.
class assertion {
public:
  //assertions are owned and destroyed through std::unique_ptr<assertion>, so the
  //destructor has to be virtual for the derived classes' members to be released.
  virtual ~assertion() = default;
  //the source line the assertion comment was on, used for error reporting.
  unsigned source_line_number;
  //returns true if the assertion holds for the given label offsets.
  virtual bool check(const label_offset_set &label_offsets) const = 0;
};
//a piece of text together with the number of the source line it came from.
//stage 1 produces these and stage 2 consumes them. line numbers count from 1.
struct string_with_line_number {
std::string string;
unsigned source_line_number;
};
//internal state used by stage 1. also records some information passed to later stages.
struct preprocessor_state {
//set from the ";name" comment, if one has been seen
std::optional<std::string> name;
//set from the ";author" comment, if one has been seen
std::optional<std::string> author;
//equ definitions seen so far: macro name -> replacement text
std::map<std::string, std::string> macros;
//assertions parsed from ";assert" comments, checked in stage 3
std::vector<std::unique_ptr<assertion>> assertions;
//number of the line being processed, counting from 1
unsigned current_source_line_number = 1;
};
//returns from with leading and trailing space characters removed.
//only ' ' is stripped. a string with nothing but spaces gives "".
static std::string remove_spaces(std::string from) {
  const size_t begin = from.find_first_not_of(' ');
  if (begin == std::string::npos)
    return std::string();
  //there is at least one non-space, so find_last_not_of cannot fail here.
  const size_t end = from.find_last_not_of(' ') + 1;
  from.erase(end);
  from.erase(0, begin);
  return from;
}
//returns a copy of from with every character passed through tolower.
static std::string to_lower_case(std::string from) {
  //tolower needs a value representable as unsigned char. passing a plain char
  //that happens to be negative (any byte >= 0x80) is undefined behaviour.
  for (char &ch : from)
    ch = static_cast<char>(std::tolower(static_cast<unsigned char>(ch)));
  return from;
}
//must be non-empty, first character must be letter or underscore,
//every other character must be letter, number, or underscore
static bool is_valid_identifier(std::string candidate) {
  //the <cctype> classifiers need a value representable as unsigned char.
  //a negative plain char (any byte >= 0x80) would be undefined behaviour.
  if (candidate.empty() || std::isdigit(static_cast<unsigned char>(candidate[0])))
    return false;
  for (const char ch : candidate)
    if (ch != '_' && !std::isalnum(static_cast<unsigned char>(ch)))
      return false;
  return true;
}
//an expression made of two sub-expressions joined by a binary operator.
class binop_expr : public expr {
public:
  std::unique_ptr<expr> left_expression;
  std::unique_ptr<expr> right_expression;
  //combines the values of the two sides
  std::function<intermediate_t (intermediate_t, intermediate_t)> operation;
  //when set, a right-hand side of zero is reported as an error instead of
  //being handed to operation
  bool division_or_mod;
  //evaluates the left side, then the right side, then applies operation.
  intermediate_t evaluate(const label_offset_set &label_offsets) const override {
    const intermediate_t lhs = left_expression->evaluate(label_offsets);
    const intermediate_t rhs = right_expression->evaluate(label_offsets);
    const bool divides_by_zero = division_or_mod && rhs == 0;
    if (divides_by_zero)
      throw_compiler_exception(source_line_number, "division or modulo by zero");
    return operation(lhs, rhs);
  }
};
//an expression made of a unary operator applied to one sub-expression.
class unop_expr : public expr {
public:
  std::unique_ptr<expr> child_expression;
  //transforms the value of the child
  std::function<intermediate_t (intermediate_t)> operation;
  //evaluates the child and applies operation to the result.
  intermediate_t evaluate(const label_offset_set &label_offsets) const override {
    return operation(child_expression->evaluate(label_offsets));
  }
};
//an expression that is a reference to a label.
class label_expr : public expr {
public:
  std::string the_label;
  //returns the label's offset minus the offset stored in this expression.
  //both are converted to intermediate_t first so the difference may be negative.
  //throws a compiler_exception if the label is not in label_offsets.
  intermediate_t evaluate(const label_offset_set &label_offsets) const override {
    const auto found = label_offsets.find(the_label);
    if (found != label_offsets.end()) {
      const intermediate_t label_position = (intermediate_t)found->second;
      const intermediate_t own_position = (intermediate_t)offset;
      return label_position - own_position;
    }
    throw_compiler_exception(source_line_number, "unknown label");
  }
};
//an expression that is a constant. evaluating it ignores the labels
//and just returns the stored value.
class literal_expr : public expr {
public:
intermediate_t value;
intermediate_t evaluate(const label_offset_set &) const override {
return value;
}
};
//forward declaration: maybe_parse_binop_expression and parse_expression call each other.
static std::unique_ptr<expr> parse_expression(number_t offset, std::string from, unsigned source_line_number);
//when scanning left from a + or -, hitting one of these characters before any
//operand means the + or - is treated as unary (see maybe_parse_binop_expression).
static const std::string plus_minus_scan_left_special("+-*/%(");
//maps each binary operator character to the function that computes it.
static const std::map<char, std::function<intermediate_t (intermediate_t, intermediate_t)>> binary_operator_conversion = {
  {'+', [](intermediate_t lhs, intermediate_t rhs) {return lhs + rhs;}},
  {'-', [](intermediate_t lhs, intermediate_t rhs) {return lhs - rhs;}},
  {'*', [](intermediate_t lhs, intermediate_t rhs) {return lhs * rhs;}},
  {'/', [](intermediate_t lhs, intermediate_t rhs) {return lhs / rhs;}},
  {'%', [](intermediate_t lhs, intermediate_t rhs) {return lhs % rhs;}}
};
//searches right to left, outside of parentheses, for any character in connectives.
//on the first one found, returns a new binary expression split at that position.
//if none is found, returns an empty unique_ptr.
//a + or - that has no operand to its left is taken to be unary and is skipped.
static std::unique_ptr<expr> maybe_parse_binop_expression(number_t offset, std::string from, const char *connectives, unsigned source_line_number) {
  //decides whether the + or - at sign_pos is binary. scanning left over spaces:
  //an operator, an open parenthesis, or the start of the string means unary.
  //anything else means there is an operand, so the sign is binary.
  const auto has_left_operand = [&from](int sign_pos) {
    for (int scan = sign_pos - 1; scan >= 0; --scan) {
      if (plus_minus_scan_left_special.find(from[scan]) != std::string::npos)
        return false;
      if (from[scan] != ' ')
        return true;
    }
    return false;
  };
  unsigned depth = 0;
  for (int pos = (int)from.size() - 1; pos >= 0; --pos) {
    const char here = from[pos];
    //we walk backwards, so a close parenthesis opens a nested region.
    if (here == ')') {
      ++depth;
      continue;
    }
    if (here == '(') {
      --depth;
      continue;
    }
    if (depth != 0)
      continue;
    //is this character one of the requested connectives?
    const char *match = connectives;
    while (*match && *match != here)
      ++match;
    if (!*match)
      continue;
    if ((here == '+' || here == '-') && !has_left_operand(pos))
      continue;
    //this is our connective. parse the left side first, then the right side.
    auto node = std::make_unique<binop_expr>();
    node->left_expression = parse_expression(offset, from.substr(0, pos), source_line_number);
    node->right_expression = parse_expression(offset, from.substr(pos + 1), source_line_number);
    node->operation = binary_operator_conversion.find(here)->second;
    node->division_or_mod = here == '/' || here == '%';
    node->source_line_number = source_line_number;
    node->offset = offset;
    return node;
  }
  return {};
}
//parses an expression in stage 1 or 2 to be evaluated in stage 3 or 4.
//offset is stored in every node of the resulting tree; label nodes subtract it
//when they are evaluated. forms are tried in this order:
// 1. a binary + or - outside of parentheses (tried first so it binds loosest)
// 2. a binary *, / or % outside of parentheses
// 3. a fully parenthesized expression
// 4. a unary + or -
// 5. a label
// 6. an unsigned decimal literal
//throws a compiler_exception if the text matches none of these.
static std::unique_ptr<expr> parse_expression(number_t offset, std::string from, unsigned source_line_number) {
  auto binop_expression = maybe_parse_binop_expression(offset, from, "+-", source_line_number);
  if (binop_expression)
    return binop_expression;
  binop_expression = maybe_parse_binop_expression(offset, from, "*/%", source_line_number);
  if (binop_expression)
    return binop_expression;
  from = remove_spaces(from);
  if (from.starts_with('(') && from.ends_with(')'))
    return parse_expression(offset, from.substr(1, from.size() - 2), source_line_number);
  if (from.starts_with('+'))
    return parse_expression(offset, from.substr(1), source_line_number);
  if (from.starts_with('-')) {
    auto unop_expression = std::make_unique<unop_expr>();
    unop_expression->child_expression = parse_expression(offset, from.substr(1), source_line_number);
    unop_expression->operation = [](intermediate_t x) {return -x;};
    unop_expression->source_line_number = source_line_number;
    unop_expression->offset = offset;
    return unop_expression;
  }
  if (is_valid_identifier(from)) {
    auto label_expression = std::make_unique<label_expr>();
    label_expression->the_label = from;
    label_expression->source_line_number = source_line_number;
    label_expression->offset = offset;
    return label_expression;
  }
  //parse straight into intermediate_t. going through a narrower unsigned first
  //would silently truncate large literals. a leading sign cannot reach this
  //point because the unary cases above have already consumed it.
  size_t value_length = 0;
  intermediate_t value = 0;
  try {
    value = std::stol(from, &value_length);
  }
  //on failure value_length stays 0, which is rejected just below.
  catch (const std::invalid_argument &) {}
  catch (const std::out_of_range &) {}
  //only accept the literal if the whole string was consumed.
  if (value_length == from.size() && value_length) {
    auto literal_expression = std::make_unique<literal_expr>();
    literal_expression->value = value;
    literal_expression->source_line_number = source_line_number;
    literal_expression->offset = offset;
    return literal_expression;
  }
  throw_compiler_exception(source_line_number, "unknown expression form");
}
class comparison_assertion : public assertion {
public:
std::unique_ptr<expr> left_expression;
std::unique_ptr<expr> right_expression;
std::function<bool (intermediate_t, intermediate_t)> f;
bool check(const label_offset_set &label_offsets) const override {
intermediate_t left = left_expression->evaluate(label_offsets);
intermediate_t right = right_expression->evaluate(label_offsets);
return f(left, right);
}
};
//maps the text of each comparison operator to the function that computes it.
static const std::map<std::string, std::function<bool (intermediate_t, intermediate_t)>> comparison_conversion = {
  {"==", [](intermediate_t lhs, intermediate_t rhs) {return lhs == rhs;}},
  {">=", [](intermediate_t lhs, intermediate_t rhs) {return lhs >= rhs;}},
  {"<=", [](intermediate_t lhs, intermediate_t rhs) {return lhs <= rhs;}},
  {"!=", [](intermediate_t lhs, intermediate_t rhs) {return lhs != rhs;}},
  {">",  [](intermediate_t lhs, intermediate_t rhs) {return lhs > rhs;}},
  {"<",  [](intermediate_t lhs, intermediate_t rhs) {return lhs < rhs;}}
};
//parses an assertion in stage 1 to be evaluated in stage 3.
//the text is split at its leftmost comparison operator, and both sides are
//parsed as expressions with offset 0. throws a compiler_exception if there
//is no comparison operator or if either side is not a valid expression.
static std::unique_ptr<assertion> parse_assertion(std::string from, unsigned source_line_number) {
  //the map iterates in key order, which puts "<" before "<=" and ">" before ">=".
  //taking the first hit would split "a <= b" at "<" and leave "= b" on the right.
  //so pick the leftmost operator, and the longest one among those starting there.
  auto best = comparison_conversion.end();
  size_t best_pos = std::string::npos;
  for (auto candidate = comparison_conversion.begin(); candidate != comparison_conversion.end(); ++candidate) {
    const size_t pos = from.find(candidate->first);
    if (pos == std::string::npos)
      continue;
    //pos == best_pos implies that best is already set, since pos is not npos here.
    if (pos < best_pos || (pos == best_pos && candidate->first.size() > best->first.size())) {
      best = candidate;
      best_pos = pos;
    }
  }
  if (best == comparison_conversion.end())
    throw_compiler_exception(source_line_number, "unknown assertion expression form");
  std::string left = from.substr(0, best_pos);
  std::string right = from.substr(best_pos + best->first.size());
  auto a = std::make_unique<comparison_assertion>();
  a->left_expression = parse_expression(0, left, source_line_number);
  a->right_expression = parse_expression(0, right, source_line_number);
  a->f = best->second;
  a->source_line_number = source_line_number;
  return a;
}
//this is the driver for stage 1. if stop_at is empty, it processes all of the remaining lines. if stop_at contains a string,
//it processes lines until it hits a line that looks like that after processing (ignoring case), and returns with the state
//pointing to that line. processing starts at state.current_source_line_number.
//this function also stores information found from special comments, and finds equs. the processing done by this function is roughly:
// 1. find and replace previous equs in this line.
// 2. remove (and process special) comments.
// 3. if this line is blank, go to the next one.
// 4. if this line is the stop_at line, return.
// 5. if this line is an equ, store it and then go to the next one.
// 6. if this line is a for, recurse starting at the next line with stop_at set to rof,
//    append the collected lines to output the requested number of times,
//    then go to the line after the rof.
// 7. finally, if we reach this step, store the processed line in output.
//throws a compiler_exception on malformed input, or if the source ends while stop_at is still expected.
static void preprocess(const std::vector<std::string> &source_lines, preprocessor_state &state, std::vector<string_with_line_number> &output, std::optional<std::string> stop_at = {}) {
  //the loop increments before looking at a line, so step back one first.
  --state.current_source_line_number;
  while (true) {
    ++state.current_source_line_number;
    //line numbers count from 1, so size + 1 means we ran off the end.
    if (state.current_source_line_number == source_lines.size() + 1) {
      if (stop_at)
        throw_compiler_exception(state.current_source_line_number, "end of source where " + *stop_at + " expected");
      return;
    }
    std::string line = source_lines[state.current_source_line_number - 1];
    //replace macros:
    for (const auto &macro_def : state.macros) {
      size_t from = 0;
      while (true) {
        size_t pos = line.find(macro_def.first, from);
        if (pos == std::string::npos)
          break;
        line.replace(pos, macro_def.first.size(), macro_def.second);
        //resume after the inserted text so that a macro whose content
        //contains its own name cannot loop forever.
        from = pos + macro_def.second.size();
      }
    }
    //check for comment:
    size_t semicolon_pos = line.find(';');
    if (semicolon_pos != std::string::npos) {
      std::string comment = remove_spaces(line.substr(semicolon_pos + 1));
      std::string lower_case_comment = to_lower_case(comment);
      line = line.substr(0, semicolon_pos);
      //the keyword is matched ignoring case, but the payload keeps its original case.
      if (lower_case_comment.starts_with("name ")) {
        if (state.name)
          throw_compiler_exception(state.current_source_line_number, "duplicate name comment");
        state.name = remove_spaces(comment.substr(5));
      }
      else if (lower_case_comment.starts_with("author ")) {
        if (state.author)
          throw_compiler_exception(state.current_source_line_number, "duplicate author comment");
        state.author = remove_spaces(comment.substr(7));
      }
      else if (lower_case_comment.starts_with("assert "))
        state.assertions.push_back(parse_assertion(comment.substr(7), state.current_source_line_number));
    }
    //if it's blank, go to the next one:
    line = remove_spaces(line);
    if (line == "")
      continue;
    std::string lower_case_line = to_lower_case(line);
    //if we have a stop_at and this is that, then consume and stop.
    //compared without regard to case, like the for and equ keywords below.
    if (stop_at && lower_case_line == to_lower_case(*stop_at))
      return;
    //check for equ:
    size_t equ_pos = lower_case_line.find(" equ ");
    if (equ_pos != std::string::npos) {
      std::string macro_name = line.substr(0, equ_pos);
      std::string macro_content = line.substr(equ_pos + 5);
      if (!is_valid_identifier(macro_name))
        throw_compiler_exception(state.current_source_line_number, "bad macro name");
      if (!state.macros.insert({macro_name, macro_content}).second)
        throw_compiler_exception(state.current_source_line_number, "duplicate macro");
      continue;
    }
    //check for for:
    if (lower_case_line.starts_with("for ")) {
      std::string for_arg = remove_spaces(line.substr(4));
      size_t count_length = 0;
      unsigned count = 0;
      try {
        count = std::stoul(for_arg, &count_length);
      }
      //on failure count_length stays 0, which is rejected just below.
      catch (const std::invalid_argument &) {}
      catch (const std::out_of_range &) {}
      if (count_length != for_arg.size() || !count_length)
        throw_compiler_exception(state.current_source_line_number, "bad for argument");
      //collect the body once, starting at the line after the for.
      //when the recursion returns, the state points at the rof line.
      std::vector<string_with_line_number> for_contents;
      ++state.current_source_line_number;
      preprocess(source_lines, state, for_contents, "rof");
      for (unsigned i = 0; i < count; ++i)
        for (const auto &piece : for_contents)
          output.push_back(piece);
      continue;
    }
    //just a normal line:
    output.push_back({.string = line, .source_line_number = state.current_source_line_number});
  }
}
//one instruction as parsed in stage 2. the opcode, modifier and addressing
//modes are final; the two field expressions are evaluated later, in stage 4.
struct parsed_line {
opcode op;
modifier mod;
mode amode;
mode bmode;
std::unique_ptr<expr> aexpr;
std::unique_ptr<expr> bexpr;
};
//maps lower-case opcode names to opcodes. "cmp" is accepted as another name for seq.
static const std::map<std::string, opcode> opcode_conversion = {
  {"dat", DAT},
  {"mov", MOV},
  {"add", ADD},
  {"sub", SUB},
  {"mul", MUL},
  {"div", DIV},
  {"mod", MOD},
  {"jmp", JMP},
  {"jmz", JMZ},
  {"jmn", JMN},
  {"djn", DJN},
  {"seq", SEQ},
  {"sne", SNE},
  {"slt", SLT},
  {"spl", SPL},
  {"nop", NOP},
  {"cmp", SEQ}
};
//maps lower-case modifier names to modifiers.
static const std::map<std::string, modifier> modifier_conversion = {
  {"a", A},
  {"b", B},
  {"ab", AB},
  {"ba", BA},
  {"f", F},
  {"x", X},
  {"i", I}
};
//maps the character that can prefix a field to the addressing mode it selects.
static const std::map<char, mode> mode_conversion = {
  {'#', IMMEDIATE},
  {'$', DIRECT},
  {'*', A_INDIRECT},
  {'@', B_INDIRECT},
  {'{', A_DECREMENT},
  {'<', B_DECREMENT},
  {'}', A_INCREMENT},
  {'>', B_INCREMENT}
};
//parses one instruction field into an addressing mode and an expression.
//a leading mode character selects the mode; without one the mode is DIRECT.
//if no expression text is left after the mode character (or the field is
//empty altogether), the expression is "0".
static void parse_field(number_t offset, std::string from, mode &mode, std::unique_ptr<expr> &expr, unsigned source_line_number) {
  std::string expression_text = from;
  mode = DIRECT;
  if (!from.empty()) {
    const auto found = mode_conversion.find(from.front());
    if (found != mode_conversion.end()) {
      mode = found->second;
      expression_text = from.substr(1);
    }
  }
  if (expression_text.empty())
    expression_text = "0";
  expr = parse_expression(offset, expression_text, source_line_number);
}
//some information collected in stage 2
struct parser_state {
//the offset recorded for every label seen so far
label_offset_set label_offsets;
//the expression given to org/end, or empty if there has not been one
std::unique_ptr<expr> org_expr;
};
//the driver for stage 2.
//if the line given in from has an instruction, that is put into into, and true is returned.
//otherwise, false is returned. either way, any labels that are found are also stored.
//additionally, any orgs/ends are processed and stored.
static bool maybe_parse_line(const string_with_line_number &from, parsed_line &into, number_t current_offset, parser_state &state) {
std::string remainder = from.string;
while (true) {
if (remainder == "")
return false;
size_t potential_opcode_end = remainder.find_first_of(" .");
if (potential_opcode_end == std::string::npos)
potential_opcode_end = remainder.size();
std::string potential_opcode = remainder.substr(0, potential_opcode_end);
std::string lower_case_potential_opcode = to_lower_case(potential_opcode);
remainder = remove_spaces(remainder.substr(potential_opcode_end));
if (lower_case_potential_opcode == "org" || lower_case_potential_opcode == "end") {
if (state.org_expr)
throw_compiler_exception(from.source_line_number, "duplicate org/end");
state.org_expr = parse_expression(current_offset, remainder, from.source_line_number);
return false;
}
auto opcode_result = opcode_conversion.find(lower_case_potential_opcode);
if (opcode_result == opcode_conversion.end()) {
//maybe we're a label
if (!is_valid_identifier(potential_opcode))
throw_compiler_exception(from.source_line_number, "bad label or opcode");
if (!state.label_offsets.insert({potential_opcode, current_offset}).second)
throw_compiler_exception(from.source_line_number, "duplicate label");
continue;
}
into.op = opcode_result->second;
break;
}
//got an opcode :)
//now check for a modifier
bool have_modifier = false;
if (remainder.size() > 0 && remainder[0] == '.') {
remainder = remove_spaces(remainder.substr(1));
have_modifier = true;
size_t modifier_end = remainder.find(' ');
if (modifier_end == std::string::npos)
modifier_end = remainder.size();
std::string modifier = to_lower_case(remainder.substr(0, modifier_end));
remainder = remove_spaces(remainder.substr(modifier_end));
auto modifier_result = modifier_conversion.find(modifier);
if (modifier_result == modifier_conversion.end())
throw_compiler_exception(from.source_line_number, "bad modifier");
into.mod = modifier_result->second;
}
//field time
size_t comma_pos = remainder.find(',');
std::string a_field = comma_pos == std::string::npos ? remainder : remove_spaces(remainder.substr(0, comma_pos));
std::string b_field = comma_pos == std::string::npos ? "" : remove_spaces(remainder.substr(comma_pos + 1));
parse_field(current_offset, a_field, into.amode, into.aexpr, from.source_line_number);
parse_field(current_offset, b_field, into.bmode, into.bexpr, from.source_line_number);
//if we didn't get a modifier before, determine default
if (!have_modifier)
switch (into.op) {
case DAT:
into.mod = F;
break;
case MOV:
case SEQ:
case SNE:
if (into.amode == IMMEDIATE)
into.mod = AB;
else if (into.bmode == IMMEDIATE)
into.mod = B;
else
into.mod = I;
break;
case ADD:
case SUB:
case MUL:
case DIV:
case MOD:
if (into.amode == IMMEDIATE)
into.mod = AB;
else if (into.bmode == IMMEDIATE)
into.mod = B;
else
into.mod = F;
break;
case SLT:
if (into.amode == IMMEDIATE)
into.mod = AB;
else
into.mod = B;
break;
case JMP:
case JMZ:
case JMN:
case DJN:
case SPL:
case NOP:
into.mod = B;
break;
}
//we got an instruction :)
return true;
}
//compiles redcode source text into a warrior by running the four stages in order.
//the returned warrior is allocated on the heap and handed over to the caller.
//throws a compiler_exception if any stage fails, if the source has no name or
//author comment, or if an assertion does not hold.
warrior *compile_warrior(std::string source) {
  //reduces a value into the range 0 to LIB94_CORE_SIZE - 1. this is done in
  //intermediate_t, before narrowing to number_t, so that negative values wrap
  //around the core correctly whatever the signedness or width of number_t is.
  const auto wrap_to_core = [](intermediate_t value) -> number_t {
    const intermediate_t core_size = LIB94_CORE_SIZE;
    return static_cast<number_t>((value % core_size + core_size) % core_size);
  };
  //split the source at newlines, walking an index through it instead of
  //repeatedly copying the rest of the string. a trailing newline does not
  //produce an extra empty line.
  std::vector<std::string> source_lines;
  size_t line_start = 0;
  while (line_start < source.size()) {
    size_t line_end = source.find('\n', line_start);
    if (line_end == std::string::npos)
      line_end = source.size();
    std::string line = source.substr(line_start, line_end - line_start);
    line_start = line_end + 1;
    //the later stages only ever look for spaces, so normalize tabs and carriage returns.
    for (char &ch : line)
      if (ch == '\t' || ch == '\r')
        ch = ' ';
    source_lines.push_back(line);
  }
  //got lines, time to preprocess
  preprocessor_state pp_state;
  pp_state.macros.insert({"CORESIZE", std::to_string(LIB94_CORE_SIZE)});
  std::vector<string_with_line_number> preprocessed_lines;
  preprocess(source_lines, pp_state, preprocessed_lines);
  if (!pp_state.name)
    throw_compiler_exception(pp_state.current_source_line_number, "no name comment");
  if (!pp_state.author)
    throw_compiler_exception(pp_state.current_source_line_number, "no author comment");
  //now line parsing
  parser_state p_state;
  std::vector<parsed_line> parsed_lines;
  //only lines that produce an instruction advance the offset
  unsigned offset = 0;
  for (const string_with_line_number &line : preprocessed_lines) {
    parsed_line p_line;
    if (maybe_parse_line(line, p_line, offset, p_state)) {
      parsed_lines.push_back(std::move(p_line));
      ++offset;
    }
  }
  //stage 3: check assertions
  for (const auto &assertion : pp_state.assertions)
    if (!assertion->check(p_state.label_offsets))
      throw_compiler_exception(assertion->source_line_number, "assertion failed");
  //stage 4: evaluate expressions
  std::unique_ptr<warrior> w = std::make_unique<warrior>();
  for (const auto &line : parsed_lines) {
    instruction i;
    i.op = line.op;
    i.mod = line.mod;
    i.amode = line.amode;
    i.bmode = line.bmode;
    i.anumber = wrap_to_core(line.aexpr->evaluate(p_state.label_offsets));
    i.bnumber = wrap_to_core(line.bexpr->evaluate(p_state.label_offsets));
    w->instructions.push_back(i);
  }
  //stage 5 ;)
  //the org expression was evaluated relative to the offset stored in it,
  //so add that offset back to get a position relative to the first instruction.
  if (p_state.org_expr) {
    const intermediate_t relative_org = p_state.org_expr->evaluate(p_state.label_offsets);
    w->org = wrap_to_core(relative_org + static_cast<intermediate_t>(p_state.org_expr->offset));
  }
  w->name = *pp_state.name;
  w->author = *pp_state.author;
  return w.release();
}
}