From 014062cd1e034badfebe86951bb055f3fc224f3c Mon Sep 17 00:00:00 2001
From: Jason Gross
Date: Mon, 14 Aug 2017 12:21:51 -0400
Subject: Update scheduler to know about implicit mulx arg

Now it prefers putting together mulx with the same implicit arg
(approximated as the same variable with the lower number).
---
 etc/compile-by-zinc/femulScheduled.log  | 132 ++++++++++++++++----------------
 etc/compile-by-zinc/heuristic-search.py |  66 +++++++++++-----
 2 files changed, 111 insertions(+), 87 deletions(-)

(limited to 'etc/compile-by-zinc')

diff --git a/etc/compile-by-zinc/femulScheduled.log b/etc/compile-by-zinc/femulScheduled.log
index 6c9281748..40559671b 100644
--- a/etc/compile-by-zinc/femulScheduled.log
+++ b/etc/compile-by-zinc/femulScheduled.log
@@ -1,74 +1,74 @@
INPUT: (x10, x11, x9, x7, x5, x18, x19, x17, x15, x13)
uint128_t x20 = (uint128_t) x5 * x13; // MULX r64,r64,r64, start: 0, end: 4
-uint128_t x21 = (uint128_t) x5 * x15; // MULX r64,r64,r64, start: 0, end: 4
-uint128_t x22 = (uint128_t) x7 * x13; // MULX r64,r64,r64, start: 1, end: 5
+uint128_t x21 = (uint128_t) x5 * x15; // MULX r64,r64,r64, start: 1, end: 5
uint128_t x24 = (uint128_t) x5 * x17; // MULX r64,r64,r64, start: 2, end: 6
-uint128_t x25 = (uint128_t) x9 * x13; // MULX r64,r64,r64, start: 3, end: 7
-uint128_t x27 = (uint128_t) x7 * x15; // MULX r64,r64,r64, start: 4, end: 8
-uint128_t x23 = x21 + x22; // ADD; ADC(X), start: 5, end: 7
-uint128_t x29 = (uint128_t) x5 * x19; // MULX r64,r64,r64, start: 6, end: 10
-uint128_t x30 = (uint128_t) x11 * x13; // MULX r64,r64,r64, start: 6, end: 10
-uint128_t x26 = x24 + x25; // ADD; ADC(X), start: 7, end: 9
-uint128_t x32 = (uint128_t) x7 * x17; // MULX r64,r64,r64, start: 8, end: 12
-uint128_t x34 = (uint128_t) x9 * x15; // MULX r64,r64,r64, start: 8, end: 12
-uint128_t x28 = x26 + x27; // ADD; ADC(X), start: 9, end: 11
-uint128_t x36 = (uint128_t) x5 * x18; // MULX r64,r64,r64, start: 10, end: 14
-uint128_t x31 = x29 + x30; // ADD; ADC(X), start: 10, end: 12
-uint128_t x37 = (uint128_t) x10 * x13; // MULX r64,r64,r64, start: 11, end: 15
-uint128_t x39 = (uint128_t) x11 * x15; // MULX r64,r64,r64, start: 11, end: 15
-uint128_t x33 = x31 + x32; // ADD; ADC(X), start: 12, end: 14
-uint128_t x41 = (uint128_t) x7 * x19; // MULX r64,r64,r64, start: 13, end: 17
-uint128_t x43 = (uint128_t) x9 * x17; // MULX r64,r64,r64, start: 13, end: 17
-uint128_t x35 = x33 + x34; // ADD; ADC(X), start: 14, end: 16
-uint128_t x38 = x36 + x37; // ADD; ADC(X), start: 15, end: 17
+uint128_t x29 = (uint128_t) x5 * x19; // MULX r64,r64,r64, start: 3, end: 7
+uint128_t x36 = (uint128_t) x5 * x18; // MULX r64,r64,r64, start: 4, end: 8
+uint128_t x22 = (uint128_t) x7 * x13; // MULX r64,r64,r64, start: 5, end: 9
+uint128_t x27 = (uint128_t) x7 * x15; // MULX r64,r64,r64, start: 6, end: 10
+uint128_t x32 = (uint128_t) x7 * x17; // MULX r64,r64,r64, start: 7, end: 11
+uint128_t x41 = (uint128_t) x7 * x19; // MULX r64,r64,r64, start: 8, end: 12
+uint128_t x23 = x21 + x22; // ADD; ADC(X), start: 9, end: 11
+uint128_t x25 = (uint128_t) x9 * x13; // MULX r64,r64,r64, start: 9, end: 13
+uint128_t x34 = (uint128_t) x9 * x15; // MULX r64,r64,r64, start: 10, end: 14
+uint128_t x43 = (uint128_t) x9 * x17; // MULX r64,r64,r64, start: 11, end: 15
+uint128_t x30 = (uint128_t) x11 * x13; // MULX r64,r64,r64, start: 12, end: 16
+uint128_t x26 = x24 + x25; // ADD; ADC(X), start: 13, end: 15
+uint128_t x39 = (uint128_t) x11 * x15; // MULX r64,r64,r64, start: 13, end: 17
+uint128_t x37 = (uint128_t) x10 * x13; // MULX 
r64,r64,r64, start: 14, end: 18 +uint128_t x28 = x26 + x27; // ADD; ADC(X), start: 15, end: 17 uint64_t x45 = x10 * 0x13; // IMUL r64,r64,i, start: 15, end: 18 -uint64_t x46 = x7 * 0x13; // IMUL r64,r64,i, start: 15, end: 18 -uint128_t x40 = x38 + x39; // ADD; ADC(X), start: 16, end: 18 +uint128_t x31 = x29 + x30; // ADD; ADC(X), start: 16, end: 18 +uint64_t x48 = x11 * 0x13; // IMUL r64,r64,i, start: 16, end: 19 uint64_t x47 = x9 * 0x13; // IMUL r64,r64,i, start: 17, end: 20 -uint64_t x48 = x11 * 0x13; // IMUL r64,r64,i, start: 17, end: 20 -uint128_t x42 = x40 + x41; // ADD; ADC(X), start: 18, end: 20 +uint128_t x33 = x31 + x32; // ADD; ADC(X), start: 18, end: 20 +uint128_t x38 = x36 + x37; // ADD; ADC(X), start: 18, end: 20 +uint64_t x46 = x7 * 0x13; // IMUL r64,r64,i, start: 18, end: 21 uint128_t x49 = (uint128_t) x45 * x15; // MULX r64,r64,r64, start: 19, end: 23 -uint128_t x51 = (uint128_t) x46 * x18; // MULX r64,r64,r64, start: 19, end: 23 -uint128_t x44 = x42 + x43; // ADD; ADC(X), start: 20, end: 22 -uint128_t x53 = (uint128_t) x47 * x19; // MULX r64,r64,r64, start: 21, end: 25 -uint128_t x55 = (uint128_t) x48 * x17; // MULX r64,r64,r64, start: 21, end: 25 -uint128_t x50 = x20 + x49; // ADD; ADC(X), start: 22, end: 24 -uint128_t x57 = (uint128_t) x45 * x17; // MULX r64,r64,r64, start: 23, end: 27 -uint128_t x59 = (uint128_t) x47 * x18; // MULX r64,r64,r64, start: 23, end: 27 -uint128_t x52 = x50 + x51; // ADD; ADC(X), start: 24, end: 26 -uint128_t x61 = (uint128_t) x48 * x19; // MULX r64,r64,r64, start: 25, end: 29 -uint128_t x63 = (uint128_t) x45 * x19; // MULX r64,r64,r64, start: 25, end: 29 -uint128_t x54 = x52 + x53; // ADD; ADC(X), start: 26, end: 28 -uint128_t x58 = x23 + x57; // ADD; ADC(X), start: 27, end: 29 -uint128_t x65 = (uint128_t) x48 * x18; // MULX r64,r64,r64, start: 27, end: 31 -uint128_t x67 = (uint128_t) x45 * x18; // MULX r64,r64,r64, start: 27, end: 31 -uint128_t x56 = x54 + x55; // ADD; ADC(X), start: 28, end: 30 -uint128_t x60 = x58 + x59; // ADD; ADC(X), start: 29, end: 31 -uint128_t x62 = x60 + x61; // ADD; ADC(X), start: 29, end: 31 -uint128_t x64 = x28 + x63; // ADD; ADC(X), start: 31, end: 33 -uint64_t x69 = (uint64_t) (x56 >> 0x33); // SHRD r,r,i, start: 31, end: 34 -uint64_t x70 = (uint64_t) x56 & 0x7ffffffffffff; // AND, start: 31, end: 32 -uint128_t x66 = x64 + x65; // ADD; ADC(X), start: 31, end: 33 +uint128_t x35 = x33 + x34; // ADD; ADC(X), start: 20, end: 22 +uint128_t x40 = x38 + x39; // ADD; ADC(X), start: 20, end: 22 +uint128_t x53 = (uint128_t) x47 * x19; // MULX r64,r64,r64, start: 20, end: 24 +uint128_t x61 = (uint128_t) x48 * x19; // MULX r64,r64,r64, start: 21, end: 25 +uint128_t x42 = x40 + x41; // ADD; ADC(X), start: 22, end: 24 +uint128_t x63 = (uint128_t) x45 * x19; // MULX r64,r64,r64, start: 22, end: 26 +uint128_t x50 = x20 + x49; // ADD; ADC(X), start: 23, end: 25 +uint128_t x51 = (uint128_t) x46 * x18; // MULX r64,r64,r64, start: 23, end: 27 +uint128_t x44 = x42 + x43; // ADD; ADC(X), start: 24, end: 26 +uint128_t x59 = (uint128_t) x47 * x18; // MULX r64,r64,r64, start: 24, end: 28 +uint128_t x65 = (uint128_t) x48 * x18; // MULX r64,r64,r64, start: 25, end: 29 +uint128_t x55 = (uint128_t) x48 * x17; // MULX r64,r64,r64, start: 26, end: 30 +uint128_t x64 = x28 + x63; // ADD; ADC(X), start: 26, end: 28 +uint128_t x52 = x50 + x51; // ADD; ADC(X), start: 27, end: 29 +uint128_t x57 = (uint128_t) x45 * x17; // MULX r64,r64,r64, start: 27, end: 31 +uint128_t x67 = (uint128_t) x45 * x18; // MULX r64,r64,r64, start: 28, end: 32 +uint128_t 
x54 = x52 + x53; // ADD; ADC(X), start: 29, end: 31
+uint128_t x66 = x64 + x65; // ADD; ADC(X), start: 29, end: 31
+uint128_t x56 = x54 + x55; // ADD; ADC(X), start: 31, end: 33
+uint128_t x58 = x23 + x57; // ADD; ADC(X), start: 31, end: 33
+uint128_t x60 = x58 + x59; // ADD; ADC(X), start: 33, end: 35
uint128_t x68 = x35 + x67; // ADD; ADC(X), start: 33, end: 35
-uint128_t x71 = x69 + x62; // ADD; ADC(X), start: 33, end: 35
-uint64_t x72 = (uint64_t) (x71 >> 0x33); // SHRD r,r,i, start: 35, end: 38
-uint64_t x73 = (uint64_t) x71 & 0x7ffffffffffff; // AND, start: 37, end: 38
-uint128_t x74 = x72 + x66; // ADD; ADC(X), start: 37, end: 39
-uint64_t x75 = (uint64_t) (x74 >> 0x33); // SHRD r,r,i, start: 40, end: 43
-uint64_t x76 = (uint64_t) x74 & 0x7ffffffffffff; // AND, start: 42, end: 43
-uint128_t x77 = x75 + x68; // ADD; ADC(X), start: 42, end: 44
-uint64_t x78 = (uint64_t) (x77 >> 0x33); // SHRD r,r,i, start: 45, end: 48
-uint64_t x79 = (uint64_t) x77 & 0x7ffffffffffff; // AND, start: 47, end: 48
-uint128_t x80 = x78 + x44; // ADD; ADC(X), start: 47, end: 49
-uint64_t x81 = (uint64_t) (x80 >> 0x33); // SHRD r,r,i, start: 50, end: 53
-uint64_t x82 = (uint64_t) x80 & 0x7ffffffffffff; // AND, start: 52, end: 53
-uint64_t x83 = 0x13 * x81; // IMUL r64,r64,i, start: 52, end: 55
-uint64_t x84 = x70 + x83; // ADD, start: 55, end: 56
-uint64_t x85 = x84 >> 0x33; // SHR r,i, start: 58, end: 59
-uint64_t x86 = x84 & 0x7ffffffffffff; // AND, start: 59, end: 60
-uint64_t x87 = x85 + x73; // ADD, start: 59, end: 60
-uint64_t x88 = x87 >> 0x33; // SHR r,i, start: 60, end: 61
-uint64_t x89 = x87 & 0x7ffffffffffff; // AND, start: 61, end: 62
-uint64_t x90 = x88 + x76; // ADD, start: 61, end: 62
+uint64_t x69 = (uint64_t) (x56 >> 0x33); // SHRD r,r,i, start: 33, end: 36
+uint64_t x70 = (uint64_t) x56 & 0x7ffffffffffff; // AND, start: 33, end: 34
+uint128_t x62 = x60 + x61; // ADD; ADC(X), start: 35, end: 37
+uint128_t x71 = x69 + x62; // ADD; ADC(X), start: 37, end: 39
+uint64_t x72 = (uint64_t) (x71 >> 0x33); // SHRD r,r,i, start: 39, end: 42
+uint64_t x73 = (uint64_t) x71 & 0x7ffffffffffff; // AND, start: 39, end: 40
+uint128_t x74 = x72 + x66; // ADD; ADC(X), start: 42, end: 44
+uint64_t x75 = (uint64_t) (x74 >> 0x33); // SHRD r,r,i, start: 44, end: 47
+uint64_t x76 = (uint64_t) x74 & 0x7ffffffffffff; // AND, start: 44, end: 45
+uint128_t x77 = x75 + x68; // ADD; ADC(X), start: 47, end: 49
+uint64_t x78 = (uint64_t) (x77 >> 0x33); // SHRD r,r,i, start: 49, end: 52
+uint64_t x79 = (uint64_t) x77 & 0x7ffffffffffff; // AND, start: 49, end: 50
+uint128_t x80 = x78 + x44; // ADD; ADC(X), start: 52, end: 54
+uint64_t x81 = (uint64_t) (x80 >> 0x33); // SHRD r,r,i, start: 54, end: 57
+uint64_t x82 = (uint64_t) x80 & 0x7ffffffffffff; // AND, start: 54, end: 55
+uint64_t x83 = 0x13 * x81; // IMUL r64,r64,i, start: 57, end: 60
+uint64_t x84 = x70 + x83; // ADD, start: 60, end: 61
+uint64_t x85 = x84 >> 0x33; // SHR r,i, start: 61, end: 62
+uint64_t x86 = x84 & 0x7ffffffffffff; // AND, start: 61, end: 62
+uint64_t x87 = x85 + x73; // ADD, start: 62, end: 63
+uint64_t x88 = x87 >> 0x33; // SHR r,i, start: 63, end: 64
+uint64_t x89 = x87 & 0x7ffffffffffff; // AND, start: 63, end: 64
+uint64_t x90 = x88 + x76; // ADD, start: 64, end: 65
Return (x82, x79, x90, x89, x86)
-// end: 63
+// end: 65
diff --git a/etc/compile-by-zinc/heuristic-search.py b/etc/compile-by-zinc/heuristic-search.py
index ea528a957..ba979f5dd 100755
--- a/etc/compile-by-zinc/heuristic-search.py
+++ b/etc/compile-by-zinc/heuristic-search.py
@@
 -222,7 +222,8 @@ def schedule(data, basepoint, do_print):
         core_remaining_cycle_count = dict([(core_name, [0] * core_count) for core_name, core_count in CORE_DATA]
                                            + [(core_name, [0]) for core_name in REGISTERS])
         cur_instructions_in_cycle = 0
-        return vars_remaining_cycles, core_remaining_cycle_count, cur_instructions_in_cycle
+        register_vals = dict((var, None) for var in REGISTERS)
+        return vars_remaining_cycles, core_remaining_cycle_count, cur_instructions_in_cycle, register_vals

     def freeze_gen(v, rec=(lambda x:x)):
         if isinstance(v, list):
@@ -252,10 +253,19 @@ def schedule(data, basepoint, do_print):
     def unfreeze(v):
         return unfreeze_gen(v, unfreeze)

+    def update_register_vals_with_core_args(core, args, register_vals):
+        new_rdx = register_vals['RDX']
+        if 'MULX' in core['instruction']:
+            new_rdx = sorted(args, key=(lambda x: int(x.lstrip('0x'))))[0]
+        changed = (register_vals['RDX'] != new_rdx)
+        register_vals['RDX'] = new_rdx
+        return changed, register_vals
+
     @memoize
-    def update_core_state(var, core, core_state):
+    def update_core_state(var, core, args, core_state):
         core = unfreeze(core)
-        (vars_remaining_cycles, core_remaining_cycle_count, cur_instructions_in_cycle) = unfreeze(core_state)
+        (vars_remaining_cycles, core_remaining_cycle_count, cur_instructions_in_cycle, register_vals) = unfreeze(core_state)
+        changed, register_vals = update_register_vals_with_core_args(core, args, register_vals)
         cost = 0
         if cur_instructions_in_cycle >= INSTRUCTIONS_PER_CYCLE:
             cost += 1
@@ -282,37 +292,50 @@ def schedule(data, basepoint, do_print):
                 assert core_remaining_cycle_count[port['name']][0] == 0
                 core_remaining_cycle_count[port['name']][0] = port['latency']
                 core_remaining_cycle_count[port['name']] = sorted(core_remaining_cycle_count[port['name']])
-        return (cost, freeze((vars_remaining_cycles, core_remaining_cycle_count, cur_instructions_in_cycle)))
+        return (cost, freeze((vars_remaining_cycles, core_remaining_cycle_count, cur_instructions_in_cycle, register_vals)))

     @memoize
     def evaluate_cost_memo(arg):
         schedule, core_state = unfreeze_gen(arg)
         schedule = unfreeze(schedule)
-        (vars_remaining_cycles, core_remaining_cycle_count, cur_instructions_in_cycle) = unfreeze(core_state)
+        (vars_remaining_cycles, core_remaining_cycle_count, cur_instructions_in_cycle, register_vals) = unfreeze(core_state)
         if len(schedule) == 0: return max([0] + list(vars_remaining_cycles.values()))
-        (var, core), schedule = schedule[0], schedule[1:]
-        cost, core_state = update_core_state(var, freeze(core), core_state)
+        (var, core, args), schedule = schedule[0], schedule[1:]
+        cost, core_state = update_core_state(var, freeze(core), args, core_state)
         return cost + evaluate_cost_memo(freeze_gen((freeze(schedule), core_state)))

     def get_wait_times(var_cores, core_state):
-        for var, core in var_cores:
-            cost, new_core_state = update_core_state(var, freeze(core), core_state)
-            yield (cost, var, core, new_core_state)
+        for var, core, args in var_cores:
+            (vars_remaining_cycles, core_remaining_cycle_count, cur_instructions_in_cycle, register_vals) = unfreeze(core_state)
+            changed, register_vals = update_register_vals_with_core_args(core, args, register_vals)
+            cost, new_core_state = update_core_state(var, freeze(core), args, core_state)
+            yield (cost, -len(reverse_dependencies.get(var, [])), changed, -core['latency'], var, core, args, new_core_state)
+
+    def cmp_wait_time(v1, v2):
+        (cost1, neg_len_deps1, changed1, neg_latency1, var1, core1, args1, new_core_state1) = v1
+        (cost2, neg_len_deps2, changed2, neg_latency2, var2, core2, args2, new_core_state2) = v2
+        if cost1 != cost2: return cmp(cost1, cost2)
+        if core1['instruction'] == core2['instruction']:
+            if changed1 != changed2: return cmp(changed1, changed2)
+        if neg_len_deps1 != neg_len_deps2: return cmp(neg_len_deps1, neg_len_deps2)
+        if neg_latency1 != neg_latency2: return cmp(neg_latency1, neg_latency2)
+        if var1 != var2: return cmp(var1, var2)
+        return cmp(v1, v2)

     def get_sorted_next_statements(next_var_cores, core_state):
-        return sorted(get_wait_times(next_var_cores, core_state))
+        return sorted(get_wait_times(next_var_cores, core_state), cmp=cmp_wait_time)

     def add_cycle_info(schedule):
         core_state = freeze(make_initial_core_state())
         schedule_with_cycle_info = []
         cur_cycle = 0
-        for var, core in schedule:
-            cost, core_state = update_core_state(var, freeze(core), core_state)
+        for var, core, args in schedule:
+            cost, core_state = update_core_state(var, freeze(core), args, core_state)
+            cur_cycle += cost
             schedule_with_cycle_info.append((var, {'start':cur_cycle, 'finish':cur_cycle + core['latency']}, core))
-            cur_cycle += cost
         return schedule_with_cycle_info

     def evaluate_cost(schedule_with_cycle_info):
@@ -324,26 +347,27 @@ def schedule(data, basepoint, do_print):

     @memoize
     def schedule_remaining(remaining_indices, core_state):
-        def make_schedule(var, core):
-            cost, new_core_state = update_core_state(var, freeze(core), core_state)
+        def make_schedule(var, core, args):
+            cost, new_core_state = update_core_state(var, freeze(core), args, core_state)
             extra_cost, schedule = schedule_remaining(tuple(i for i in remaining_indices if i != var), new_core_state)
-            return cost + extra_cost, ([(var, core)] + schedule)
+            return cost + extra_cost, ([(var, core, args)] + schedule)
         next_statements = get_possible_next_statements(remaining_indices, dependencies)
         min_cost, min_schedule = None, None
-        var_cores = [(var, core)
+        var_cores = [(var, core, (lines[var]['args'] if var in lines.keys() else (var,)))
                      for var in next_statements
                      for core in (lines[var]['cores'] if var in lines.keys() else possible_cores_for_line({'op':'LOAD', 'type':'uint64_t'}))]
+        sorted_subset_next_statements = sorted_next_statements = get_sorted_next_statements(var_cores, core_state)
         if len(sorted_next_statements) > 0:
             pre_min_cost = sorted_next_statements[0][0]
 #            print((pre_min_cost, tuple(var for cost2, var, core, new_core_state in sorted_next_statements if pre_min_cost == cost2)))
             sorted_subset_next_statements \
-                = tuple((cost, var, core, new_core_state) for cost, var, core, new_core_state in sorted_next_statements
+                = tuple((cost, var, core, args, new_core_state) for cost, reverse_dep_count, changed, neg_latency, var, core, args, new_core_state in sorted_next_statements
                         if pre_min_cost == cost)
             sorted_subset_next_statements = sorted_subset_next_statements[:1]
             if pre_min_cost == 0:
                 sorted_subset_next_statements = sorted_subset_next_statements[:1]
-        for cost, var, core, new_core_state in sorted_subset_next_statements:
-            cost, schedule = make_schedule(var, core)
+        for cost, var, core, args, new_core_state in sorted_subset_next_statements:
+            cost, schedule = make_schedule(var, core, args)
             if min_cost is None or cost < min_cost:
                 min_cost, min_schedule = cost, schedule
 #        return min_cost, min_schedule
--
cgit v1.2.3
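

Reading aid for the patch above: the preference the commit message describes can be summarized in a small standalone sketch. This is illustrative only, not the repository code; the names implicit_arg, pick_next, and the sample candidates are made up for the example. MULX reads one multiplicand implicitly from RDX, the patch approximates that implicit argument as the lower-numbered variable of the pair, and among otherwise equally cheap candidates the scheduler prefers a MULX whose implicit argument is already in RDX, so RDX does not have to be reloaded.

# Illustrative sketch of the tie-breaking idea, not the scheduler's actual code.
def implicit_arg(args):
    # Approximate MULX's implicit (RDX) operand as the lower-numbered variable.
    return min(args, key=lambda v: int(v.lstrip('x')))

def pick_next(candidates, rdx):
    # candidates: iterable of (cost, instruction, args) tuples for ready statements.
    # Prefer lower cost first; break ties in favour of a MULX that keeps RDX as-is.
    def key(cand):
        cost, instruction, args = cand
        changed = (instruction == 'MULX' and implicit_arg(args) != rdx)
        return (cost, changed)
    return min(candidates, key=key)

if __name__ == '__main__':
    ready = [(0, 'MULX', ('x7', 'x13')),   # would have to load x7 into RDX
             (0, 'MULX', ('x5', 'x17'))]   # x5 is assumed to be in RDX already
    print(pick_next(ready, rdx='x5'))      # -> (0, 'MULX', ('x5', 'x17'))

In the patch itself the same idea is spread over update_register_vals_with_core_args (which tracks what RDX would hold) and cmp_wait_time (which uses the resulting "changed" flag as a tie-breaker between candidates of equal cost and identical instruction).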