author     Jason Gross <jgross@mit.edu>   2017-08-14 12:21:51 -0400
committer  Jason Gross <jgross@mit.edu>   2017-08-14 12:21:51 -0400
commit     014062cd1e034badfebe86951bb055f3fc224f3c (patch)
tree       1cfa9ae8cc0bf19847665c8574c60f3ef0d7cecc /etc
parent     57018f66cd5358ca3350597ef8cdf94fdc60244a (diff)
Update scheduler to know about implicit mulx arg
Now it prefers scheduling together MULX instructions that share the same implicit argument (approximated as the argument variable with the lower number).
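
For illustration, a minimal standalone sketch of the preference (hypothetical names, not the code in heuristic-search.py below): MULX reads one source operand implicitly from RDX, so among equally cheap ready candidates, those whose implicit argument already matches the value assumed to be in RDX are ordered first.

    # Sketch only: approximate the implicit MULX argument as the lower-numbered
    # variable, and sort ready candidates so that MULX instructions which do not
    # force a change of RDX come first.
    def mulx_implicit_arg(args):
        return min(args, key=lambda v: int(v.lstrip('x')))

    def order_candidates(candidates, rdx):
        # candidates: iterable of (var, instruction, args) tuples that are ready to issue
        def key(cand):
            var, instruction, args = cand
            changes_rdx = instruction == 'MULX' and mulx_implicit_arg(args) != rdx
            return (changes_rdx, var)
        return sorted(candidates, key=key)

    # Example: with x5 assumed to be in RDX, both x5 multiplies are preferred
    # over x7 * x13, matching the grouping visible in femulScheduled.log.
    ready = [('x22', 'MULX', ('x7', 'x13')),
             ('x21', 'MULX', ('x5', 'x15')),
             ('x24', 'MULX', ('x5', 'x17'))]
    print(order_candidates(ready, rdx='x5'))    # x21, x24, then x22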
Diffstat (limited to 'etc')
-rw-r--r--  etc/compile-by-zinc/femulScheduled.log   | 132
-rwxr-xr-x  etc/compile-by-zinc/heuristic-search.py  |  66
2 files changed, 111 insertions(+), 87 deletions(-)
diff --git a/etc/compile-by-zinc/femulScheduled.log b/etc/compile-by-zinc/femulScheduled.log
index 6c9281748..40559671b 100644
--- a/etc/compile-by-zinc/femulScheduled.log
+++ b/etc/compile-by-zinc/femulScheduled.log
@@ -1,74 +1,74 @@
INPUT: (x10, x11, x9, x7, x5, x18, x19, x17, x15, x13)
uint128_t x20 = (uint128_t) x5 * x13; // MULX r64,r64,r64, start: 0, end: 4
-uint128_t x21 = (uint128_t) x5 * x15; // MULX r64,r64,r64, start: 0, end: 4
-uint128_t x22 = (uint128_t) x7 * x13; // MULX r64,r64,r64, start: 1, end: 5
+uint128_t x21 = (uint128_t) x5 * x15; // MULX r64,r64,r64, start: 1, end: 5
uint128_t x24 = (uint128_t) x5 * x17; // MULX r64,r64,r64, start: 2, end: 6
-uint128_t x25 = (uint128_t) x9 * x13; // MULX r64,r64,r64, start: 3, end: 7
-uint128_t x27 = (uint128_t) x7 * x15; // MULX r64,r64,r64, start: 4, end: 8
-uint128_t x23 = x21 + x22; // ADD; ADC(X), start: 5, end: 7
-uint128_t x29 = (uint128_t) x5 * x19; // MULX r64,r64,r64, start: 6, end: 10
-uint128_t x30 = (uint128_t) x11 * x13; // MULX r64,r64,r64, start: 6, end: 10
-uint128_t x26 = x24 + x25; // ADD; ADC(X), start: 7, end: 9
-uint128_t x32 = (uint128_t) x7 * x17; // MULX r64,r64,r64, start: 8, end: 12
-uint128_t x34 = (uint128_t) x9 * x15; // MULX r64,r64,r64, start: 8, end: 12
-uint128_t x28 = x26 + x27; // ADD; ADC(X), start: 9, end: 11
-uint128_t x36 = (uint128_t) x5 * x18; // MULX r64,r64,r64, start: 10, end: 14
-uint128_t x31 = x29 + x30; // ADD; ADC(X), start: 10, end: 12
-uint128_t x37 = (uint128_t) x10 * x13; // MULX r64,r64,r64, start: 11, end: 15
-uint128_t x39 = (uint128_t) x11 * x15; // MULX r64,r64,r64, start: 11, end: 15
-uint128_t x33 = x31 + x32; // ADD; ADC(X), start: 12, end: 14
-uint128_t x41 = (uint128_t) x7 * x19; // MULX r64,r64,r64, start: 13, end: 17
-uint128_t x43 = (uint128_t) x9 * x17; // MULX r64,r64,r64, start: 13, end: 17
-uint128_t x35 = x33 + x34; // ADD; ADC(X), start: 14, end: 16
-uint128_t x38 = x36 + x37; // ADD; ADC(X), start: 15, end: 17
+uint128_t x29 = (uint128_t) x5 * x19; // MULX r64,r64,r64, start: 3, end: 7
+uint128_t x36 = (uint128_t) x5 * x18; // MULX r64,r64,r64, start: 4, end: 8
+uint128_t x22 = (uint128_t) x7 * x13; // MULX r64,r64,r64, start: 5, end: 9
+uint128_t x27 = (uint128_t) x7 * x15; // MULX r64,r64,r64, start: 6, end: 10
+uint128_t x32 = (uint128_t) x7 * x17; // MULX r64,r64,r64, start: 7, end: 11
+uint128_t x41 = (uint128_t) x7 * x19; // MULX r64,r64,r64, start: 8, end: 12
+uint128_t x23 = x21 + x22; // ADD; ADC(X), start: 9, end: 11
+uint128_t x25 = (uint128_t) x9 * x13; // MULX r64,r64,r64, start: 9, end: 13
+uint128_t x34 = (uint128_t) x9 * x15; // MULX r64,r64,r64, start: 10, end: 14
+uint128_t x43 = (uint128_t) x9 * x17; // MULX r64,r64,r64, start: 11, end: 15
+uint128_t x30 = (uint128_t) x11 * x13; // MULX r64,r64,r64, start: 12, end: 16
+uint128_t x26 = x24 + x25; // ADD; ADC(X), start: 13, end: 15
+uint128_t x39 = (uint128_t) x11 * x15; // MULX r64,r64,r64, start: 13, end: 17
+uint128_t x37 = (uint128_t) x10 * x13; // MULX r64,r64,r64, start: 14, end: 18
+uint128_t x28 = x26 + x27; // ADD; ADC(X), start: 15, end: 17
uint64_t x45 = x10 * 0x13; // IMUL r64,r64,i, start: 15, end: 18
-uint64_t x46 = x7 * 0x13; // IMUL r64,r64,i, start: 15, end: 18
-uint128_t x40 = x38 + x39; // ADD; ADC(X), start: 16, end: 18
+uint128_t x31 = x29 + x30; // ADD; ADC(X), start: 16, end: 18
+uint64_t x48 = x11 * 0x13; // IMUL r64,r64,i, start: 16, end: 19
uint64_t x47 = x9 * 0x13; // IMUL r64,r64,i, start: 17, end: 20
-uint64_t x48 = x11 * 0x13; // IMUL r64,r64,i, start: 17, end: 20
-uint128_t x42 = x40 + x41; // ADD; ADC(X), start: 18, end: 20
+uint128_t x33 = x31 + x32; // ADD; ADC(X), start: 18, end: 20
+uint128_t x38 = x36 + x37; // ADD; ADC(X), start: 18, end: 20
+uint64_t x46 = x7 * 0x13; // IMUL r64,r64,i, start: 18, end: 21
uint128_t x49 = (uint128_t) x45 * x15; // MULX r64,r64,r64, start: 19, end: 23
-uint128_t x51 = (uint128_t) x46 * x18; // MULX r64,r64,r64, start: 19, end: 23
-uint128_t x44 = x42 + x43; // ADD; ADC(X), start: 20, end: 22
-uint128_t x53 = (uint128_t) x47 * x19; // MULX r64,r64,r64, start: 21, end: 25
-uint128_t x55 = (uint128_t) x48 * x17; // MULX r64,r64,r64, start: 21, end: 25
-uint128_t x50 = x20 + x49; // ADD; ADC(X), start: 22, end: 24
-uint128_t x57 = (uint128_t) x45 * x17; // MULX r64,r64,r64, start: 23, end: 27
-uint128_t x59 = (uint128_t) x47 * x18; // MULX r64,r64,r64, start: 23, end: 27
-uint128_t x52 = x50 + x51; // ADD; ADC(X), start: 24, end: 26
-uint128_t x61 = (uint128_t) x48 * x19; // MULX r64,r64,r64, start: 25, end: 29
-uint128_t x63 = (uint128_t) x45 * x19; // MULX r64,r64,r64, start: 25, end: 29
-uint128_t x54 = x52 + x53; // ADD; ADC(X), start: 26, end: 28
-uint128_t x58 = x23 + x57; // ADD; ADC(X), start: 27, end: 29
-uint128_t x65 = (uint128_t) x48 * x18; // MULX r64,r64,r64, start: 27, end: 31
-uint128_t x67 = (uint128_t) x45 * x18; // MULX r64,r64,r64, start: 27, end: 31
-uint128_t x56 = x54 + x55; // ADD; ADC(X), start: 28, end: 30
-uint128_t x60 = x58 + x59; // ADD; ADC(X), start: 29, end: 31
-uint128_t x62 = x60 + x61; // ADD; ADC(X), start: 29, end: 31
-uint128_t x64 = x28 + x63; // ADD; ADC(X), start: 31, end: 33
-uint64_t x69 = (uint64_t) (x56 >> 0x33); // SHRD r,r,i, start: 31, end: 34
-uint64_t x70 = (uint64_t) x56 & 0x7ffffffffffff; // AND, start: 31, end: 32
-uint128_t x66 = x64 + x65; // ADD; ADC(X), start: 31, end: 33
+uint128_t x35 = x33 + x34; // ADD; ADC(X), start: 20, end: 22
+uint128_t x40 = x38 + x39; // ADD; ADC(X), start: 20, end: 22
+uint128_t x53 = (uint128_t) x47 * x19; // MULX r64,r64,r64, start: 20, end: 24
+uint128_t x61 = (uint128_t) x48 * x19; // MULX r64,r64,r64, start: 21, end: 25
+uint128_t x42 = x40 + x41; // ADD; ADC(X), start: 22, end: 24
+uint128_t x63 = (uint128_t) x45 * x19; // MULX r64,r64,r64, start: 22, end: 26
+uint128_t x50 = x20 + x49; // ADD; ADC(X), start: 23, end: 25
+uint128_t x51 = (uint128_t) x46 * x18; // MULX r64,r64,r64, start: 23, end: 27
+uint128_t x44 = x42 + x43; // ADD; ADC(X), start: 24, end: 26
+uint128_t x59 = (uint128_t) x47 * x18; // MULX r64,r64,r64, start: 24, end: 28
+uint128_t x65 = (uint128_t) x48 * x18; // MULX r64,r64,r64, start: 25, end: 29
+uint128_t x55 = (uint128_t) x48 * x17; // MULX r64,r64,r64, start: 26, end: 30
+uint128_t x64 = x28 + x63; // ADD; ADC(X), start: 26, end: 28
+uint128_t x52 = x50 + x51; // ADD; ADC(X), start: 27, end: 29
+uint128_t x57 = (uint128_t) x45 * x17; // MULX r64,r64,r64, start: 27, end: 31
+uint128_t x67 = (uint128_t) x45 * x18; // MULX r64,r64,r64, start: 28, end: 32
+uint128_t x54 = x52 + x53; // ADD; ADC(X), start: 29, end: 31
+uint128_t x66 = x64 + x65; // ADD; ADC(X), start: 29, end: 31
+uint128_t x56 = x54 + x55; // ADD; ADC(X), start: 31, end: 33
+uint128_t x58 = x23 + x57; // ADD; ADC(X), start: 31, end: 33
+uint128_t x60 = x58 + x59; // ADD; ADC(X), start: 33, end: 35
uint128_t x68 = x35 + x67; // ADD; ADC(X), start: 33, end: 35
-uint128_t x71 = x69 + x62; // ADD; ADC(X), start: 33, end: 35
-uint64_t x72 = (uint64_t) (x71 >> 0x33); // SHRD r,r,i, start: 35, end: 38
-uint64_t x73 = (uint64_t) x71 & 0x7ffffffffffff; // AND, start: 37, end: 38
-uint128_t x74 = x72 + x66; // ADD; ADC(X), start: 37, end: 39
-uint64_t x75 = (uint64_t) (x74 >> 0x33); // SHRD r,r,i, start: 40, end: 43
-uint64_t x76 = (uint64_t) x74 & 0x7ffffffffffff; // AND, start: 42, end: 43
-uint128_t x77 = x75 + x68; // ADD; ADC(X), start: 42, end: 44
-uint64_t x78 = (uint64_t) (x77 >> 0x33); // SHRD r,r,i, start: 45, end: 48
-uint64_t x79 = (uint64_t) x77 & 0x7ffffffffffff; // AND, start: 47, end: 48
-uint128_t x80 = x78 + x44; // ADD; ADC(X), start: 47, end: 49
-uint64_t x81 = (uint64_t) (x80 >> 0x33); // SHRD r,r,i, start: 50, end: 53
-uint64_t x82 = (uint64_t) x80 & 0x7ffffffffffff; // AND, start: 52, end: 53
-uint64_t x83 = 0x13 * x81; // IMUL r64,r64,i, start: 52, end: 55
-uint64_t x84 = x70 + x83; // ADD, start: 55, end: 56
-uint64_t x85 = x84 >> 0x33; // SHR r,i, start: 58, end: 59
-uint64_t x86 = x84 & 0x7ffffffffffff; // AND, start: 59, end: 60
-uint64_t x87 = x85 + x73; // ADD, start: 59, end: 60
-uint64_t x88 = x87 >> 0x33; // SHR r,i, start: 60, end: 61
-uint64_t x89 = x87 & 0x7ffffffffffff; // AND, start: 61, end: 62
-uint64_t x90 = x88 + x76; // ADD, start: 61, end: 62
+uint64_t x69 = (uint64_t) (x56 >> 0x33); // SHRD r,r,i, start: 33, end: 36
+uint64_t x70 = (uint64_t) x56 & 0x7ffffffffffff; // AND, start: 33, end: 34
+uint128_t x62 = x60 + x61; // ADD; ADC(X), start: 35, end: 37
+uint128_t x71 = x69 + x62; // ADD; ADC(X), start: 37, end: 39
+uint64_t x72 = (uint64_t) (x71 >> 0x33); // SHRD r,r,i, start: 39, end: 42
+uint64_t x73 = (uint64_t) x71 & 0x7ffffffffffff; // AND, start: 39, end: 40
+uint128_t x74 = x72 + x66; // ADD; ADC(X), start: 42, end: 44
+uint64_t x75 = (uint64_t) (x74 >> 0x33); // SHRD r,r,i, start: 44, end: 47
+uint64_t x76 = (uint64_t) x74 & 0x7ffffffffffff; // AND, start: 44, end: 45
+uint128_t x77 = x75 + x68; // ADD; ADC(X), start: 47, end: 49
+uint64_t x78 = (uint64_t) (x77 >> 0x33); // SHRD r,r,i, start: 49, end: 52
+uint64_t x79 = (uint64_t) x77 & 0x7ffffffffffff; // AND, start: 49, end: 50
+uint128_t x80 = x78 + x44; // ADD; ADC(X), start: 52, end: 54
+uint64_t x81 = (uint64_t) (x80 >> 0x33); // SHRD r,r,i, start: 54, end: 57
+uint64_t x82 = (uint64_t) x80 & 0x7ffffffffffff; // AND, start: 54, end: 55
+uint64_t x83 = 0x13 * x81; // IMUL r64,r64,i, start: 57, end: 60
+uint64_t x84 = x70 + x83; // ADD, start: 60, end: 61
+uint64_t x85 = x84 >> 0x33; // SHR r,i, start: 61, end: 62
+uint64_t x86 = x84 & 0x7ffffffffffff; // AND, start: 61, end: 62
+uint64_t x87 = x85 + x73; // ADD, start: 62, end: 63
+uint64_t x88 = x87 >> 0x33; // SHR r,i, start: 63, end: 64
+uint64_t x89 = x87 & 0x7ffffffffffff; // AND, start: 63, end: 64
+uint64_t x90 = x88 + x76; // ADD, start: 64, end: 65
Return (x82, x79, x90, x89, x86)
-// end: 63
+// end: 65
diff --git a/etc/compile-by-zinc/heuristic-search.py b/etc/compile-by-zinc/heuristic-search.py
index ea528a957..ba979f5dd 100755
--- a/etc/compile-by-zinc/heuristic-search.py
+++ b/etc/compile-by-zinc/heuristic-search.py
@@ -222,7 +222,8 @@ def schedule(data, basepoint, do_print):
core_remaining_cycle_count = dict([(core_name, [0] * core_count) for core_name, core_count in CORE_DATA]
+ [(core_name, [0]) for core_name in REGISTERS])
cur_instructions_in_cycle = 0
- return vars_remaining_cycles, core_remaining_cycle_count, cur_instructions_in_cycle
+ register_vals = dict((var, None) for var in REGISTERS)
+ return vars_remaining_cycles, core_remaining_cycle_count, cur_instructions_in_cycle, register_vals
def freeze_gen(v, rec=(lambda x:x)):
if isinstance(v, list):
@@ -252,10 +253,19 @@ def schedule(data, basepoint, do_print):
def unfreeze(v):
return unfreeze_gen(v, unfreeze)
+ def update_register_vals_with_core_args(core, args, register_vals):
+ new_rdx = register_vals['RDX']
+ if 'MULX' in core['instruction']:
+ new_rdx = sorted(args, key=(lambda x: int(x.lstrip('0x'))))[0]
+ changed = (register_vals['RDX'] != new_rdx)
+ register_vals['RDX'] = new_rdx
+ return changed, register_vals
+
@memoize
- def update_core_state(var, core, core_state):
+ def update_core_state(var, core, args, core_state):
core = unfreeze(core)
- (vars_remaining_cycles, core_remaining_cycle_count, cur_instructions_in_cycle) = unfreeze(core_state)
+ (vars_remaining_cycles, core_remaining_cycle_count, cur_instructions_in_cycle, register_vals) = unfreeze(core_state)
+ changed, register_vals = update_register_vals_with_core_args(core, args, register_vals)
cost = 0
if cur_instructions_in_cycle >= INSTRUCTIONS_PER_CYCLE:
cost += 1
@@ -282,37 +292,50 @@ def schedule(data, basepoint, do_print):
assert core_remaining_cycle_count[port['name']][0] == 0
core_remaining_cycle_count[port['name']][0] = port['latency']
core_remaining_cycle_count[port['name']] = sorted(core_remaining_cycle_count[port['name']])
- return (cost, freeze((vars_remaining_cycles, core_remaining_cycle_count, cur_instructions_in_cycle)))
+ return (cost, freeze((vars_remaining_cycles, core_remaining_cycle_count, cur_instructions_in_cycle, register_vals)))
@memoize
def evaluate_cost_memo(arg):
schedule, core_state = unfreeze_gen(arg)
schedule = unfreeze(schedule)
- (vars_remaining_cycles, core_remaining_cycle_count, cur_instructions_in_cycle) = unfreeze(core_state)
+ (vars_remaining_cycles, core_remaining_cycle_count, cur_instructions_in_cycle, register_vals) = unfreeze(core_state)
if len(schedule) == 0: return max([0] + list(vars_remaining_cycles.values()))
- (var, core), schedule = schedule[0], schedule[1:]
- cost, core_state = update_core_state(var, freeze(core), core_state)
+ (var, core, args), schedule = schedule[0], schedule[1:]
+ cost, core_state = update_core_state(var, freeze(core), args, core_state)
return cost + evaluate_cost_memo(freeze_gen((freeze(schedule), core_state)))
def get_wait_times(var_cores, core_state):
- for var, core in var_cores:
- cost, new_core_state = update_core_state(var, freeze(core), core_state)
- yield (cost, var, core, new_core_state)
+ for var, core, args in var_cores:
+ (vars_remaining_cycles, core_remaining_cycle_count, cur_instructions_in_cycle, register_vals) = unfreeze(core_state)
+ changed, register_vals = update_register_vals_with_core_args(core, args, register_vals)
+ cost, new_core_state = update_core_state(var, freeze(core), args, core_state)
+ yield (cost, -len(reverse_dependencies.get(var, [])), changed, -core['latency'], var, core, args, new_core_state)
+
+ def cmp_wait_time(v1, v2):
+ (cost1, neg_len_deps1, changed1, neg_latency1, var1, core1, args1, new_core_state1) = v1
+ (cost2, neg_len_deps2, changed2, neg_latency2, var2, core2, args2, new_core_state2) = v2
+ if cost1 != cost2: return cmp(cost1, cost2)
+ if core1['instruction'] == core2['instruction']:
+ if changed1 != changed2: return cmp(changed1, changed2)
+ if neg_len_deps1 != neg_len_deps2: return cmp(neg_len_deps1, neg_len_deps2)
+ if neg_latency1 != neg_latency2: return cmp(neg_latency1, neg_latency2)
+ if var1 != var2: return cmp(var1, var2)
+ return cmp(v1, v2)
def get_sorted_next_statements(next_var_cores, core_state):
- return sorted(get_wait_times(next_var_cores, core_state))
+ return sorted(get_wait_times(next_var_cores, core_state), cmp=cmp_wait_time)
def add_cycle_info(schedule):
core_state = freeze(make_initial_core_state())
schedule_with_cycle_info = []
cur_cycle = 0
- for var, core in schedule:
- cost, core_state = update_core_state(var, freeze(core), core_state)
+ for var, core, args in schedule:
+ cost, core_state = update_core_state(var, freeze(core), args, core_state)
+ cur_cycle += cost
schedule_with_cycle_info.append((var,
{'start':cur_cycle, 'finish':cur_cycle + core['latency']},
core))
- cur_cycle += cost
return schedule_with_cycle_info
def evaluate_cost(schedule_with_cycle_info):
@@ -324,26 +347,27 @@ def schedule(data, basepoint, do_print):
@memoize
def schedule_remaining(remaining_indices, core_state):
- def make_schedule(var, core):
- cost, new_core_state = update_core_state(var, freeze(core), core_state)
+ def make_schedule(var, core, args):
+ cost, new_core_state = update_core_state(var, freeze(core), args, core_state)
extra_cost, schedule = schedule_remaining(tuple(i for i in remaining_indices if i != var), new_core_state)
- return cost + extra_cost, ([(var, core)] + schedule)
+ return cost + extra_cost, ([(var, core, args)] + schedule)
next_statements = get_possible_next_statements(remaining_indices, dependencies)
min_cost, min_schedule = None, None
- var_cores = [(var, core)
+ var_cores = [(var, core, (lines[var]['args'] if var in lines.keys() else (var,)))
for var in next_statements
for core in (lines[var]['cores'] if var in lines.keys() else possible_cores_for_line({'op':'LOAD', 'type':'uint64_t'}))]
+
sorted_subset_next_statements = sorted_next_statements = get_sorted_next_statements(var_cores, core_state)
if len(sorted_next_statements) > 0:
pre_min_cost = sorted_next_statements[0][0]
# print((pre_min_cost, tuple(var for cost2, var, core, new_core_state in sorted_next_statements if pre_min_cost == cost2)))
sorted_subset_next_statements \
- = tuple((cost, var, core, new_core_state) for cost, var, core, new_core_state in sorted_next_statements
+ = tuple((cost, var, core, args, new_core_state) for cost, reverse_dep_count, changed, neg_latency, var, core, args, new_core_state in sorted_next_statements
if pre_min_cost == cost)
sorted_subset_next_statements = sorted_subset_next_statements[:1]
if pre_min_cost == 0: sorted_subset_next_statements = sorted_subset_next_statements[:1]
- for cost, var, core, new_core_state in sorted_subset_next_statements:
- cost, schedule = make_schedule(var, core)
+ for cost, var, core, args, new_core_state in sorted_subset_next_statements:
+ cost, schedule = make_schedule(var, core, args)
if min_cost is None or cost < min_cost:
min_cost, min_schedule = cost, schedule
# return min_cost, min_schedule