aboutsummaryrefslogtreecommitdiff
path: root/etc
diff options
context:
space:
mode:
Diffstat (limited to 'etc')
-rw-r--r--etc/compile-by-zinc/femulScheduled.log132
-rwxr-xr-xetc/compile-by-zinc/heuristic-search.py66
2 files changed, 111 insertions, 87 deletions
diff --git a/etc/compile-by-zinc/femulScheduled.log b/etc/compile-by-zinc/femulScheduled.log
index 6c9281748..40559671b 100644
--- a/etc/compile-by-zinc/femulScheduled.log
+++ b/etc/compile-by-zinc/femulScheduled.log
@@ -1,74 +1,74 @@
INPUT: (x10, x11, x9, x7, x5, x18, x19, x17, x15, x13)
uint128_t x20 = (uint128_t) x5 * x13; // MULX r64,r64,r64, start: 0, end: 4
-uint128_t x21 = (uint128_t) x5 * x15; // MULX r64,r64,r64, start: 0, end: 4
-uint128_t x22 = (uint128_t) x7 * x13; // MULX r64,r64,r64, start: 1, end: 5
+uint128_t x21 = (uint128_t) x5 * x15; // MULX r64,r64,r64, start: 1, end: 5
uint128_t x24 = (uint128_t) x5 * x17; // MULX r64,r64,r64, start: 2, end: 6
-uint128_t x25 = (uint128_t) x9 * x13; // MULX r64,r64,r64, start: 3, end: 7
-uint128_t x27 = (uint128_t) x7 * x15; // MULX r64,r64,r64, start: 4, end: 8
-uint128_t x23 = x21 + x22; // ADD; ADC(X), start: 5, end: 7
-uint128_t x29 = (uint128_t) x5 * x19; // MULX r64,r64,r64, start: 6, end: 10
-uint128_t x30 = (uint128_t) x11 * x13; // MULX r64,r64,r64, start: 6, end: 10
-uint128_t x26 = x24 + x25; // ADD; ADC(X), start: 7, end: 9
-uint128_t x32 = (uint128_t) x7 * x17; // MULX r64,r64,r64, start: 8, end: 12
-uint128_t x34 = (uint128_t) x9 * x15; // MULX r64,r64,r64, start: 8, end: 12
-uint128_t x28 = x26 + x27; // ADD; ADC(X), start: 9, end: 11
-uint128_t x36 = (uint128_t) x5 * x18; // MULX r64,r64,r64, start: 10, end: 14
-uint128_t x31 = x29 + x30; // ADD; ADC(X), start: 10, end: 12
-uint128_t x37 = (uint128_t) x10 * x13; // MULX r64,r64,r64, start: 11, end: 15
-uint128_t x39 = (uint128_t) x11 * x15; // MULX r64,r64,r64, start: 11, end: 15
-uint128_t x33 = x31 + x32; // ADD; ADC(X), start: 12, end: 14
-uint128_t x41 = (uint128_t) x7 * x19; // MULX r64,r64,r64, start: 13, end: 17
-uint128_t x43 = (uint128_t) x9 * x17; // MULX r64,r64,r64, start: 13, end: 17
-uint128_t x35 = x33 + x34; // ADD; ADC(X), start: 14, end: 16
-uint128_t x38 = x36 + x37; // ADD; ADC(X), start: 15, end: 17
+uint128_t x29 = (uint128_t) x5 * x19; // MULX r64,r64,r64, start: 3, end: 7
+uint128_t x36 = (uint128_t) x5 * x18; // MULX r64,r64,r64, start: 4, end: 8
+uint128_t x22 = (uint128_t) x7 * x13; // MULX r64,r64,r64, start: 5, end: 9
+uint128_t x27 = (uint128_t) x7 * x15; // MULX r64,r64,r64, start: 6, end: 10
+uint128_t x32 = (uint128_t) x7 * x17; // MULX r64,r64,r64, start: 7, end: 11
+uint128_t x41 = (uint128_t) x7 * x19; // MULX r64,r64,r64, start: 8, end: 12
+uint128_t x23 = x21 + x22; // ADD; ADC(X), start: 9, end: 11
+uint128_t x25 = (uint128_t) x9 * x13; // MULX r64,r64,r64, start: 9, end: 13
+uint128_t x34 = (uint128_t) x9 * x15; // MULX r64,r64,r64, start: 10, end: 14
+uint128_t x43 = (uint128_t) x9 * x17; // MULX r64,r64,r64, start: 11, end: 15
+uint128_t x30 = (uint128_t) x11 * x13; // MULX r64,r64,r64, start: 12, end: 16
+uint128_t x26 = x24 + x25; // ADD; ADC(X), start: 13, end: 15
+uint128_t x39 = (uint128_t) x11 * x15; // MULX r64,r64,r64, start: 13, end: 17
+uint128_t x37 = (uint128_t) x10 * x13; // MULX r64,r64,r64, start: 14, end: 18
+uint128_t x28 = x26 + x27; // ADD; ADC(X), start: 15, end: 17
uint64_t x45 = x10 * 0x13; // IMUL r64,r64,i, start: 15, end: 18
-uint64_t x46 = x7 * 0x13; // IMUL r64,r64,i, start: 15, end: 18
-uint128_t x40 = x38 + x39; // ADD; ADC(X), start: 16, end: 18
+uint128_t x31 = x29 + x30; // ADD; ADC(X), start: 16, end: 18
+uint64_t x48 = x11 * 0x13; // IMUL r64,r64,i, start: 16, end: 19
uint64_t x47 = x9 * 0x13; // IMUL r64,r64,i, start: 17, end: 20
-uint64_t x48 = x11 * 0x13; // IMUL r64,r64,i, start: 17, end: 20
-uint128_t x42 = x40 + x41; // ADD; ADC(X), start: 18, end: 20
+uint128_t x33 = x31 + x32; // ADD; ADC(X), start: 18, end: 20
+uint128_t x38 = x36 + x37; // ADD; ADC(X), start: 18, end: 20
+uint64_t x46 = x7 * 0x13; // IMUL r64,r64,i, start: 18, end: 21
uint128_t x49 = (uint128_t) x45 * x15; // MULX r64,r64,r64, start: 19, end: 23
-uint128_t x51 = (uint128_t) x46 * x18; // MULX r64,r64,r64, start: 19, end: 23
-uint128_t x44 = x42 + x43; // ADD; ADC(X), start: 20, end: 22
-uint128_t x53 = (uint128_t) x47 * x19; // MULX r64,r64,r64, start: 21, end: 25
-uint128_t x55 = (uint128_t) x48 * x17; // MULX r64,r64,r64, start: 21, end: 25
-uint128_t x50 = x20 + x49; // ADD; ADC(X), start: 22, end: 24
-uint128_t x57 = (uint128_t) x45 * x17; // MULX r64,r64,r64, start: 23, end: 27
-uint128_t x59 = (uint128_t) x47 * x18; // MULX r64,r64,r64, start: 23, end: 27
-uint128_t x52 = x50 + x51; // ADD; ADC(X), start: 24, end: 26
-uint128_t x61 = (uint128_t) x48 * x19; // MULX r64,r64,r64, start: 25, end: 29
-uint128_t x63 = (uint128_t) x45 * x19; // MULX r64,r64,r64, start: 25, end: 29
-uint128_t x54 = x52 + x53; // ADD; ADC(X), start: 26, end: 28
-uint128_t x58 = x23 + x57; // ADD; ADC(X), start: 27, end: 29
-uint128_t x65 = (uint128_t) x48 * x18; // MULX r64,r64,r64, start: 27, end: 31
-uint128_t x67 = (uint128_t) x45 * x18; // MULX r64,r64,r64, start: 27, end: 31
-uint128_t x56 = x54 + x55; // ADD; ADC(X), start: 28, end: 30
-uint128_t x60 = x58 + x59; // ADD; ADC(X), start: 29, end: 31
-uint128_t x62 = x60 + x61; // ADD; ADC(X), start: 29, end: 31
-uint128_t x64 = x28 + x63; // ADD; ADC(X), start: 31, end: 33
-uint64_t x69 = (uint64_t) (x56 >> 0x33); // SHRD r,r,i, start: 31, end: 34
-uint64_t x70 = (uint64_t) x56 & 0x7ffffffffffff; // AND, start: 31, end: 32
-uint128_t x66 = x64 + x65; // ADD; ADC(X), start: 31, end: 33
+uint128_t x35 = x33 + x34; // ADD; ADC(X), start: 20, end: 22
+uint128_t x40 = x38 + x39; // ADD; ADC(X), start: 20, end: 22
+uint128_t x53 = (uint128_t) x47 * x19; // MULX r64,r64,r64, start: 20, end: 24
+uint128_t x61 = (uint128_t) x48 * x19; // MULX r64,r64,r64, start: 21, end: 25
+uint128_t x42 = x40 + x41; // ADD; ADC(X), start: 22, end: 24
+uint128_t x63 = (uint128_t) x45 * x19; // MULX r64,r64,r64, start: 22, end: 26
+uint128_t x50 = x20 + x49; // ADD; ADC(X), start: 23, end: 25
+uint128_t x51 = (uint128_t) x46 * x18; // MULX r64,r64,r64, start: 23, end: 27
+uint128_t x44 = x42 + x43; // ADD; ADC(X), start: 24, end: 26
+uint128_t x59 = (uint128_t) x47 * x18; // MULX r64,r64,r64, start: 24, end: 28
+uint128_t x65 = (uint128_t) x48 * x18; // MULX r64,r64,r64, start: 25, end: 29
+uint128_t x55 = (uint128_t) x48 * x17; // MULX r64,r64,r64, start: 26, end: 30
+uint128_t x64 = x28 + x63; // ADD; ADC(X), start: 26, end: 28
+uint128_t x52 = x50 + x51; // ADD; ADC(X), start: 27, end: 29
+uint128_t x57 = (uint128_t) x45 * x17; // MULX r64,r64,r64, start: 27, end: 31
+uint128_t x67 = (uint128_t) x45 * x18; // MULX r64,r64,r64, start: 28, end: 32
+uint128_t x54 = x52 + x53; // ADD; ADC(X), start: 29, end: 31
+uint128_t x66 = x64 + x65; // ADD; ADC(X), start: 29, end: 31
+uint128_t x56 = x54 + x55; // ADD; ADC(X), start: 31, end: 33
+uint128_t x58 = x23 + x57; // ADD; ADC(X), start: 31, end: 33
+uint128_t x60 = x58 + x59; // ADD; ADC(X), start: 33, end: 35
uint128_t x68 = x35 + x67; // ADD; ADC(X), start: 33, end: 35
-uint128_t x71 = x69 + x62; // ADD; ADC(X), start: 33, end: 35
-uint64_t x72 = (uint64_t) (x71 >> 0x33); // SHRD r,r,i, start: 35, end: 38
-uint64_t x73 = (uint64_t) x71 & 0x7ffffffffffff; // AND, start: 37, end: 38
-uint128_t x74 = x72 + x66; // ADD; ADC(X), start: 37, end: 39
-uint64_t x75 = (uint64_t) (x74 >> 0x33); // SHRD r,r,i, start: 40, end: 43
-uint64_t x76 = (uint64_t) x74 & 0x7ffffffffffff; // AND, start: 42, end: 43
-uint128_t x77 = x75 + x68; // ADD; ADC(X), start: 42, end: 44
-uint64_t x78 = (uint64_t) (x77 >> 0x33); // SHRD r,r,i, start: 45, end: 48
-uint64_t x79 = (uint64_t) x77 & 0x7ffffffffffff; // AND, start: 47, end: 48
-uint128_t x80 = x78 + x44; // ADD; ADC(X), start: 47, end: 49
-uint64_t x81 = (uint64_t) (x80 >> 0x33); // SHRD r,r,i, start: 50, end: 53
-uint64_t x82 = (uint64_t) x80 & 0x7ffffffffffff; // AND, start: 52, end: 53
-uint64_t x83 = 0x13 * x81; // IMUL r64,r64,i, start: 52, end: 55
-uint64_t x84 = x70 + x83; // ADD, start: 55, end: 56
-uint64_t x85 = x84 >> 0x33; // SHR r,i, start: 58, end: 59
-uint64_t x86 = x84 & 0x7ffffffffffff; // AND, start: 59, end: 60
-uint64_t x87 = x85 + x73; // ADD, start: 59, end: 60
-uint64_t x88 = x87 >> 0x33; // SHR r,i, start: 60, end: 61
-uint64_t x89 = x87 & 0x7ffffffffffff; // AND, start: 61, end: 62
-uint64_t x90 = x88 + x76; // ADD, start: 61, end: 62
+uint64_t x69 = (uint64_t) (x56 >> 0x33); // SHRD r,r,i, start: 33, end: 36
+uint64_t x70 = (uint64_t) x56 & 0x7ffffffffffff; // AND, start: 33, end: 34
+uint128_t x62 = x60 + x61; // ADD; ADC(X), start: 35, end: 37
+uint128_t x71 = x69 + x62; // ADD; ADC(X), start: 37, end: 39
+uint64_t x72 = (uint64_t) (x71 >> 0x33); // SHRD r,r,i, start: 39, end: 42
+uint64_t x73 = (uint64_t) x71 & 0x7ffffffffffff; // AND, start: 39, end: 40
+uint128_t x74 = x72 + x66; // ADD; ADC(X), start: 42, end: 44
+uint64_t x75 = (uint64_t) (x74 >> 0x33); // SHRD r,r,i, start: 44, end: 47
+uint64_t x76 = (uint64_t) x74 & 0x7ffffffffffff; // AND, start: 44, end: 45
+uint128_t x77 = x75 + x68; // ADD; ADC(X), start: 47, end: 49
+uint64_t x78 = (uint64_t) (x77 >> 0x33); // SHRD r,r,i, start: 49, end: 52
+uint64_t x79 = (uint64_t) x77 & 0x7ffffffffffff; // AND, start: 49, end: 50
+uint128_t x80 = x78 + x44; // ADD; ADC(X), start: 52, end: 54
+uint64_t x81 = (uint64_t) (x80 >> 0x33); // SHRD r,r,i, start: 54, end: 57
+uint64_t x82 = (uint64_t) x80 & 0x7ffffffffffff; // AND, start: 54, end: 55
+uint64_t x83 = 0x13 * x81; // IMUL r64,r64,i, start: 57, end: 60
+uint64_t x84 = x70 + x83; // ADD, start: 60, end: 61
+uint64_t x85 = x84 >> 0x33; // SHR r,i, start: 61, end: 62
+uint64_t x86 = x84 & 0x7ffffffffffff; // AND, start: 61, end: 62
+uint64_t x87 = x85 + x73; // ADD, start: 62, end: 63
+uint64_t x88 = x87 >> 0x33; // SHR r,i, start: 63, end: 64
+uint64_t x89 = x87 & 0x7ffffffffffff; // AND, start: 63, end: 64
+uint64_t x90 = x88 + x76; // ADD, start: 64, end: 65
Return (x82, x79, x90, x89, x86)
-// end: 63
+// end: 65
diff --git a/etc/compile-by-zinc/heuristic-search.py b/etc/compile-by-zinc/heuristic-search.py
index ea528a957..ba979f5dd 100755
--- a/etc/compile-by-zinc/heuristic-search.py
+++ b/etc/compile-by-zinc/heuristic-search.py
@@ -222,7 +222,8 @@ def schedule(data, basepoint, do_print):
core_remaining_cycle_count = dict([(core_name, [0] * core_count) for core_name, core_count in CORE_DATA]
+ [(core_name, [0]) for core_name in REGISTERS])
cur_instructions_in_cycle = 0
- return vars_remaining_cycles, core_remaining_cycle_count, cur_instructions_in_cycle
+ register_vals = dict((var, None) for var in REGISTERS)
+ return vars_remaining_cycles, core_remaining_cycle_count, cur_instructions_in_cycle, register_vals
def freeze_gen(v, rec=(lambda x:x)):
if isinstance(v, list):
@@ -252,10 +253,19 @@ def schedule(data, basepoint, do_print):
def unfreeze(v):
return unfreeze_gen(v, unfreeze)
+ def update_register_vals_with_core_args(core, args, register_vals):
+ new_rdx = register_vals['RDX']
+ if 'MULX' in core['instruction']:
+ new_rdx = sorted(args, key=(lambda x: int(x.lstrip('0x'))))[0]
+ changed = (register_vals['RDX'] != new_rdx)
+ register_vals['RDX'] = new_rdx
+ return changed, register_vals
+
@memoize
- def update_core_state(var, core, core_state):
+ def update_core_state(var, core, args, core_state):
core = unfreeze(core)
- (vars_remaining_cycles, core_remaining_cycle_count, cur_instructions_in_cycle) = unfreeze(core_state)
+ (vars_remaining_cycles, core_remaining_cycle_count, cur_instructions_in_cycle, register_vals) = unfreeze(core_state)
+ changed, register_vals = update_register_vals_with_core_args(core, args, register_vals)
cost = 0
if cur_instructions_in_cycle >= INSTRUCTIONS_PER_CYCLE:
cost += 1
@@ -282,37 +292,50 @@ def schedule(data, basepoint, do_print):
assert core_remaining_cycle_count[port['name']][0] == 0
core_remaining_cycle_count[port['name']][0] = port['latency']
core_remaining_cycle_count[port['name']] = sorted(core_remaining_cycle_count[port['name']])
- return (cost, freeze((vars_remaining_cycles, core_remaining_cycle_count, cur_instructions_in_cycle)))
+ return (cost, freeze((vars_remaining_cycles, core_remaining_cycle_count, cur_instructions_in_cycle, register_vals)))
@memoize
def evaluate_cost_memo(arg):
schedule, core_state = unfreeze_gen(arg)
schedule = unfreeze(schedule)
- (vars_remaining_cycles, core_remaining_cycle_count, cur_instructions_in_cycle) = unfreeze(core_state)
+ (vars_remaining_cycles, core_remaining_cycle_count, cur_instructions_in_cycle, register_vals) = unfreeze(core_state)
if len(schedule) == 0: return max([0] + list(vars_remaining_cycles.values()))
- (var, core), schedule = schedule[0], schedule[1:]
- cost, core_state = update_core_state(var, freeze(core), core_state)
+ (var, core, args), schedule = schedule[0], schedule[1:]
+ cost, core_state = update_core_state(var, freeze(core), args, core_state)
return cost + evaluate_cost_memo(freeze_gen((freeze(schedule), core_state)))
def get_wait_times(var_cores, core_state):
- for var, core in var_cores:
- cost, new_core_state = update_core_state(var, freeze(core), core_state)
- yield (cost, var, core, new_core_state)
+ for var, core, args in var_cores:
+ (vars_remaining_cycles, core_remaining_cycle_count, cur_instructions_in_cycle, register_vals) = unfreeze(core_state)
+ changed, register_vals = update_register_vals_with_core_args(core, args, register_vals)
+ cost, new_core_state = update_core_state(var, freeze(core), args, core_state)
+ yield (cost, -len(reverse_dependencies.get(var, [])), changed, -core['latency'], var, core, args, new_core_state)
+
+ def cmp_wait_time(v1, v2):
+ (cost1, neg_len_deps1, changed1, neg_latency1, var1, core1, args1, new_core_state1) = v1
+ (cost2, neg_len_deps2, changed2, neg_latency2, var2, core2, args2, new_core_state2) = v2
+ if cost1 != cost2: return cmp(cost1, cost2)
+ if core1['instruction'] == core2['instruction']:
+ if changed1 != changed2: return cmp(changed1, changed2)
+ if neg_len_deps1 != neg_len_deps2: return cmp(neg_len_deps1, neg_len_deps2)
+ if neg_latency1 != neg_latency2: return cmp(neg_latency1, neg_latency2)
+ if var1 != var2: return cmp(var1, var2)
+ return cmp(v1, v2)
def get_sorted_next_statements(next_var_cores, core_state):
- return sorted(get_wait_times(next_var_cores, core_state))
+ return sorted(get_wait_times(next_var_cores, core_state), cmp=cmp_wait_time)
def add_cycle_info(schedule):
core_state = freeze(make_initial_core_state())
schedule_with_cycle_info = []
cur_cycle = 0
- for var, core in schedule:
- cost, core_state = update_core_state(var, freeze(core), core_state)
+ for var, core, args in schedule:
+ cost, core_state = update_core_state(var, freeze(core), args, core_state)
+ cur_cycle += cost
schedule_with_cycle_info.append((var,
{'start':cur_cycle, 'finish':cur_cycle + core['latency']},
core))
- cur_cycle += cost
return schedule_with_cycle_info
def evaluate_cost(schedule_with_cycle_info):
@@ -324,26 +347,27 @@ def schedule(data, basepoint, do_print):
@memoize
def schedule_remaining(remaining_indices, core_state):
- def make_schedule(var, core):
- cost, new_core_state = update_core_state(var, freeze(core), core_state)
+ def make_schedule(var, core, args):
+ cost, new_core_state = update_core_state(var, freeze(core), args, core_state)
extra_cost, schedule = schedule_remaining(tuple(i for i in remaining_indices if i != var), new_core_state)
- return cost + extra_cost, ([(var, core)] + schedule)
+ return cost + extra_cost, ([(var, core, args)] + schedule)
next_statements = get_possible_next_statements(remaining_indices, dependencies)
min_cost, min_schedule = None, None
- var_cores = [(var, core)
+ var_cores = [(var, core, (lines[var]['args'] if var in lines.keys() else (var,)))
for var in next_statements
for core in (lines[var]['cores'] if var in lines.keys() else possible_cores_for_line({'op':'LOAD', 'type':'uint64_t'}))]
+
sorted_subset_next_statements = sorted_next_statements = get_sorted_next_statements(var_cores, core_state)
if len(sorted_next_statements) > 0:
pre_min_cost = sorted_next_statements[0][0]
# print((pre_min_cost, tuple(var for cost2, var, core, new_core_state in sorted_next_statements if pre_min_cost == cost2)))
sorted_subset_next_statements \
- = tuple((cost, var, core, new_core_state) for cost, var, core, new_core_state in sorted_next_statements
+ = tuple((cost, var, core, args, new_core_state) for cost, reverse_dep_count, changed, neg_latency, var, core, args, new_core_state in sorted_next_statements
if pre_min_cost == cost)
sorted_subset_next_statements = sorted_subset_next_statements[:1]
if pre_min_cost == 0: sorted_subset_next_statements = sorted_subset_next_statements[:1]
- for cost, var, core, new_core_state in sorted_subset_next_statements:
- cost, schedule = make_schedule(var, core)
+ for cost, var, core, args, new_core_state in sorted_subset_next_statements:
+ cost, schedule = make_schedule(var, core, args)
if min_cost is None or cost < min_cost:
min_cost, min_schedule = cost, schedule
# return min_cost, min_schedule