aboutsummaryrefslogtreecommitdiffhomepage
path: root/src/compute/hs/gen/transpose.c
blob: 095f53d33052808dda4a894eb8f6649495323d85 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
/*
 * Copyright 2018 Google Inc.
 *
 * Use of this source code is governed by a BSD-style license that can be
 * found in the LICENSE file.
 *
 */

//
//
//

#include "transpose.h"
#include "common/macros.h"

//
// Rows must be an even number.  This is enforced elsewhere.
//
// The transpose requires (cols_log2 * rows/2) row-pair blends.
//
void
hsg_transpose(uint32_t                   const cols_log2,
              uint32_t                   const rows,
              void (*pfn_blend)(uint32_t const cols_log2,
                                uint32_t const row_ll, // lower-left
                                uint32_t const row_ur, // upper-right
                                void *         blend),
              void *                           blend,
              void (*pfn_remap)(uint32_t const row_from,
                                uint32_t const row_to,
                                void *         remap),
              void *                           remap)
{
  // get mapping array
  uint32_t * map_curr = ALLOCA_MACRO(rows * sizeof(*map_curr));
  uint32_t * map_next = ALLOCA_MACRO(rows * sizeof(*map_next));

  // init the mapping array
  for (uint32_t ii=0; ii<rows; ii++)
    map_curr[ii] = ii;

  // successively transpose rows using blends
  for (uint32_t cc=1; cc<=cols_log2; cc++)
    {
      uint32_t const mask = BITS_TO_MASK(cc);

      for (uint32_t ii=0; ii<rows; ii++)
        {
          uint32_t const left = map_curr[ii];
          uint32_t const stay = left & ~mask;

          if (left != stay) // will be swapped away
            {
              for (uint32_t jj=0; jj<rows; jj++)
                {
                  if (map_curr[jj] == stay)
                    {
                      map_next[jj] = stay;
                      map_next[ii] = stay + (rows << (cc-1));

                      pfn_blend(cc,ii,jj,blend); // log2,left,right,payload

                      break;
                    }
                }
            }
        }

      uint32_t * tmp = map_curr;

      map_curr = map_next;
      map_next = tmp;
    }

  // write out the remapping
  for (uint32_t ii=0; ii<rows; ii++)
    pfn_remap(ii,map_curr[ii] >> cols_log2,remap);
}

//
// test it!
//

#ifdef HS_TRANSPOSE_DEBUG

#include <stdio.h>

static uint32_t cols; // implicit on SIMD/GPU

static
void
hsg_debug_blend(uint32_t const cols_log2,
                uint32_t const row_ll, // lower-left
                uint32_t const row_ur, // upper-right
                uint32_t *     b)
{
  fprintf(stdout,"BLEND( %u, %3u, %3u )\n",cols_log2,row_ll,row_ur);

  uint32_t * const ll = ALLOCA(cols * sizeof(*b));
  uint32_t * const ur = ALLOCA(cols * sizeof(*b));

  memcpy(ll,b+row_ll*cols,cols * sizeof(*b));
  memcpy(ur,b+row_ur*cols,cols * sizeof(*b));

  for (uint32_t ii=0; ii<cols; ii++)
    b[row_ll*cols+ii] = ((ii >> cols_log2-1) & 1) ? ll[ii] : ur[ii^(1<<cols_log2-1)];

  for (uint32_t ii=0; ii<cols; ii++)
    b[row_ur*cols+ii] = ((ii >> cols_log2-1) & 1) ? ll[ii^(1<<cols_log2-1)] : ur[ii];
}

static
void
hsg_debug_remap(uint32_t   const row_from,
                uint32_t   const row_to,
                uint32_t * const r)
{
  fprintf(stdout,"REMAP( %3u, %3u )\n",row_from,row_to);

  r[row_to] = row_from;
}

static
void
hsg_debug_print(uint32_t         const rows,
                uint32_t const * const m,
                uint32_t const * const r)
{
  for (uint32_t rr=0; rr<rows; rr++) {
    for (uint32_t cc=0; cc<cols; cc++)
      fprintf(stdout,"%4u ",m[r[rr]*cols + cc]);
    fprintf(stdout,"\n");
  }
}

int
main(int argc, char * argv[])
{
  uint32_t const cols_log2 = (argc <= 1) ? 3 : strtoul(argv[1],NULL,0);
  uint32_t const rows      = (argc <= 2) ? 6 : strtoul(argv[2],NULL,0);

  if (rows & 1)
    return;

  cols = 1 << cols_log2;

  uint32_t * const b = ALLOCA(cols * rows * sizeof(*b));
  uint32_t * const r = ALLOCA(       rows * sizeof(*r));

  for (uint32_t rr=0; rr<rows; rr++) {
    r[rr] = rr;
    for (uint32_t cc=0; cc<cols; cc++)
      b[rr*cols+cc] = cc*rows+rr;
  }

  hsg_debug_print(rows,b,r);

  hsg_transpose(cols_log2,rows,
                hsg_debug_blend,b,
                hsg_debug_remap,r);

  hsg_debug_print(rows,b,r);

  return 0;
}

#endif

//
//
//