// Start of copy.h

// Cache-oblivious map-transpose function.
#define GEN_MAP_TRANSPOSE(NAME, ELEM_TYPE)                              \
  static void map_transpose_##NAME                                      \
  (ELEM_TYPE* dst, ELEM_TYPE* src,                                      \
   int64_t k, int64_t m, int64_t n,                                     \
   int64_t cb, int64_t ce, int64_t rb, int64_t re) {                    \
    int32_t r = re - rb;                                                \
    int32_t c = ce - cb;                                                \
    if (k == 1) {                                                       \
      if (r <= 64 && c <= 64) {                                         \
        for (int64_t j = 0; j < c; j++) {                               \
          for (int64_t i = 0; i < r; i++) {                             \
            dst[(j + cb) * n + (i + rb)] = src[(i + rb) * m + (j + cb)]; \
          }                                                             \
        }                                                               \
      } else if (c <= r) {                                              \
        map_transpose_##NAME(dst, src, k, m, n, cb, ce, rb, rb + r/2);  \
        map_transpose_##NAME(dst, src, k, m, n, cb, ce, rb + r/2, re);  \
      } else {                                                          \
        map_transpose_##NAME(dst, src, k, m, n, cb, cb + c/2, rb, re);  \
        map_transpose_##NAME(dst, src, k, m, n, cb + c/2, ce, rb, re);  \
      }                                                                 \
    } else {                                                            \
      for (int64_t i = 0; i < k; i++) {                                 \
        map_transpose_##NAME(dst + i * m * n, src + i * m * n,          \
                             1, m, n, cb, ce, rb, re);                  \
      }                                                                 \
    }                                                                   \
  }

// Straightforward LMAD copy function.
// (Note: no trailing backslash after the final brace, or the comment
// following the macro would be spliced into its definition.)
#define GEN_LMAD_COPY_ELEMENTS(NAME, ELEM_TYPE)                         \
  static void lmad_copy_elements_##NAME                                 \
  (int r,                                                               \
   ELEM_TYPE* dst, int64_t dst_strides[r],                              \
   ELEM_TYPE *src, int64_t src_strides[r],                              \
   int64_t shape[r]) {                                                  \
    if (r == 1) {                                                       \
      for (int i = 0; i < shape[0]; i++) {                              \
        dst[i*dst_strides[0]] = src[i*src_strides[0]];                  \
      }                                                                 \
    } else if (r > 1) {                                                 \
      for (int i = 0; i < shape[0]; i++) {                              \
        lmad_copy_elements_##NAME(r-1,                                  \
                                  dst+i*dst_strides[0], dst_strides+1,  \
                                  src+i*src_strides[0], src_strides+1,  \
                                  shape+1);                             \
      }                                                                 \
    }                                                                   \
  }

// Check whether this LMAD can be seen as a transposed 2D array.  This
// is done by checking every possible splitting point.
static bool lmad_is_tr(int64_t *n_out, int64_t *m_out,
                       int r,
                       const int64_t strides[r],
                       const int64_t shape[r]) {
  for (int i = 1; i < r; i++) {
    int n = 1, m = 1;
    bool ok = true;
    int64_t expected = 1;
    // Check strides before 'i'.
    for (int j = i-1; j >= 0; j--) {
      ok = ok && strides[j] == expected;
      expected *= shape[j];
      n *= shape[j];
    }
    // Check strides after 'i'.
    for (int j = r-1; j >= i; j--) {
      ok = ok && strides[j] == expected;
      expected *= shape[j];
      m *= shape[j];
    }
    if (ok) {
      *n_out = n;
      *m_out = m;
      return true;
    }
  }
  return false;
}
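// A minimal illustration of lmad_is_tr (hypothetical, not part of the
// generated code; the function name is invented): shape [2, 3, 4] with
// strides [3, 1, 6] is the column-major layout of a 6x4 array.
// Dimensions 0 and 1 collapse into n = 6 (row-major among themselves,
// innermost stride 1), and dimension 2 becomes m = 4 with stride 6, so
// the splitting point i = 2 succeeds.
#if 0
static void example_lmad_is_tr(void) {
  int64_t strides[3] = {3, 1, 6};
  int64_t shape[3] = {2, 3, 4};
  int64_t n, m;
  bool is_tr = lmad_is_tr(&n, &m, 3, strides, shape);
  // Here is_tr == true, n == 6, m == 4.
  (void)is_tr;
}
#endif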
// This function determines whether the 'dst' LMAD is row-major and
// the 'src' LMAD is column-major.  Both LMADs are for arrays of the
// same shape.  Both LMADs are allowed to have additional dimensions
// "on top".  Essentially, this function determines whether a copy
// from 'src' to 'dst' is a "map(transpose)" that we know how to
// implement efficiently.  The LMADs can have arbitrary rank, and the
// main challenge here is checking whether the src LMAD actually
// corresponds to a 2D column-major layout by morally collapsing
// dimensions.  There is a lot of looping here, but the actual trip
// count is going to be very low in practice.
//
// Returns true if this is indeed a map(transpose), and writes the
// number of arrays and the moral array size to the appropriate output
// parameters.
static bool lmad_map_tr(int64_t *num_arrays_out,
                        int64_t *n_out, int64_t *m_out,
                        int r,
                        const int64_t dst_strides[r],
                        const int64_t src_strides[r],
                        const int64_t shape[r]) {
  int64_t rowmajor_strides[r];
  rowmajor_strides[r-1] = 1;
  for (int i = r-2; i >= 0; i--) {
    rowmajor_strides[i] = rowmajor_strides[i+1] * shape[i+1];
  }

  // map_r will be the number of mapped dimensions on top.
  int map_r = 0;
  int64_t num_arrays = 1;
  for (int i = 0; i < r; i++) {
    if (dst_strides[i] != rowmajor_strides[i] ||
        src_strides[i] != rowmajor_strides[i]) {
      break;
    } else {
      num_arrays *= shape[i];
      map_r++;
    }
  }

  *num_arrays_out = num_arrays;

  if (memcmp(&rowmajor_strides[map_r],
             &dst_strides[map_r],
             sizeof(int64_t)*(r-map_r)) == 0) {
    return lmad_is_tr(n_out, m_out, r-map_r, src_strides+map_r, shape+map_r);
  } else if (memcmp(&rowmajor_strides[map_r],
                    &src_strides[map_r],
                    sizeof(int64_t)*(r-map_r)) == 0) {
    return lmad_is_tr(m_out, n_out, r-map_r, dst_strides+map_r, shape+map_r);
  }
  return false;
}

// Check if the strides correspond to row-major strides of *any*
// permutation of the shape.  This is done by recursive search with
// backtracking.  This is worst-case exponential, but hopefully the
// arrays we encounter do not have that many dimensions.
static bool lmad_contiguous_search(int checked, int64_t expected,
                                   int r,
                                   int64_t strides[r], int64_t shape[r],
                                   bool used[r]) {
  for (int i = 0; i < r; i++) {
    for (int j = 0; j < r; j++) {
      if (!used[j] && strides[j] == expected && strides[j] >= 0) {
        used[j] = true;
        if (checked+1 == r ||
            lmad_contiguous_search(checked+1, expected * shape[j],
                                   r, strides, shape, used)) {
          return true;
        }
        used[j] = false;
      }
    }
  }
  return false;
}

// Does this LMAD correspond to an array with positive strides and no
// holes?
static bool lmad_contiguous(int r, int64_t strides[r], int64_t shape[r]) {
  bool used[r];
  for (int i = 0; i < r; i++) {
    used[i] = false;
  }
  return lmad_contiguous_search(0, 1, r, strides, shape, used);
}

// Does this copy correspond to something that could be done with a
// memcpy()-like operation?  I.e. do the LMADs actually represent the
// same in-memory layout and are they contiguous?
static bool lmad_memcpyable(int r,
                            int64_t dst_strides[r], int64_t src_strides[r],
                            int64_t shape[r]) {
  if (!lmad_contiguous(r, dst_strides, shape)) {
    return false;
  }
  for (int i = 0; i < r; i++) {
    if (dst_strides[i] != src_strides[i] && shape[i] != 1) {
      return false;
    }
  }
  return true;
}

static void log_copy(struct futhark_context* ctx,
                     const char *kind, int r,
                     int64_t dst_offset, int64_t dst_strides[r],
                     int64_t src_offset, int64_t src_strides[r],
                     int64_t shape[r]) {
  if (ctx->logging) {
    fprintf(ctx->log, "\n# Copy %s\n", kind);
    fprintf(ctx->log, "Shape: ");
    for (int i = 0; i < r; i++) {
      fprintf(ctx->log, "[%ld]", (long int)shape[i]);
    }
    fprintf(ctx->log, "\n");
    fprintf(ctx->log, "Dst offset: %ld\n", (long int)dst_offset);
    fprintf(ctx->log, "Dst strides:");
    for (int i = 0; i < r; i++) {
      fprintf(ctx->log, " %ld", (long int)dst_strides[i]);
    }
    fprintf(ctx->log, "\n");
    fprintf(ctx->log, "Src offset: %ld\n", (long int)src_offset);
    fprintf(ctx->log, "Src strides:");
    for (int i = 0; i < r; i++) {
      fprintf(ctx->log, " %ld", (long int)src_strides[i]);
    }
    fprintf(ctx->log, "\n");
  }
}

static void log_transpose(struct futhark_context* ctx,
                          int64_t k, int64_t n, int64_t m) {
  if (ctx->logging) {
    fprintf(ctx->log, "## Transpose\n");
    fprintf(ctx->log, "Arrays     : %ld\n", (long int)k);
    fprintf(ctx->log, "X elements : %ld\n", (long int)m);
    fprintf(ctx->log, "Y elements : %ld\n", (long int)n);
    fprintf(ctx->log, "\n");
  }
}
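// A small illustration of the two predicates above (hypothetical, not
// part of the generated code; the function name is invented): strides
// {1, 3} with shape {3, 4} are the row-major strides of the permuted
// shape {4, 3}, so lmad_contiguous accepts them; but copying between
// that layout and a row-major one is not memcpyable, because
// corresponding strides differ on dimensions of extent greater than
// one.
#if 0
static void example_contiguity(void) {
  int64_t shape[2] = {3, 4};
  int64_t colmajor[2] = {1, 3}; // Row-major for the permuted shape {4, 3}.
  int64_t rowmajor[2] = {4, 1};
  bool a = lmad_contiguous(2, colmajor, shape);           // true
  bool b = lmad_memcpyable(2, colmajor, rowmajor, shape); // false: layouts differ
  bool c = lmad_memcpyable(2, rowmajor, rowmajor, shape); // true: identical, contiguous
  (void)a; (void)b; (void)c;
}
#endif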
#define GEN_LMAD_COPY(NAME, ELEM_TYPE)                                  \
  static void lmad_copy_##NAME                                          \
  (struct futhark_context *ctx, int r,                                  \
   ELEM_TYPE* dst, int64_t dst_offset, int64_t dst_strides[r],          \
   ELEM_TYPE *src, int64_t src_offset, int64_t src_strides[r],          \
   int64_t shape[r]) {                                                  \
    log_copy(ctx, "CPU to CPU", r, dst_offset, dst_strides,             \
             src_offset, src_strides, shape);                           \
    int64_t size = 1;                                                   \
    for (int i = 0; i < r; i++) { size *= shape[i]; }                   \
    if (size == 0) { return; }                                          \
    int64_t k, n, m;                                                    \
    if (lmad_map_tr(&k, &n, &m,                                         \
                    r, dst_strides, src_strides, shape)) {              \
      log_transpose(ctx, k, n, m);                                      \
      map_transpose_##NAME                                              \
        (dst+dst_offset, src+src_offset, k, n, m, 0, n, 0, m);          \
    } else if (lmad_memcpyable(r, dst_strides, src_strides, shape)) {   \
      if (ctx->logging) { fprintf(ctx->log, "## Flat copy\n\n"); }      \
      memcpy(dst+dst_offset, src+src_offset, size*sizeof(*dst));        \
    } else {                                                            \
      if (ctx->logging) { fprintf(ctx->log, "## General copy\n\n"); }   \
      lmad_copy_elements_##NAME                                         \
        (r,                                                             \
         dst+dst_offset, dst_strides,                                   \
         src+src_offset, src_strides, shape);                           \
    }                                                                   \
  }

GEN_MAP_TRANSPOSE(1b, uint8_t)
GEN_MAP_TRANSPOSE(2b, uint16_t)
GEN_MAP_TRANSPOSE(4b, uint32_t)
GEN_MAP_TRANSPOSE(8b, uint64_t)

GEN_LMAD_COPY_ELEMENTS(1b, uint8_t)
GEN_LMAD_COPY_ELEMENTS(2b, uint16_t)
GEN_LMAD_COPY_ELEMENTS(4b, uint32_t)
GEN_LMAD_COPY_ELEMENTS(8b, uint64_t)

GEN_LMAD_COPY(1b, uint8_t)
GEN_LMAD_COPY(2b, uint16_t)
GEN_LMAD_COPY(4b, uint32_t)
GEN_LMAD_COPY(8b, uint64_t)
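// A hypothetical usage sketch (disabled; example_transpose_copy and
// its setup are illustrative only, assuming a valid context in ctx):
// copying a 3x4 row-major uint32_t array into a column-major layout of
// the same shape.  The source strides {4, 1} are row-major and the
// destination strides {1, 3} are column-major, so lmad_map_tr detects
// a transpose and lmad_copy_4b dispatches to map_transpose_4b.
#if 0
static void example_transpose_copy(struct futhark_context *ctx,
                                   uint32_t dst[12], uint32_t src[12]) {
  int64_t shape[2] = {3, 4};
  int64_t src_strides[2] = {4, 1}; // Row-major 3x4.
  int64_t dst_strides[2] = {1, 3}; // Column-major 3x4: transposed in memory.
  lmad_copy_4b(ctx, 2, dst, 0, dst_strides, src, 0, src_strides, shape);
}
#endif

// End of copy.h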