author     Wladimir J. van der Laan <laanwj@gmail.com>  2017-07-13 15:43:09 +0000
committer  Robert Foss <robert.foss@collabora.com>      2017-07-14 16:33:51 +0200
commit     eb8af0b4a19a2fce13d88adfc47341c1c5299f5e
tree       aedd9e3a62680aafbaa566a0a036537372f06961
parent     ef657dc4037787d71aae7938fd38337ce08f9f0e
etnaviv: NEON implementations of tiling/untiling
Experimental NEON implementation of tiling/untiling that adds specialized NEON functions for tiling 8, 16, and 32 bit-per-element 4x4 tiles. To optimize memory read/write sizes, there are also functions that process multiple horizontally adjacent tiles at once; these are picked automatically when the width is an appropriate multiple. Mesa needs to be compiled with -mfpu=neon in CFLAGS for this to work, otherwise a compile error is thrown.
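For orientation, the layout these specializations target can be sketched in plain C (hypothetical helper, not part of the patch): a 4x4 tile of 32-bit elements is gathered from four source rows, cpu_stride bytes apart, into 64 contiguous bytes of the tiled destination, which is what tile32_1x_impl below does with four NEON row loads and one vstm store.

#include <stdint.h>

/* Hypothetical scalar reference for one 4x4 tile of 32-bit elements:
 * gather four linear rows (cpu_stride bytes apart) into the 64
 * contiguous bytes that make up one tile of the tiled layout. */
static void tile32_scalar_ref(void *gpu, const void *cpu, uint32_t cpu_stride)
{
   uint32_t *dst = gpu;
   for (unsigned row = 0; row < 4; ++row) {
      const uint32_t *src = (const uint32_t *)((const char *)cpu + row * cpu_stride);
      for (unsigned col = 0; col < 4; ++col)
         *dst++ = src[col]; /* row-major within the tile */
   }
}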
-rw-r--r--src/gallium/drivers/etnaviv/Makefile.sources2
-rw-r--r--src/gallium/drivers/etnaviv/etnaviv_tiling.c449
2 files changed, 434 insertions, 17 deletions
diff --git a/src/gallium/drivers/etnaviv/Makefile.sources b/src/gallium/drivers/etnaviv/Makefile.sources
index 60275c9..3bad4f5 100644
--- a/src/gallium/drivers/etnaviv/Makefile.sources
+++ b/src/gallium/drivers/etnaviv/Makefile.sources
@@ -45,7 +45,7 @@ C_SOURCES := \
etnaviv_surface.h \
etnaviv_texture.c \
etnaviv_texture.h \
- etnaviv_tiling.c \
+ etnaviv_tiling.c.neon \
etnaviv_tiling.h \
etnaviv_transfer.c \
etnaviv_transfer.h \
diff --git a/src/gallium/drivers/etnaviv/etnaviv_tiling.c b/src/gallium/drivers/etnaviv/etnaviv_tiling.c
index f4f85c1..97d1a21 100644
--- a/src/gallium/drivers/etnaviv/etnaviv_tiling.c
+++ b/src/gallium/drivers/etnaviv/etnaviv_tiling.c
@@ -63,19 +63,408 @@
} \
}
+/** NEON specializations */
+
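+/*
+ * The CPU pointer is passed as a tied output/input pair ("=r"(cpu)
+ * together with "0"(cpu)) because the post-indexed addressing
+ * ([%0], %r2) advances it by cpu_stride after every row, so the
+ * compiler must not assume the register still holds the original
+ * pointer afterwards. Strictly speaking a "memory" clobber would
+ * also be warranted, as the vld1/vst1 and vldm/vstm instructions
+ * access memory behind the compiler's back.
+ */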
+/* tile: special implementation for basex%4==0 and basey%4==0, elmtsize==4 */
+inline void tile32_1x_impl(void *gpu, const void *cpu, uint32_t cpu_stride)
+{
+ __asm__ volatile (
+ "vld1.8 {d0,d1}, [%0], %r2;\n"
+ "vld1.8 {d2,d3}, [%0], %r2;\n"
+ "vld1.8 {d4,d5}, [%0], %r2;\n"
+ "vld1.8 {d6,d7}, [%0], %r2;\n"
+ "vstm %1, {q0, q1, q2, q3};\n"
+ : "=r"(cpu) /* post-incremented by the loads above */
+ : "r"(gpu), "r"(cpu_stride), "0"(cpu)
+ : "q0", "q1", "q2", "q3");
+}
+
+/* tile: special implementation for basex%4==0 and basey%4==0, elmtsize==4, two consecutive tiles */
+inline void tile32_2x_impl(void *gpu, const void *cpu, uint32_t cpu_stride)
+{
+ const void *cpunext = cpu + 16;
+ __asm__ volatile (
+ "vld1.8 {d0,d1}, [%0], %r3;\n"
+ "vld1.8 {d8,d9}, [%1], %r3;\n"
+ "vld1.8 {d2,d3}, [%0], %r3;\n"
+ "vld1.8 {d10,d11}, [%1], %r3;\n"
+ "vld1.8 {d4,d5}, [%0], %r3;\n"
+ "vld1.8 {d12,d13}, [%1], %r3;\n"
+ "vld1.8 {d6,d7}, [%0], %r3;\n"
+ "vld1.8 {d14,d15}, [%1], %r3;\n"
+ "vstm %2, {q0, q1, q2, q3, q4, q5, q6, q7};\n"
+ : "=r"(cpu), "=r"(cpunext)
+ : "r"(gpu), "r"(cpu_stride), "0"(cpu), "1"(cpunext)
+ : "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7");
+}
+
+/* tile: special implementation for basex%4==0 and basey%4==0, elmtsize==2 */
+inline void tile16_1x_impl(void *gpu, const void *cpu, uint32_t cpu_stride)
+{
+ __asm__ volatile (
+ "vld1.8 {d0}, [%0], %r2;\n"
+ "vld1.8 {d1}, [%0], %r2;\n"
+ "vld1.8 {d2}, [%0], %r2;\n"
+ "vld1.8 {d3}, [%0], %r2;\n"
+ "vstm %1, {q0, q1};\n"
+ : "=r"(cpu) /* post-incremented by the loads above */
+ : "r"(gpu), "r"(cpu_stride), "0"(cpu)
+ : "q0", "q1");
+}
+
+/* tile: special implementation for basex%4==0 and basey%4==0, elmtsize==2, two tiles (in X) at once */
+inline void tile16_2x_impl(void *gpu, const void *cpu, uint32_t cpu_stride)
+{
+ const void *cpunext = cpu + 8;
+ __asm__ volatile (
+ /* load two adjacent tiles from untiled */
+ "vld1.8 {d0}, [%0], %r3;\n"
+ "vld1.8 {d4}, [%1], %r3;\n"
+ "vld1.8 {d1}, [%0], %r3;\n"
+ "vld1.8 {d5}, [%1], %r3;\n"
+ "vld1.8 {d2}, [%0], %r3;\n"
+ "vld1.8 {d6}, [%1], %r3;\n"
+ "vld1.8 {d3}, [%0], %r3;\n"
+ "vld1.8 {d7}, [%1], %r3;\n"
+ /* store two adjacent tiles, tiled */
+ "vstm %2, {q0, q1, q2, q3};\n"
+ : "=r"(cpu), "=r"(cpunext)
+ : "r"(gpu), "r"(cpu_stride), "0"(cpu), "1"(cpunext)
+ : "q0", "q1", "q2", "q3");
+}
+
+/* tile: special implementation for basex%4==0 and basey%4==0, elmtsize==1 */
+inline void tile8_1x_impl(void *gpu, const void *cpu, uint32_t cpu_stride)
+{
+ __asm__ volatile (
+ "vld1.32 {d0[0]}, [%0], %r2;\n"
+ "vld1.32 {d0[1]}, [%0], %r2;\n"
+ "vld1.32 {d1[0]}, [%0], %r2;\n"
+ "vld1.32 {d1[1]}, [%0], %r2;\n"
+ "vstm %1, {d0-d1};\n"
+ : "=r"(cpu)
+ : "r"(gpu), "r"(cpu_stride), "0"(cpu)
+ : "q0", "q1");
+}
+
+/* tile: special implementation for basex%4==0 and basey%4==0, elmtsize==1, two tiles (in X) at once */
+inline void tile8_2x_impl(void *gpu, const void *cpu, uint32_t cpu_stride)
+{
+ __asm__ volatile (
+ /* load two adjacent tiles, from untiled */
+ "vld1.8 {d0}, [%0], %r2;\n"
+ "vld1.8 {d1}, [%0], %r2;\n"
+ "vld1.8 {d2}, [%0], %r2;\n"
+ "vld1.8 {d3}, [%0], %r2;\n"
+ /* Transpose:
+ * Start
+ * [d0] x1 x0
+ * [d1] x3 x2
+ * [d2] x5 x4
+ * [d3] x7 x6
+ */
+ "vtrn.32 d0, d1;\n"
+ "vtrn.32 d2, d3;\n"
+ /* [d0] x2 x0
+ * [d1] x3 x1
+ * [d2] x6 x4
+ * [d3] x7 x5
+ */
+ "vswp d1, d2;\n"
+ /* [d0] x2 x0
+ * [d1] x6 x4
+ * [d2] x3 x1
+ * [d3] x7 x5
+ */
+ /* store two adjacent tiles, to tiled */
+ "vstm %1, {d0-d3};\n"
+ : "=r"(cpu)
+ : "r"(gpu), "r"(cpu_stride), "0"(cpu)
+ : "q0", "q1");
+}
+
+/* tile: special implementation for basex%4==0 and basey%4==0, elmtsize==1, four tiles (in X) at once */
+inline void tile8_4x_impl(void *gpu, const void *cpu, uint32_t cpu_stride)
+{
+ __asm__ volatile (
+ /* load four adjacent tiles, from untiled */
+ "vld1.8 {d0,d1}, [%0], %r2;\n"
+ "vld1.8 {d2,d3}, [%0], %r2;\n"
+ "vld1.8 {d4,d5}, [%0], %r2;\n"
+ "vld1.8 {d6,d7}, [%0], %r2;\n"
+ /* Transpose:
+ * Start
+ * [q0] x3 x2 x1 x0
+ * [q1] x7 x6 x5 x4
+ * [q2] x11 x10 x9 x8
+ * [q3] x15 x14 x13 x12
+ */
+ "vtrn.32 q0, q1;\n"
+ "vtrn.32 q2, q3;\n"
+ /* [q0] x6 x2 x4 x0
+ * [q1] x7 x3 x5 x1
+ * [q2] x14 x10 x12 x8
+ * [q3] x15 x11 x13 x9
+ */
+ "vswp d1, d4;\n"
+ "vswp d3, d6;\n"
+ /* [q0] x12 x8 x4 x0
+ * [q1] x13 x9 x5 x1
+ * [q2] x14 x10 x6 x2
+ * [q3] x15 x11 x7 x3
+ */
+ /* store four adjacent tiles, to tiled */
+ "vstm %1, {q0, q1, q2, q3};\n"
+ : "=r"(cpu)
+ : "r"(gpu), "r"(cpu_stride), "0"(cpu)
+ : "q0", "q1", "q2", "q3");
+}
+
+/* untile: special implementation for basex%4==0 and basey%4==0, elmtsize==4 */
+inline void untile32_1x_impl(const void *gpu, void *cpu, uint32_t cpu_stride)
+{
+ __asm__ volatile (
+ "vldm %1, {q0, q1, q2, q3};\n"
+ "vst1.8 {d0,d1}, [%0], %r2;\n"
+ "vst1.8 {d2,d3}, [%0], %r2;\n"
+ "vst1.8 {d4,d5}, [%0], %r2;\n"
+ "vst1.8 {d6,d7}, [%0], %r2;\n"
+ : "=r"(cpu)
+ : "r"(gpu), "r"(cpu_stride), "0"(cpu)
+ : "q0", "q1", "q2", "q3");
+}
+
+/* untile: special implementation for basex%4==0 and basey%4==0, elmtsize==4, two consecutive tiles */
+inline void untile32_2x_impl(const void *gpu, void *cpu, uint32_t cpu_stride)
+{
+ void *cpunext = cpu + 16;
+ __asm__ volatile (
+ "vldm %2, {q0, q1, q2, q3, q4, q5, q6, q7};\n"
+ "vst1.8 {d0,d1}, [%0], %r3;\n"
+ "vst1.8 {d8,d9}, [%1], %r3;\n"
+ "vst1.8 {d2,d3}, [%0], %r3;\n"
+ "vst1.8 {d10,d11}, [%1], %r3;\n"
+ "vst1.8 {d4,d5}, [%0], %r3;\n"
+ "vst1.8 {d12,d13}, [%1], %r3;\n"
+ "vst1.8 {d6,d7}, [%0], %r3;\n"
+ "vst1.8 {d14,d15}, [%1], %r3;\n"
+ : "=r"(cpu), "=r"(cpunext)
+ : "r"(gpu), "r"(cpu_stride), "0"(cpu), "1"(cpunext)
+ : "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7");
+}
+
+/* untile: special implementation for basex%4==0 and basey%4==0, elmtsize==2 */
+inline void untile16_1x_impl(const void *gpu, void *cpu, uint32_t cpu_stride)
+{
+ __asm__ volatile (
+ "vldm %1, {q0, q1};\n"
+ "vst1.8 {d0}, [%0], %r2;\n"
+ "vst1.8 {d1}, [%0], %r2;\n"
+ "vst1.8 {d2}, [%0], %r2;\n"
+ "vst1.8 {d3}, [%0], %r2;\n"
+ : "=r"(cpu)
+ : "r"(gpu), "r"(cpu_stride), "0"(cpu)
+ : "q0", "q1");
+}
+
+/* untile: special implementation for basex%4==0 and basey%4==0, elmtsize==2, two tiles (in X) at once */
+inline void untile16_2x_impl(const void *gpu, void *cpu, uint32_t cpu_stride)
+{
+ void *cpunext = cpu + 8;
+ __asm__ volatile (
+ /* load two adjacent tiles, tiled */
+ "vldm %2, {q0, q1, q2, q3};\n"
+ /* store two adjacent tiles, untiled */
+ "vst1.8 {d0}, [%0], %r3;\n"
+ "vst1.8 {d4}, [%1], %r3;\n"
+ "vst1.8 {d1}, [%0], %r3;\n"
+ "vst1.8 {d5}, [%1], %r3;\n"
+ "vst1.8 {d2}, [%0], %r3;\n"
+ "vst1.8 {d6}, [%1], %r3;\n"
+ "vst1.8 {d3}, [%0], %r3;\n"
+ "vst1.8 {d7}, [%1], %r3;\n"
+ : "=r"(cpu), "=r"(cpunext)
+ : "r"(gpu), "r"(cpu_stride), "0"(cpu), "1"(cpunext)
+ : "q0", "q1", "q2", "q3");
+}
+
+/* untile: special implementation for basex%4==0 and basey%4==0, elmtsize==1 */
+inline void untile8_1x_impl(const void *gpu, void *cpu, uint32_t cpu_stride)
+{
+ __asm__ volatile (
+ "vldm %1, {d0-d1};\n"
+ "vst1.32 {d0[0]}, [%0], %r2;\n"
+ "vst1.32 {d0[1]}, [%0], %r2;\n"
+ "vst1.32 {d1[0]}, [%0], %r2;\n"
+ "vst1.32 {d1[1]}, [%0], %r2;\n"
+ : "=r"(cpu)
+ : "r"(gpu), "r"(cpu_stride), "0"(cpu)
+ : "q0", "q1");
+}
+
+/* untile: special implementation for basex%4==0 and basey%4==0, elmtsize==1, two tiles (in X) at once */
+inline void untile8_2x_impl(const void *gpu, void *cpu, uint32_t cpu_stride)
+{
+ __asm__ volatile (
+ /* load two adjacent tiles, from tiled */
+ "vldm %1, {d0-d3};\n"
+ /* Transpose:
+ * Start
+ * [d0] x2 x0
+ * [d1] x6 x4
+ * [d2] x3 x1
+ * [d3] x7 x5
+ */
+ "vswp d1, d2;\n"
+ /* [d0] x2 x0
+ * [d1] x3 x1
+ * [d2] x6 x4
+ * [d3] x7 x5
+ */
+ "vtrn.32 d0, d1;\n"
+ "vtrn.32 d2, d3;\n"
+ /* [d0] x1 x0
+ * [d1] x3 x2
+ * [d2] x5 x4
+ * [d3] x7 x6
+ */
+ /* store two adjacent tiles, to untiled */
+ "vst1.8 {d0}, [%0], %r2;\n"
+ "vst1.8 {d1}, [%0], %r2;\n"
+ "vst1.8 {d2}, [%0], %r2;\n"
+ "vst1.8 {d3}, [%0], %r2;\n"
+ : "=r"(cpu)
+ : "r"(gpu), "r"(cpu_stride), "0"(cpu)
+ : "q0", "q1");
+}
+
+/* untile: special implementation for basex%4==0 and basey%4==0, elmtsize==1, four tiles (in X) at once */
+inline void untile8_4x_impl(const void *gpu, void *cpu, uint32_t cpu_stride)
+{
+ __asm__ volatile (
+ /* load four adjacent tiles, from tiled */
+ "vldm %1, {q0, q1, q2, q3};\n"
+ /* Transpose:
+ * Start
+ * [q0] x12 x8 x4 x0
+ * [q1] x13 x9 x5 x1
+ * [q2] x14 x10 x6 x2
+ * [q3] x15 x11 x7 x3
+ */
+ "vswp d1, d4;\n"
+ "vswp d3, d6;\n"
+ /* [q0] x6 x2 x4 x0
+ * [q1] x7 x3 x5 x1
+ * [q2] x14 x10 x12 x8
+ * [q3] x15 x11 x13 x9
+ */
+ "vtrn.32 q0, q1;\n"
+ "vtrn.32 q2, q3;\n"
+ /* [q0] x3 x2 x1 x0
+ * [q1] x7 x6 x5 x4
+ * [q2] x11 x10 x9 x8
+ * [q3] x15 x14 x13 x12
+ */
+ /* store four adjacent tiles, to untiled */
+ "vst1.8 {d0,d1}, [%0], %r2;\n"
+ "vst1.8 {d2,d3}, [%0], %r2;\n"
+ "vst1.8 {d4,d5}, [%0], %r2;\n"
+ "vst1.8 {d6,d7}, [%0], %r2;\n"
+ : "=r"(cpu)
+ : "r"(gpu), "r"(cpu_stride), "0"(cpu)
+ : "q0", "q1", "q2", "q3");
+}
+
+/*** Tile visitor functions */
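+/*
+ * In these macros elmtsize is in bits (the public entry points below
+ * take element size in bytes). Each func##_impl call handles htiles
+ * horizontally adjacent 4x4 tiles: htiles*elmtsize/8*16 contiguous
+ * bytes on the tiled side, and a strip of htiles*elmtsize/8*4 bytes
+ * from each of four rows on the linear side.
+ */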
+#define TILE_FUNC(elmtsize,htiles,func) \
+ static void func(void *gpu, const void *cpu, uint32_t gpu_stride, uint32_t cpu_stride, uint32_t width, uint32_t height) \
+ { \
+ for (uint32_t y=0; y<height; y+=4) { \
+ void *gpu_tile = gpu; \
+ const void *cpu_tile = cpu; \
+ for (uint32_t x=0; x<width; x+=htiles*4) { \
+ func##_impl(gpu_tile, cpu_tile, cpu_stride);\
+ gpu_tile += htiles*elmtsize/8*16; \
+ cpu_tile += htiles*elmtsize/8*4; \
+ } \
+ gpu += gpu_stride; \
+ cpu += cpu_stride*4; \
+ } \
+ }
+
+#define UNTILE_FUNC(elmtsize,htiles,func) \
+ static void func(const void *gpu, void *cpu, uint32_t gpu_stride, uint32_t cpu_stride, uint32_t width, uint32_t height) \
+ { \
+ for (uint32_t y=0; y<height; y+=4) { \
+ const void *gpu_tile = gpu; \
+ void *cpu_tile = cpu; \
+ for (uint32_t x=0; x<width; x+=htiles*4) { \
+ func##_impl(gpu_tile, cpu_tile, cpu_stride);\
+ gpu_tile += htiles*elmtsize/8*16; \
+ cpu_tile += htiles*elmtsize/8*4; \
+ } \
+ gpu += gpu_stride; \
+ cpu += cpu_stride*4; \
+ } \
+ }
+
+TILE_FUNC(32, 1, tile32_1x);
+TILE_FUNC(32, 2, tile32_2x);
+TILE_FUNC(16, 1, tile16_1x);
+TILE_FUNC(16, 2, tile16_2x);
+TILE_FUNC(8, 1, tile8_1x);
+TILE_FUNC(8, 2, tile8_2x);
+TILE_FUNC(8, 4, tile8_4x);
+UNTILE_FUNC(32, 1, untile32_1x);
+UNTILE_FUNC(32, 2, untile32_2x);
+UNTILE_FUNC(16, 1, untile16_1x);
+UNTILE_FUNC(16, 2, untile16_2x);
+UNTILE_FUNC(8, 1, untile8_1x);
+UNTILE_FUNC(8, 2, untile8_2x);
+UNTILE_FUNC(8, 4, untile8_4x);
+
void
etna_texture_tile(void *dest, void *src, unsigned basex, unsigned basey,
unsigned dst_stride, unsigned width, unsigned height,
unsigned src_stride, unsigned elmtsize)
{
- if (elmtsize == 4) {
- DO_TILE(uint32_t)
- } else if (elmtsize == 2) {
- DO_TILE(uint16_t)
- } else if (elmtsize == 1) {
- DO_TILE(uint8_t)
- } else {
- printf("etna_texture_tile: unhandled element size %i\n", elmtsize);
+ if ((basex & 3)==0 && (basey & 3)==0 && (width & 3)==0 && (height & 3) == 0) {
+ /* specialized assembler implementations */
+ dest += basey*dst_stride + (basex/4)*4*4*elmtsize;
+ if (elmtsize == 4) {
+ if ((width & 7)==0) {
+ tile32_2x(dest, src, dst_stride*4, src_stride, width, height);
+ } else {
+ tile32_1x(dest, src, dst_stride*4, src_stride, width, height);
+ }
+ } else if (elmtsize == 2) {
+ if ((width & 7)==0) {
+ tile16_2x(dest, src, dst_stride*4, src_stride, width, height);
+ } else {
+ tile16_1x(dest, src, dst_stride*4, src_stride, width, height);
+ }
+ } else if (elmtsize == 1) {
+ if ((width & 15)==0) {
+ tile8_4x(dest, src, dst_stride*4, src_stride, width, height);
+ } else if ((width & 7)==0) {
+ tile8_2x(dest, src, dst_stride*4, src_stride, width, height);
+ } else {
+ tile8_1x(dest, src, dst_stride*4, src_stride, width, height);
+ }
+ } else {
+ printf("etna_texture_tile: unhandled element size %i\n", elmtsize);
+ }
+ } else { /* fallback */
+ if (elmtsize == 4) {
+ DO_TILE(uint32_t)
+ } else if (elmtsize == 2) {
+ DO_TILE(uint16_t)
+ } else if (elmtsize == 1) {
+ DO_TILE(uint8_t)
+ } else {
+ printf("etna_texture_tile: unhandled element size %i\n", elmtsize);
+ }
}
}
@@ -84,13 +473,41 @@ etna_texture_untile(void *dest, void *src, unsigned basex, unsigned basey,
unsigned src_stride, unsigned width, unsigned height,
unsigned dst_stride, unsigned elmtsize)
{
- if (elmtsize == 4) {
- DO_UNTILE(uint32_t);
- } else if (elmtsize == 2) {
- DO_UNTILE(uint16_t);
- } else if (elmtsize == 1) {
- DO_UNTILE(uint8_t);
- } else {
- printf("etna_texture_tile: unhandled element size %i\n", elmtsize);
+ if ((basex & 3)==0 && (basey & 3)==0 && (width & 3)==0 && (height & 3) == 0) {
+ /* specialized assembler implementations */
+ src += basey*src_stride + (basex/4)*4*4*elmtsize;
+ if (elmtsize == 4) {
+ if ((width & 7)==0) {
+ untile32_2x(src, dest, src_stride*4, dst_stride, width, height);
+ } else {
+ untile32_1x(src, dest, src_stride*4, dst_stride, width, height);
+ }
+ } else if (elmtsize == 2) {
+ if ((width & 7)==0) {
+ untile16_2x(src, dest, src_stride*4, dst_stride, width, height);
+ } else {
+ untile16_1x(src, dest, src_stride*4, dst_stride, width, height);
+ }
+ } else if (elmtsize == 1) {
+ if ((width & 15)==0) {
+ untile8_4x(src, dest, src_stride*4, dst_stride, width, height);
+ } else if ((width & 7)==0) {
+ untile8_2x(src, dest, src_stride*4, dst_stride, width, height);
+ } else {
+ untile8_1x(src, dest, src_stride*4, dst_stride, width, height);
+ }
+ } else {
+ printf("etna_texture_untile: unhandled element size %i\n", elmtsize);
+ }
+ } else { /* fallback */
+ if (elmtsize == 4) {
+ DO_UNTILE(uint32_t);
+ } else if (elmtsize == 2) {
+ DO_UNTILE(uint16_t);
+ } else if (elmtsize == 1) {
+ DO_UNTILE(uint8_t);
+ } else {
+ printf("etna_texture_untile: unhandled element size %i\n", elmtsize);
+ }
}
}
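Since the rewritten etnaviv_tiling.c now requires -mfpu=neon, a build without NEON fails outright. One possible follow-up (a hypothetical sketch, not part of this patch) is to key the specializations off the compiler's NEON macros so the generic DO_TILE/DO_UNTILE paths remain the fallback on non-NEON builds:

/* Hypothetical sketch, not part of this patch: __ARM_NEON__ (GCC with
 * -mfpu=neon) and __ARM_NEON (ACLE) indicate NEON support. The *_impl
 * functions and visitor macros would be compiled only under this
 * guard, with the dispatch condition below short-circuiting to the
 * generic path otherwise. */
#if defined(__ARM_NEON__) || defined(__ARM_NEON)
#define ETNA_HAVE_NEON 1
#else
#define ETNA_HAVE_NEON 0
#endif

/* ...in etna_texture_tile() / etna_texture_untile(): */
if (ETNA_HAVE_NEON &&
    (basex & 3) == 0 && (basey & 3) == 0 &&
    (width & 3) == 0 && (height & 3) == 0) {
   /* specialized assembler implementations */
} else {
   /* generic fallback */
}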