/*
 * Single Precision Floating Point
 * for C6X architecture
 *
 * Copyright (C) 2016 Krzysztof Mazur
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 */
#ifndef SFV_C6X_H_INCLUDED
#define SFV_C6X_H_INCLUDED

/*
 * Sum of squares of a[0..n-1] (the squared Euclidean norm).
 * Two interleaved accumulators process the even- and odd-index
 * elements in parallel (two elements per loop cycle); an odd
 * tail element is folded in after the loop.
 */
#define __svl_norm __svl_norm
static inline float __svl_norm(const float *restrict a, unsigned int n)
{
	float sum1;
	float sum2;
	float t1, t2, t3, t4;

	__asm__ (
	"	mvc	.S2X	%8, ilc\n"
	" ||	mvk	.D1	0, %0\n"
	" ||	mvk	.D2	0, %1\n"
	"	nop	3\n"
	"	sploop	1\n"
	"	ldw	.D1T1	*%6++[2], %2\n"
	" ||	ldw	.D2T2	*%7++[2], %3\n"
	"	nop	4\n"
	"	mpysp	.M1	%2, %2, %4\n"
	" ||	mpysp	.M2	%3, %3, %5\n"
	"	nop	3\n"
	"	spkernel 0, 0\n"
	" ||	addsp	.L1	%0, %4, %0\n"
	" ||	addsp	.L2	%1, %5, %1\n"
	/* odd tail element; its square reuses the now-dead pointer %6 */
	"[%9]	ldw	.D1T1	*%6, %2\n"
	"	nop	4\n"
	"	mpysp	.M1	%2, %2, %6\n"
	"	nop	3\n"
	/* combine the partial sums still in flight in the ADDSP pipelines */
	"	or	.D1	0, %0, %2\n"
	" ||	or	.D2	0, %1, %3\n"
	"	addsp	.S1	%2, %0, %2\n"
	" ||	addsp	.S2	%3, %1, %3\n"
	"	or	.D1	0, %0, %4\n"
	" ||	or	.D2	0, %1, %5\n"
	"	addsp	.L1	%4, %0, %0\n"
	" ||	addsp	.L2	%5, %1, %1\n"
	"	nop	3\n"
	"	addsp	.L1	%2, %0, %0\n"
	" ||	addsp	.L2	%3, %1, %1\n"
	"	nop	3\n"
	"[%9]	addsp	.L1	%6, %0, %0\n"
	"	nop	3\n"
	: "=&a" (sum1), "=&b" (sum2), "=&a" (t1), "=&b" (t2),
	  "=&a" (t3), "=&b" (t4)
	: "a" (a), "b" (a + 1), "a" (n >> 1), "A" (n & 1)
	: "ILC");
	return sum1 + sum2;
}

/*
 * Dot product of a[0..n-1] and b[0..n-1], one element per cycle.
 */
#define __svl_prod __svl_prod
static float __svl_prod(const float *restrict a, const float *restrict b,
		unsigned int n)
{
	float sum;
	float t1, t2, t3;

	__asm__ (
	"	mvc	.S2X	%6, ilc\n"
	" ||	mvk	.D1	0, %0\n"
	"	nop	3\n"
	"	sploop	1\n"
	"	ldw	.D1T1	*%4++[1], %1\n"
	" ||	ldw	.D2T2	*%5++[1], %2\n"
	"	nop	4\n"
	"	mpysp	.M1X	%1, %2, %3\n"
	"	nop	3\n"
	"	spkernel 9, 0\n"
	" ||	addsp	.L1	%0, %3, %0\n"
	/* combine the partial sums still in flight in the ADDSP pipeline */
	"	or	.D1	0, %0, %1\n"
	"	addsp	.S1	%1, %0, %1\n"
	"	or	.D1	0, %0, %3\n"
	"	addsp	.L1	%3, %0, %0\n"
	"	nop	3\n"
	"	addsp	.L1	%1, %0, %0\n"
	"	nop	3\n"
	: "=&a" (sum), "=&a" (t1), "=&b" (t2), "=&a" (t3)
	: "a" (a), "b" (b), "a" (n)
	: "ILC");
	return sum;
}

#define __svl_mpy __svl_mpy
/*
 * c[i] = alpha * a[i] + beta * b[i]
 *
 * 2 cycles per loop iteration because we need 3 memory operations.
 * The two loads are issued in different cycles to avoid cache conflicts.
 */
static void __svl_mpy(const float *a, float alpha, const float *restrict b,
		float beta, unsigned int n, float *c)
{
	float t0, t1, t2, t3, t4;

	__asm__ __volatile__ (
	"	mvc	.S2X	%[n], ilc\n"
	"	nop	3\n"
	"	sploop	2\n"
	"	ldw	.D2T2	*%7++[1], %1\n"	/* 0 */
	"	ldw	.D1T1	*%5++[1], %0\n"	/* 1 */
	"	nop	3\n"			/* 2-4 */
	"	mpysp	.M2	%1, %8, %3\n"	/* 5 */
	"	mpysp	.M1	%0, %6, %2\n"	/* 6 */
	"	nop	3\n"			/* 7-9 */
	"	addsp	.L1X	%2, %3, %4\n"	/* 10 */
	"	nop	3\n"			/* 11-13 */
	"	spkernel 14, 0\n"
	" ||	stw	.D1T1	%4, *%9++[1]\n"	/* 14 */
	: "=&a" (t0), "=&b" (t1), "=&a" (t2), "=&b" (t3), "=&a" (t4)
	: "a" (a), "a" (alpha), "b" (b), "b" (beta), "a" (c), [n] "a" (n)
	: "ILC", "memory");
}

#endif
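
/*
 * Reference semantics, given here as a portable C sketch for
 * documentation only (the ref_* names are illustrative and not part of
 * this header).  Up to the floating-point reassociation introduced by
 * the pipelined, interleaved accumulators above, the three routines are
 * expected to behave like:
 *
 *	static float ref_norm(const float *a, unsigned int n)
 *	{
 *		float s = 0.0f;
 *		unsigned int i;
 *
 *		for (i = 0; i < n; i++)
 *			s += a[i] * a[i];	// sum of squares, no sqrt
 *		return s;
 *	}
 *
 *	static float ref_prod(const float *a, const float *b,
 *			unsigned int n)
 *	{
 *		float s = 0.0f;
 *		unsigned int i;
 *
 *		for (i = 0; i < n; i++)
 *			s += a[i] * b[i];	// dot product
 *		return s;
 *	}
 *
 *	static void ref_mpy(const float *a, float alpha, const float *b,
 *			float beta, unsigned int n, float *c)
 *	{
 *		unsigned int i;
 *
 *		for (i = 0; i < n; i++)
 *			c[i] = alpha * a[i] + beta * b[i];
 *	}
 */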