/*
 * Single Precision Floating Point
 * for C6X architecture
 *
 * Copyright (C) 2016 Krzysztof Mazur
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 */
#ifndef SFV_C6X_H_INCLUDED
#define SFV_C6X_H_INCLUDED

/*
 * Sum of squares of a[0..n-1] (the squared Euclidean norm).
 * Two interleaved accumulators process the even- and odd-index
 * elements in parallel (two elements per loop cycle); an odd
 * tail element is folded in after the loop.
 */
#define __svl_norm __svl_norm
static inline float __svl_norm(const float *restrict a, unsigned int n)
{
	float sum1;
	float sum2;
	float t1, t2, t3, t4;

	__asm__ (
	"	mvc	.S2X	%8, ilc\n"
	" ||	mvk	.D1	0, %0\n"
	" ||	mvk	.D2	0, %1\n"
	"	nop	3\n"
	"	sploop	1\n"
	"	ldw	.D1T1	*%6++[2], %2\n"
	" ||	ldw	.D2T2	*%7++[2], %3\n"
	"	nop	4\n"
	"	mpysp	.M1	%2, %2, %4\n"
	" ||	mpysp	.M2	%3, %3, %5\n"
	"	nop	3\n"
	"	spkernel 0, 0\n"
	" ||	addsp	.L1	%0, %4, %0\n"
	" ||	addsp	.L2	%1, %5, %1\n"
	/* odd tail element; its square reuses the now-dead pointer %6 */
	"[%9]	ldw	.D1T1	*%6, %2\n"
	"	nop	4\n"
	"	mpysp	.M1	%2, %2, %6\n"
	"	nop	3\n"
	/* combine the partial sums still in flight in the ADDSP pipelines */
	"	or	.D1	0, %0, %2\n"
	" ||	or	.D2	0, %1, %3\n"
	"	addsp	.S1	%2, %0, %2\n"
	" ||	addsp	.S2	%3, %1, %3\n"
	"	or	.D1	0, %0, %4\n"
	" ||	or	.D2	0, %1, %5\n"
	"	addsp	.L1	%4, %0, %0\n"
	" ||	addsp	.L2	%5, %1, %1\n"
	"	nop	3\n"
	"	addsp	.L1	%2, %0, %0\n"
	" ||	addsp	.L2	%3, %1, %1\n"
	"	nop	3\n"
	"[%9]	addsp	.L1	%6, %0, %0\n"
	"	nop	3\n"
	: "=&a" (sum1), "=&b" (sum2), "=&a" (t1), "=&b" (t2),
	  "=&a" (t3), "=&b" (t4)
	: "a" (a), "b" (a + 1), "a" (n >> 1), "A" (n & 1)
	: "ILC");
	return sum1 + sum2;
}

/*
 * Dot product of a[0..n-1] and b[0..n-1], one element per cycle.
 */
#define __svl_prod __svl_prod
static float __svl_prod(const float *restrict a, const float *restrict b,
		unsigned int n)
{
	float sum;
	float t1, t2, t3;

	__asm__ (
	"	mvc	.S2X	%6, ilc\n"
	" ||	mvk	.D1	0, %0\n"
	"	nop	3\n"
	"	sploop	1\n"
	"	ldw	.D1T1	*%4++[1], %1\n"
	" ||	ldw	.D2T2	*%5++[1], %2\n"
	"	nop	4\n"
	"	mpysp	.M1X	%1, %2, %3\n"
	"	nop	3\n"
	"	spkernel 9, 0\n"
	" ||	addsp	.L1	%0, %3, %0\n"
	/* combine the partial sums still in flight in the ADDSP pipeline */
	"	or	.D1	0, %0, %1\n"
	"	addsp	.S1	%1, %0, %1\n"
	"	or	.D1	0, %0, %3\n"
	"	addsp	.L1	%3, %0, %0\n"
	"	nop	3\n"
	"	addsp	.L1	%1, %0, %0\n"
	"	nop	3\n"
	: "=&a" (sum), "=&a" (t1), "=&b" (t2), "=&a" (t3)
	: "a" (a), "b" (b), "a" (n)
	: "ILC");
	return sum;
}

#define __svl_mpy __svl_mpy
/*
 * c[i] = alpha * a[i] + beta * b[i]
 *
 * 2 cycles per loop iteration because we need 3 memory operations.
 * The two loads are issued in different cycles to avoid cache conflicts.
 */
static void __svl_mpy(const float *a, float alpha, const float *restrict b,
		float beta, unsigned int n, float *c)
{
	float t0, t1, t2, t3, t4;

	__asm__ __volatile__ (
	"	mvc	.S2X	%[n], ilc\n"
	"	nop	3\n"
	"	sploop	2\n"
	"	ldw	.D2T2	*%7++[1], %1\n"	/* 0 */
	"	ldw	.D1T1	*%5++[1], %0\n"	/* 1 */
	"	nop	3\n"			/* 2-4 */
	"	mpysp	.M2	%1, %8, %3\n"	/* 5 */
	"	mpysp	.M1	%0, %6, %2\n"	/* 6 */
	"	nop	3\n"			/* 7-9 */
	"	addsp	.L1X	%2, %3, %4\n"	/* 10 */
	"	nop	3\n"			/* 11-13 */
	"	spkernel 14, 0\n"
	" ||	stw	.D1T1	%4, *%9++[1]\n"	/* 14 */
	: "=&a" (t0), "=&b" (t1), "=&a" (t2), "=&b" (t3), "=&a" (t4)
	: "a" (a), "a" (alpha), "b" (b), "b" (beta), "a" (c), [n] "a" (n)
	: "ILC", "memory");
}

#endif
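
/*
 * Reference semantics, given here as a portable C sketch for
 * documentation only (the ref_* names are illustrative and not part of
 * this header).  Up to the floating-point reassociation introduced by
 * the pipelined, interleaved accumulators above, the three routines are
 * expected to behave like:
 *
 *	static float ref_norm(const float *a, unsigned int n)
 *	{
 *		float s = 0.0f;
 *		unsigned int i;
 *
 *		for (i = 0; i < n; i++)
 *			s += a[i] * a[i];	// sum of squares, no sqrt
 *		return s;
 *	}
 *
 *	static float ref_prod(const float *a, const float *b,
 *			unsigned int n)
 *	{
 *		float s = 0.0f;
 *		unsigned int i;
 *
 *		for (i = 0; i < n; i++)
 *			s += a[i] * b[i];	// dot product
 *		return s;
 *	}
 *
 *	static void ref_mpy(const float *a, float alpha, const float *b,
 *			float beta, unsigned int n, float *c)
 *	{
 *		unsigned int i;
 *
 *		for (i = 0; i < n; i++)
 *			c[i] = alpha * a[i] + beta * b[i];
 *	}
 */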