dotprod_av.c - platform/external/fec - Git at Google

 /* 16-bit signed integer dot product
  * Altivec-assisted version
  * Copyright 2004 Phil Karn
  * May be used under the terms of the GNU Lesser General Public License (LGPL)
  */
 #include <stdint.h>
 #include <stdlib.h>
 #include "fec.h"

 /* Descriptor for the Altivec dot product: built by initdp_av(),
  * consumed by dotprod_av() and released by freedp_av().
  */
 struct dotprod {
   int len; /* Number of coefficients */

   /* On an Altivec machine, these hold 8 copies of the coefficients.
    * Copy i is preshifted by i 16-bit elements (i = 0..7), i.e. the first
    * coefficient is stored at index i. dotprod_av() picks the copy whose
    * shift equals the offset of the input data within its 16-byte block.
    */
   signed short *coeffs[8];
 };

 /* Create and return a descriptor for use with the dot product function.
  *
  * coeffs: array of len 16-bit coefficients. The values are copied, so
  *         the caller keeps ownership of the array.
  * len:    number of coefficients.
  *
  * Returns a heap-allocated descriptor, to be released with freedp_av(),
  * or NULL if len is not positive or if memory cannot be allocated.
  */
 void *initdp_av(signed short coeffs[],int len){
   struct dotprod *dp;
   int i,j;

   /* A negative length would turn into a bogus allocation size below */
   if(len <= 0)
     return NULL;

   dp = calloc(1,sizeof *dp);
   if(dp == NULL)
     return NULL;
   dp->len = len;

   /* Make 8 copies of the coefficients, one for each possible offset of
    * the input data within a 16-byte block. Copy i starts with i zero
    * elements; calloc() also zeroes the padding after the last
    * coefficient.
    * NOTE(review): the vector loads in dotprod_av() expect each copy to
    * start on a 16-byte boundary; this relies on calloc() returning
    * memory with that alignment - confirm for the target platform.
    */
   for(i=0;i<8;i++){
     dp->coeffs[i] = calloc(1+(len+i-1)/8,sizeof(vector signed short));
     if(dp->coeffs[i] == NULL){
       /* Out of memory: release the copies made so far and give up */
       while(i-- > 0)
         free(dp->coeffs[i]);
       free(dp);
       return NULL;
     }
     for(j=0;j<len;j++)
       dp->coeffs[i][j+i] = coeffs[j];
   }
   return dp;
 }


 /* Free a dot product descriptor created earlier by initdp_av().
  * p may be NULL (initdp_av() returns NULL when given no coefficients);
  * in that case nothing is done.
  */
 void freedp_av(void *p){
   struct dotprod *dp = p;
   int i;

   if(dp == NULL)
     return;

   /* free(NULL) is a no-op, so empty slots need no special casing */
   for(i=0;i<8;i++)
     free(dp->coeffs[i]);
   free(dp);
 }

 /* Compute a dot product given a descriptor and an input array.
  * The length is taken from the descriptor.
  *
  * p: descriptor returned by initdp_av(). A NULL descriptor (what
  *    initdp_av() returns when given no coefficients) yields 0.
  * a: input samples. Assumed to be 2-byte aligned, since the offset
  *    within the 16-byte block is computed in units of signed short.
  *    Data is loaded in whole 16-byte blocks, starting with the block
  *    that contains a[0].
  *
  * Returns element 3 of the vec_sums() result, i.e. the total of the
  * four 32-bit partial sums.
  */
 long dotprod_av(void *p,signed short a[]){
   struct dotprod *dp = p;
   /* Use a pointer-sized integer: casting to int would truncate the
    * address on 64-bit targets.
    */
   uintptr_t addr = (uintptr_t)a;
   int al;
   vector signed short *ar,*d;
   vector signed int sums0,sums1,sums2,sums3;
   union { vector signed int v; signed int w[4];} s;
   int nblocks;

   if(dp == NULL)
     return 0;

   /* round ar down to beginning of 16-byte block containing 0th element of
    * input buffer. Then set d to one of 8 sets of shifted coefficients
    */
   ar = (vector signed short *)(addr & ~(uintptr_t)15);
   al = (int)((addr & 15)/sizeof(signed short));
   d = (vector signed short *)dp->coeffs[al];

   /* Number of 16-byte blocks spanned by al leading elements plus len
    * coefficients; matches the size allocated in initdp_av().
    */
   nblocks = (dp->len+al-1)/8+1;

   /* Sum into four vectors each holding four 32-bit partial sums.
    * Blocks are consumed from the end of the buffers, four at a time.
    */
   sums3 = sums2 = sums1 = sums0 = (vector signed int)(0);
   while(nblocks >= 4){
     sums0 = vec_msums(ar[nblocks-1],d[nblocks-1],sums0);
     sums1 = vec_msums(ar[nblocks-2],d[nblocks-2],sums1);
     sums2 = vec_msums(ar[nblocks-3],d[nblocks-3],sums2);
     sums3 = vec_msums(ar[nblocks-4],d[nblocks-4],sums3);
     nblocks -= 4;
   }
   /* Fold the four accumulators into sums0 */
   sums0 = vec_adds(sums0,sums1);
   sums2 = vec_adds(sums2,sums3);
   sums0 = vec_adds(sums0,sums2);
   /* Handle the 0..3 blocks left over at the start of the buffers */
   while(nblocks-- > 0){
     sums0 = vec_msums(ar[nblocks],d[nblocks],sums0);
   }
   /* Sum 4 partial sums into final result */
   s.v = vec_sums(sums0,(vector signed int)(0));

   return s.w[3];
 }
	/* 16-bit signed integer dot product
	* Altivec-assisted version
	* Copyright 2004 Phil Karn
	* May be used under the terms of the GNU Lesser General Public License (LGPL)
	*/
	#include <stdlib.h>
	#include "fec.h"

	/* Descriptor for the Altivec dot product: built by initdp_av(),
	 * consumed by dotprod_av() and released by freedp_av().
	 */
	struct dotprod {
	int len; /* Number of coefficients */

	/* On an Altivec machine, these hold 8 copies of the coefficients.
	 * Copy i is preshifted by i 16-bit elements (i = 0..7), i.e. the first
	 * coefficient is stored at index i. dotprod_av() picks the copy whose
	 * shift equals the offset of the input data within its 16-byte block.
	 */
	signed short *coeffs[8];
	};

	/* Create and return a descriptor for use with the dot product function.
	 *
	 * coeffs: array of len 16-bit coefficients. The values are copied, so
	 *         the caller keeps ownership of the array.
	 * len:    number of coefficients.
	 *
	 * Returns a heap-allocated descriptor, to be released with freedp_av(),
	 * or NULL if len is not positive or if memory cannot be allocated.
	 */
	void *initdp_av(signed short coeffs[],int len){
	  struct dotprod *dp;
	  int i,j;

	  /* A negative length would turn into a bogus allocation size below */
	  if(len <= 0)
	    return NULL;

	  dp = calloc(1,sizeof *dp);
	  if(dp == NULL)
	    return NULL;
	  dp->len = len;

	  /* Make 8 copies of the coefficients, one for each possible offset of
	   * the input data within a 16-byte block. Copy i starts with i zero
	   * elements; calloc() also zeroes the padding after the last
	   * coefficient.
	   * NOTE(review): the vector loads in dotprod_av() expect each copy to
	   * start on a 16-byte boundary; this relies on calloc() returning
	   * memory with that alignment - confirm for the target platform.
	   */
	  for(i=0;i<8;i++){
	    dp->coeffs[i] = calloc(1+(len+i-1)/8,sizeof(vector signed short));
	    if(dp->coeffs[i] == NULL){
	      /* Out of memory: release the copies made so far and give up */
	      while(i-- > 0)
	        free(dp->coeffs[i]);
	      free(dp);
	      return NULL;
	    }
	    for(j=0;j<len;j++)
	      dp->coeffs[i][j+i] = coeffs[j];
	  }
	  return dp;
	}


	/* Free a dot product descriptor created earlier */
	void freedp_av(void *p){
	struct dotprod dp = (struct dotprod )p;
	int i;

	for(i=0;i<8;i++)
	if(dp->coeffs[i] != NULL)
	free(dp->coeffs[i]);
	free(dp);
	}

	/* Compute a dot product given a descriptor and an input array
	* The length is taken from the descriptor
	*/
	long dotprod_av(void *p,signed short a[]){
	struct dotprod dp = (struct dotprod )p;
	int al;
	vector signed short ar,d;
	vector signed int sums0,sums1,sums2,sums3;
	union { vector signed int v; signed int w[4];} s;
	int nblocks;

	/* round ar down to beginning of 16-byte block containing 0th element of
	* input buffer. Then set d to one of 8 sets of shifted coefficients
	*/
	ar = (vector signed short *)((int)a & ~15);
	al = ((int)a & 15)/sizeof(signed short);
	d = (vector signed short *)dp->coeffs[al];

	nblocks = (dp->len+al-1)/8+1;

	/* Sum into four vectors each holding four 32-bit partial sums */
	sums3 = sums2 = sums1 = sums0 = (vector signed int)(0);
	while(nblocks >= 4){
	sums0 = vec_msums(ar[nblocks-1],d[nblocks-1],sums0);
	sums1 = vec_msums(ar[nblocks-2],d[nblocks-2],sums1);
	sums2 = vec_msums(ar[nblocks-3],d[nblocks-3],sums2);
	sums3 = vec_msums(ar[nblocks-4],d[nblocks-4],sums3);
	nblocks -= 4;
	}
	sums0 = vec_adds(sums0,sums1);
	sums2 = vec_adds(sums2,sums3);
	sums0 = vec_adds(sums0,sums2);
	while(nblocks-- > 0){
	sums0 = vec_msums(ar[nblocks],d[nblocks],sums0);
	}
	/* Sum 4 partial sums into final result */
	s.v = vec_sums(sums0,(vector signed int)(0));

	return s.w[3];
	}