/* Source-control identification string for this file; left out when the
   preprocessor symbol "lint" is defined. */
#ifndef lint
static char SCCSid[] = "@(#) ./sparse/fblock/fbmult.c 07/23/93";
#endif

#include "tools.h"
#include "sparse/spmat.h"
#include "sparse/sppriv.h"
#include "sparse/fblock/spfbpriv.h"
#include "inline/blas2.h"
#include "inline/copy.h"

/*
   Use inline versions of the dense block kernels.

   Whatever DMV/DVPMV definitions the headers supplied are discarded and
   the names are re-pointed at inline implementations: the "2a" variants
   when building for intelnx, the plain inline variants everywhere else.
 */
#undef DMV
#undef DVPMV
#if defined(intelnx)
#define DMV(a,b,nr,nc,c)   DMV2aIL(a,b,nr,nc,c)
#define DVPMV(a,b,nr,nc,c) DVPMV2aIL(a,b,nr,nc,c)
#else
#define DMV(a,b,nr,nc,c)   DMVIL(a,b,nr,nc,c)
#define DVPMV(a,b,nr,nc,c) DVPMVIL(a,b,nr,nc,c)
#endif

/* 
   Small blocksizes are not useful, as the overhead is too high.
   Try for at least a blocksize of 8 
 */

/*
    SpFBMult - Matrix - vector product for a matrix held as rows of dense
    bsize x bsize blocks.  This assumes that pointers to the blocks are
    stored, rather than the blocks themselves.

    Input Parameters:
.   mat  - the matrix; matrices with a mapping (mat->map set) are rejected
           with an error and vout is left untouched
.   vin  - the vector to multiply

    Output Parameter:
.   vout - the product; bsize consecutive entries are produced for each of
           the R->nfbvecs block rows

    Notes:
    For block number i of a row, the values start at row->v + i*bsize*bsize
    and the offset into vin is taken from row->i[i*bsize].
    A block row that holds no blocks (nz == 0) yields bsize zeros.
 */    
void SpFBMult( mat, vin, vout )
SpMat  *mat;
double *vin, *vout;
{
int       n;
SpFBVec    **rs, *row;
double    *xv;
int       bsize, b2, k, off, nz, *xi, i;
SpFBRowMat *R;

SPLITTOMAT(mat);
if (mat->map) {
    /* Error, not supported */
    SETERRC(1,"SpFBMult does not support mapped matrices"); return;
    }

R      = GETBROWMAT(mat);
bsize  = R->bsize;
b2     = bsize * bsize;
n      = R->nfbvecs;
rs     = R->rs;
while (n--) {
    row  = *rs++;
    xv   = row->v;
    xi   = row->i;
    nz   = row->nz;
    if (nz > 0) {
        /* First block starts the result (presumably DMV overwrites vout
           while DVPMV adds to it - confirm against inline/blas2.h) */
        DMV( vout, xv, bsize, bsize, vin+xi[0] );
        }
    else {
        /* Empty block row: do not touch xi[0] or the block values, which
           need not exist; the product for this row is zero */
        for (i=0; i<bsize; i++) vout[i] = 0.0;
        }
    /* Remaining blocks: k walks the index array, off walks the values */
    k    = bsize;
    off  = b2;
    for (i=1; i<nz; i++) {
        DVPMV( vout, xv + off, bsize, bsize, vin+xi[k] );
        k   += bsize;
        off += b2;
        }
    vout += bsize;
    }
return;
}

/* 
   SpFBMultContig - A version of SpFBMult for contiguous storage of the 
   rowblocks.

   Input Parameters:
.  mat  - the matrix; matrices with a mapping (mat->map set) are rejected
          with an error and vout is left untouched
.  vin  - the vector to multiply

   Output Parameter:
.  vout - the product; bsize consecutive entries are produced for each of
          the R->nfbvecs block rows

   Notes:
   The pieces of vin that a block row needs are gathered, up to 
   MAX_N_BLOCKS blocks at a time, into a contiguous work vector so that
   each group of blocks can be applied with a single bsize x nc kernel call.
   This treats consecutive blocks of a row as one dense bsize x nc matrix
   (NOTE(review): that requires the kernels to use column-major storage -
   confirm against inline/blas2.h).

   The work vector is allocated on entry and freed on exit; if it cannot
   be allocated an error is set and vout is left untouched.
   A block row that holds no blocks (nz == 0) yields bsize zeros.
 */
void SpFBMultContig( mat, vin, vout )
SpMat  *mat;
double *vin, *vout;
{
int       n;
SpFBVec    **rs, *row;
double    *vitmp, *xv;
int       bsize, b2, off, nnz, j, nz, *xi, i, nc;
SpFBRowMat *R;

SPLITTOMAT(mat);
if (mat->map) {
    /* Error, not supported */
    SETERRC(1,"SpFBMultContig does not support mapped matrices"); return;
    }

R      = GETBROWMAT(mat);
bsize  = R->bsize;
b2     = bsize * bsize;
n      = R->nfbvecs;
rs     = R->rs;
/* 
   If the blocksize is a power of 2, then the shifts can be 
   computed on the fly more cheaply.  Alternately, we could
   precompute them and store them in the structure.
 */   
#define MAX_N_BLOCKS 10
vitmp = (double *)MALLOC( bsize * MAX_N_BLOCKS * sizeof(double) );
if (!vitmp) {
    /* Without the work vector nothing can be gathered */
    SETERRC(1,"SpFBMultContig could not allocate work space"); return;
    }
while (n--) {
    row  = *rs++;
    xv   = row->v;
    xi   = row->i;
    nz   = row->nz;
    if (nz <= 0) {
        /* Empty block row: the loop below would never write vout */
        for (j=0; j<bsize; j++) vout[j] = 0.0;
        }
    /* off is the position in xv of the first block of the current group */
    off  = 0;
    for (i=0; i<nz; i+= MAX_N_BLOCKS) {
        /* This group covers blocks i .. nnz-1 */
        nnz = i + MAX_N_BLOCKS;
        if (nnz > nz) nnz = nz;
        /* Copy the blocks of vin[xi] (at most MAX_N_BLOCKS of them) to 
           vitmp */
        /* Could even test to see if this block is contiguous in xi ... */
        for (j=i; j<nnz; j++) 
            COPY(vitmp+(j-i)*bsize,vin+xi[j*bsize],bsize);
        nc = (nnz-i) * bsize;
        if (i == 0) {
            DMV( vout, xv, bsize, nc, vitmp );
            }
        else {
            DVPMV( vout, xv + off, bsize, nc, vitmp );
            }
        /* Step over every block just used (nnz-i blocks of b2 values
           each), not over a single block */
        off += (nnz-i) * b2;
        }
    vout += bsize;
    }
FREE( vitmp );
return;
}

