#define NBITS 30	
!	Assembly versions of selected  routines
!
!			bigp_siever	(mpqs.c)
!			first_ge	(mpqs.c)
!			lowp_siever	(mpqs.c)
!			medp_siever	(mpqs.c)
!			sieveinit	(mpqs.c)
!
!	Use -DCONTRIBUTION_8 when compiling mpqs.c.
!
!	If this file is named "foo.s", then assemble with
!	"as -P -DFPU foo.s -o foo.o" if you have a floating-point
!	coprocessor, and with "as -P foo.s -o foo.o" if you do not.
!
!	Contributed by
!
!			Peter L. Montgomery
!			Department of Mathematics
!			University of California
!			Los Angeles, CA 90024
!		
!			pmontgom@math.ucla.edu
!			June, 1990
!^L
#include <sys/asm_linkage.h>
#define ASSEM_TRAPS 1
		! ASSEM_TRAPS = 0 to omit execution-time checks
		! ASSEM_TRAPS = 1 for minimal checking
		! ASSEM_TRAPS = 2 for extensive checking

!
!		bigp_siever(ibeg, iend, rsl, logp)
!
!		rootsi = &roots[ibeg];
!		end_primbase = &primbase[iend];
!		sieve_rsl = sieve + rsl
!		for (i = ibeg-iend; i <= 0;  i++, rootsi++) {
!		    rootdif = rootsi->r2;
!		    root1 = rootsi->r1 - rsl;
!		    if (root1 < 0) {
!			p = end_primbase[i];
!			sieve_rsl[root1] += logp;
!			root2 = root1 + rootdif
!			if (root2 < 0) {
!			    sieve_rsl[root2] += logp;
!			    rootsi->r1 = root1 + p;
!			} else {
!			    rootsi->r1 = root2;
!			    rootsi->r2 = p - rootdif;
!			}
!		    } else {
!			rootsi->r1 = root1;
!		    }
!		} /* for i */
!
!	bigp_siever(ibeg, iend, rsl, logp)
!
!	Sieve with the factor-base entries ibeg..iend (inclusive), for
!	which each root lands in the current sieve block at most once.
!	The words at the external symbols primbase, roots and sieve are
!	loaded as base pointers.  Entries of roots are 8 bytes (r1 at
!	offset 0, r2 at offset 4), entries of primbase are 4 bytes, and
!	sieve is addressed by bytes.
!
!	For every entry, r1 is reduced by rsl.  If the result is negative
!	the sieve byte at that (negative) offset from sieve+rsl gets logp
!	added; likewise for the second root r1 + r2 - rsl.  The entry is
!	then rewritten so that r1 is again the smaller pending root.
!
!	Uses its own register window (save/restore); calls nothing.
!
!	NOTE(review): on the path through label 8 both sieve bytes are
!	loaded before either is stored.  If r2 could be 0 the two
!	addresses coincide and the byte gains logp once, not twice as in
!	the C outline.  Confirm that r2 is never 0 for these primes.
!
#define ibeg %i0
#define iend %i1
#define rsl  %i2
#define logp %i3
#define I4 %i4
			/* I4 = 4*i, where i runs from ibeg-iend up to 0 */
#define root2 %l0
#define rootdif %l1
		/* root2, rootdif must be even-odd pair for ldd and std */
#define root1 %l2
#define end_primbase %l3
#define sieve_rsl %l4
#define rootsi	%l6
#define temp1  %o0
#define temp2  %o1
#define p      %o4

	.globl	bigp_siever		! bigp_siever(ibeg, iend, rsl, logp)
bigp_siever:
	save	%sp,-WINDOWSIZE,%sp
	subcc	ibeg,iend,I4			! ibeg - iend
	bg	9f				! Exit if ibeg > iend
	sll	I4,2,I4				! I4 = 4*(ibeg - iend) (delay)

	set	primbase,end_primbase
	set	roots,rootsi
	set	sieve,sieve_rsl
	ld	[end_primbase],end_primbase	! Load C pointer values
	ld	[rootsi],rootsi
	ld	[sieve_rsl],sieve_rsl
	sll	iend,2,iend			! iend <- 4*iend
	sll	ibeg,3,ibeg			! ibeg <- 8*ibeg
	add	iend,end_primbase,end_primbase  ! &primbase[iend]
	add	ibeg,rootsi,rootsi		! &roots[ibeg]
	ba	2f				! Enter outer loop
	add	rsl,sieve_rsl,sieve_rsl		! &sieve[rsl] (delay)

8:						! Come here if root2 < 0
						! temp1 = sieve_rsl[root1] + logp,
						! not yet stored
	ldub	[sieve_rsl+root2],temp2
	stb	temp1,[sieve_rsl+root1]		! Update sieve_rsl[root1]
	add	root1,p,root1			! root1 += p
	add	temp2,logp,temp2
	stb	temp2,[sieve_rsl+root2]		! Update sieve_rsl[root2]
						! Fall through to store root1

1:						! Come here if root1 >= 0
	inccc	4,I4				! Increment 4*i, set cc for bg
	st	root1,[rootsi]			! roots[i].r1 = root1
	bg	9f				! Terminate if 4*i > 0
	inc	8,rootsi			! Advance &roots[i] (delay)

2:						! Start of outer loop
	ldd	[rootsi],root2			! root2 = roots[i].r1
						! rootdif = roots[i].r2
#if ASSEM_TRAPS > 1
	/* Expect 0 <= roots[i].r1 <= roots[i].r1 + roots[i].r2 < p >= rsl */
	tst root2;	tl  0			! Is 0 <= roots[i].r1?
	tst rootdif;	tl  0			! Is 0 <= roots[i].r2?
	ld	[end_primbase+I4],p		! p = end_primbase[i]
	add root2,rootdif,temp1
	cmp temp1,p;    tge 0		! Is roots[i].r1 + roots[i].r2 < p?
	cmp p,rsl;	tl  0			! Is p >= rsl?
#endif
	subcc	root2,rsl,root1			! root1 = roots[i].r1 - rsl
	bge	1b				! Branch if root1 >= 0
	addcc	root1,rootdif,root2	   ! root2 = root1 + rootdif (delay)
						! cc now reflects root2

	ldub	[sieve_rsl+root1],temp1		! Loads leave cc untouched
	ld	[end_primbase+I4],p		! p = end_primbase[i]
	bl	8b				! Branch if root2 < 0
	add	temp1,logp,temp1		! (delay)

						! Here root1 < 0 <= root2
	sub	p,rootdif,rootdif		! p - rootdif
	stb	temp1,[sieve_rsl+root1]		! Update sieve_rsl[root1]
	inccc	4,I4				! Increment 4*i
	std	root2,[rootsi]			! roots[i].r1 = root2
						! roots[i].r2 = p - rootdif
	ble	2b				! Continue if 4*i <= 0
	inc	8,rootsi			! Advance &roots[i] (delay)

9:
	ret					! Exit
	restore					! (delay)

#undef ibeg
#undef iend
#undef rsl
#undef I4
#undef logp
#undef root2
#undef rootdif
#undef root1
#undef end_primbase
#undef sieve_rsl
#undef rootsi
#undef temp1
#undef temp2
#undef p
!
!		lowp_siever(ibeg, iend, rsl, logp)
!
!		rootsi = &roots[ibeg];
!		end_primbase = &primbase[iend];
!		sieve_rsl = sieve + rsl
!		for (i = ibeg-iend; i <= 0;  i++, rootsi++) {
!		    p = end_primbase[i];
!		    rootdif = rootsi->r2;
!		    root2 = rootsi->r1 - rsl + rootdif;
!		    sieve1 = sieve_rsl - rootdif;
!		    root2p = root2 + p
!		    do {
!			sieve1   [root2 ] += logp;
!			sieve_rsl[root2 ] += logp;
!			sieve1   [root2p] += logp;
!			sieve_rsl[root2p] += logp;
!			root2  += 2*p;
!			root2p += 2*p
!		    } while (root2p < 0);
!		    if (root2 < 0) {
!			sieve1   [root2] += logp;
!			sieve_rsl[root2] += logp;
!			root2 += p;
!		    }
!		    root1 = root2 - rootdif;
!		    if (root1 < 0) {
!			sieve_rsl[root1] += logp;
!			rootsi->r1 = root2;		/* Smaller new root */
!			rootsi->r2 = p - rootdif;	/* New difference */
!		    } else {
!			rootsi->r1 = root1;
!		    }
!		} /* for i */
!
!	lowp_siever(ibeg, iend, rsl, logp)
!
!	Sieve with the factor-base entries ibeg..iend (inclusive), for
!	which each root lands in the current sieve block several times.
!	The words at the external symbols primbase, roots and sieve are
!	loaded as base pointers.  Entries of roots are 8 bytes (r1 at
!	offset 0, r2 at offset 4), entries of primbase are 4 bytes, and
!	sieve is addressed by bytes.
!
!	Both roots are stepped together: sieve1 = sieve_rsl - rootdif, so
!	sieve1[x] and sieve_rsl[x] are the two roots for one offset x.
!	The inner loop is unrolled twice (offsets root2 and root2 + p)
!	and always runs at least once.
!
!	Uses its own register window (save/restore); calls nothing.
!
!	NOTE(review): the inner loop loads all four sieve bytes before
!	storing any of them, which is only equivalent to the C outline
!	when the four addresses differ (0 < r2 < p).  Confirm that r2 is
!	never 0 for these primes.
!	NOTE(review): the first pass of the inner loop is unconditional,
!	so the caller presumably guarantees r1 + r2 - rsl + p < 0.
!
#define ibeg %i0
#define iend %i1
#define rsl  %i2
#define logp %i3
#define I4 %i4
			/* I4 = 4*i, where i runs from ibeg-iend up to 0 */
#define root2p %i5
#define root2 %l0
#define rootdif %l1
		/* root2, rootdif must be even-odd pair for ldd and std */
#define root1 %l2
#define end_primbase %l3
#define sieve_rsl %l4
#define sieve1	%l5
#define rootsi	%l6
#define twicep  %l7
#define temp1  %o0
#define temp2  %o1
#define temp3  %o2
#define temp4  %o3
#define p      %o4

	.globl	lowp_siever		! lowp_siever(ibeg, iend, rsl, logp)
lowp_siever:
	save	%sp,-WINDOWSIZE,%sp
	subcc	ibeg,iend,I4			! ibeg - iend
	bg	9f				! Exit if ibeg > iend
	sll	I4,2,I4				! I4 = 4*(ibeg - iend) (delay)

	set	primbase,end_primbase
	set	roots,rootsi
	set	sieve,sieve_rsl
	ld	[end_primbase],end_primbase	! Load C pointer values
	ld	[rootsi],rootsi
	ld	[sieve_rsl],sieve_rsl
	sll	iend,2,iend			! iend <- 4*iend
	sll	ibeg,3,ibeg			! ibeg <- 8*ibeg
	add	iend,end_primbase,end_primbase  ! &primbase[iend]
	add	ibeg,rootsi,rootsi		! &roots[ibeg]
	ba	2f				! Enter outer loop
	add	rsl,sieve_rsl,sieve_rsl		! &sieve[rsl] (delay)

1:						! Come here if root1 >= 0
						! cc already reflects the new 4*i
	st	root1,[rootsi]			! roots[i].r1 = root1
	bg	9f				! Terminate if 4*i > 0
	inc	8,rootsi			! Advance &roots[i] (delay)

2:						! Start of outer loop
	ldd	[rootsi],root2			! root2 = roots[i].r1
						! rootdif = roots[i].r2
	ld	[end_primbase+I4],p		! p = end_primbase[i]
#if ASSEM_TRAPS > 1
	/* Expect 0 <= roots[i].r1 <= roots[i].r1 + roots[i].r2 < p <= rsl */
	tst root2;	tl  0			! Is 0 <= roots[i].r1?
	tst rootdif;	tl  0			! Is 0 <= roots[i].r2?
	add root2,rootdif,temp1
	cmp temp1,p;    tge 0		! Is roots[i].r1 + roots[i].r2 < p?
	cmp p,rsl;	tg  0			! Is p <= rsl?
#endif
	sub	root2,rsl,root2			! root2 = roots[i].r1 - rsl
	add	root2,rootdif,root2		!		      + rootdif
	sub	sieve_rsl,rootdif,sieve1	! sieve1 = sieve_rsl - rootdif
	add	root2,p,root2p			! root2p = root2 + p
	add	p,p,twicep			! 2*p

3:						! Inner loop
	ldub	[sieve1+root2],temp1		! Load all four bytes first
	ldub	[sieve_rsl+root2],temp2
	ldub	[sieve1+root2p],temp3
	ldub	[sieve_rsl+root2p],temp4
	add	temp1,logp,temp1		! Add logp to each value
	add	temp2,logp,temp2
	stb	temp1,[sieve1+root2]
	stb	temp2,[sieve_rsl+root2]
	add	temp3,logp,temp3
	add	temp4,logp,temp4
	stb	temp3,[sieve1+root2p]
	stb	temp4,[sieve_rsl+root2p]
	addcc	twicep,root2p,root2p		! root2p += 2*p
	blt	3b				! Loop while root2p < 0
	addcc	twicep,root2,root2		! root2 += 2*p (delay)
						! cc now reflects root2

	bge,a	4f				! Branch if root2 >= 0
	subcc	root2,rootdif,root1		! root1 = smaller root
						! (delay, annulled if not taken)

						! root2 < 0: one more hit per root
	ldub	[sieve1+root2],temp1
	ldub	[sieve_rsl+root2],temp2
	add	temp1,logp,temp1
	add	temp2,logp,temp2
	stb	temp1,[sieve1+root2]
	stb	temp2,[sieve_rsl+root2]
	subcc	root2p,rootdif,root1		! root1 = smaller root
						! (root2p = root2 + p here)
	add	p,root2,root2			! root2 += p, cc unchanged
4:
	bge	1b				! Branch if root1 >= 0
	inccc	4,I4				! Increment 4*i (delay)

						! root1 < 0 <= root2
	ldub	[sieve_rsl+root1],temp1
	sub	p,rootdif,rootdif		! p - rootdif
	std	root2,[rootsi]			! roots[i].r1 = root2
						! roots[i].r2 = p - rootdif
	add	temp1,logp,temp1
	stb	temp1,[sieve_rsl+root1]		! Update sieve_rsl[root1]
	ble	2b				! Continue if 4*i <= 0
	inc	8,rootsi			! Advance &roots[i] (delay)

9:
	ret					! Exit
	restore					! (delay)

#undef ibeg
#undef iend
#undef rsl
#undef I4
#undef logp
#undef root2p
#undef root2
#undef rootdif
#undef root1
#undef end_primbase
#undef sieve_rsl
#undef sieve1
#undef rootsi
#undef twicep
#undef temp1
#undef temp2
#undef temp3
#undef temp4
#undef p
!
!		medp_siever(ibeg, iend, rsl, logp)
!
!		rootsi = &roots[ibeg];
!		end_primbase = &primbase[iend];
!		sieve_rsl = sieve + rsl
!		for (i = ibeg-iend; i <= 0;  i++, rootsi++) {
!		    p = end_primbase[i];
!		    rootdif = rootsi->r2;
!		    root1 = rootsi->r1 - rsl;
!		    root2 = root1 + rootdif;
!		    sieve_rsl[root1] += logp;
!		    sieve_rsl[root2] += logp;
!		    root1 += p;
!		    if (root1 < 0) {
!			sieve_rsl[root1] += logp;
!			root2 += p;
!			if (root2 < 0) {
!			    sieve_rsl[root2] += logp;
!			    rootsi->r1 = root1 + p;
!			} else {
!			    rootsi->r1 = root2;
!			    rootsi->r2 = p - rootdif;
!			}
!		    } else {
!			rootsi->r1 = root1;
!		    }
!		} /* for i */
!
!	medp_siever(ibeg, iend, rsl, logp)
!
!	Sieve with the factor-base entries ibeg..iend (inclusive), for
!	which each root lands in the current sieve block once or twice.
!	The words at the external symbols primbase, roots and sieve are
!	loaded as base pointers.  Entries of roots are 8 bytes (r1 at
!	offset 0, r2 at offset 4), entries of primbase are 4 bytes, and
!	sieve is addressed by bytes.
!
!	The first hit of each root (offsets r1 - rsl and r1 + r2 - rsl
!	from sieve+rsl) is applied unconditionally; a second hit, p
!	further on, is applied only while the offset is still negative.
!
!	Uses its own register window (save/restore); calls nothing.
!
!	NOTE(review): each pair of sieve bytes is loaded before either is
!	stored.  If r2 could be 0 the addresses coincide and the byte
!	gains logp once, not twice as in the C outline.  Confirm that r2
!	is never 0 for these primes.
!
#define ibeg %i0
#define iend %i1
#define rsl  %i2
#define logp %i3
#define I4 %i4
			/* I4 = 4*i, where i runs from ibeg-iend up to 0 */
#define root2 %l0
#define rootdif %l1
		/* root2, rootdif must be even-odd pair for ldd and std */
#define root1 %l2
#define end_primbase %l3
#define sieve_rsl %l4
#define rootsi	%l6
#define temp1  %o0
#define temp2  %o1
#define p      %o4

	.globl	medp_siever		! medp_siever(ibeg, iend, rsl, logp)
medp_siever:
	save	%sp,-WINDOWSIZE,%sp
	subcc	ibeg,iend,I4			! ibeg - iend
	bg	9f				! Exit if ibeg > iend
	sll	I4,2,I4				! I4 = 4*(ibeg - iend) (delay)

	set	primbase,end_primbase
	set	roots,rootsi
	set	sieve,sieve_rsl
	ld	[end_primbase],end_primbase	! Load C pointer values
	ld	[rootsi],rootsi
	ld	[sieve_rsl],sieve_rsl
	sll	iend,2,iend			! iend <- 4*iend
	sll	ibeg,3,ibeg			! ibeg <- 8*ibeg
	add	iend,end_primbase,end_primbase  ! &primbase[iend]
	add	ibeg,rootsi,rootsi		! &roots[ibeg]
	ba	2f				! Enter outer loop
	add	rsl,sieve_rsl,sieve_rsl		! &sieve[rsl] (delay)

8:						! Come here if root2 < 0
						! (both roots already advanced by p)
						! temp1 = sieve_rsl[root1] + logp,
						! not yet stored
	ldub	[sieve_rsl+root2],temp2
	stb	temp1,[sieve_rsl+root1]		! Update sieve_rsl[root1]
	add	root1,p,root1			! root1 += p
	add	temp2,logp,temp2
	stb	temp2,[sieve_rsl+root2]		! Update sieve_rsl[root2]
						! Fall through to store root1

1:						! Come here if root1 >= 0
	inccc	4,I4				! Increment 4*i, set cc for bg
	st	root1,[rootsi]			! roots[i].r1 = root1
	bg	9f				! Terminate if 4*i > 0
	inc	8,rootsi			! Advance &roots[i] (delay)

2:						! Start of outer loop
	ldd	[rootsi],root2			! root2 = roots[i].r1
						! rootdif = roots[i].r2
	ld	[end_primbase+I4],p		! p = end_primbase[i]
#if ASSEM_TRAPS > 1
	/* Expect 0 <= roots[i].r1 <= roots[i].r1 + roots[i].r2 < p <= rsl */
	tst root2;	tl  0			! Is 0 <= roots[i].r1?
	tst rootdif;	tl  0			! Is 0 <= roots[i].r2?
	add root2,rootdif,temp1
	cmp temp1,p;    tge 0		! Is roots[i].r1 + roots[i].r2 < p?
	cmp p,rsl;	tg  0			! Is p <= rsl?
#endif
	sub	root2,rsl,root1			! root1 = roots[i].r1 - rsl
	ldub	[sieve_rsl+root1],temp1
	add	root1,rootdif,root2		! root2 = root1 + rootdif
	ldub	[sieve_rsl+root2],temp2
	add	temp1,logp,temp1
	stb	temp1,[sieve_rsl+root1]		! Update sieve_rsl[root1]
	addcc	root1,p,root1			! root1 += p, set cc for bge
	add	temp2,logp,temp2
	bge	1b				! Branch if root1 >= 0
	stb	temp2,[sieve_rsl+root2]	! Update sieve_rsl[root2] (delay)

	ldub	[sieve_rsl+root1],temp1		! Second hit for root1
	addcc	root2,p,root2			! root2 += p
	bl	8b				! Branch if root2 < 0
	add	temp1,logp,temp1		! (delay)

						! Here root1 < 0 <= root2
	sub	p,rootdif,rootdif		! p - rootdif
	stb	temp1,[sieve_rsl+root1]		! Update sieve_rsl[root1]
	inccc	4,I4				! Increment 4*i
	std	root2,[rootsi]			! roots[i].r1 = root2
						! roots[i].r2 = p - rootdif
	ble	2b				! Continue if 4*i <= 0
	inc	8,rootsi			! Advance &roots[i] (delay)

9:
	ret					! Exit
	restore					! (delay)

#undef ibeg
#undef iend
#undef rsl
#undef I4
#undef logp
#undef root2
#undef rootdif
#undef root1
#undef end_primbase
#undef sieve_rsl
#undef rootsi
#undef temp1
#undef temp2
#undef p
!
!
!		first_ge(array, start, bound)	(leaf)
!		(see C code)
!
!	first_ge(array, start, bound)		(leaf)
!
!	Scan the bytes of array upward from index start and return, in
!	%o0, the index of the first byte whose unsigned value is at least
!	(bound & 255).  There is no upper limit on the scan; the caller
!	presumably guarantees that such a byte exists (verify).
!
!	Whenever the scan address is a multiple of 16, sixteen bytes are
!	screened at once: each word is subtracted from lbound, which holds
!	(bound-1) replicated into all four bytes, and bit 7 of every byte
!	of the difference is tested.  If no bit is set the sixteen bytes
!	are skipped; otherwise they are rescanned one byte at a time, so
!	a false alarm caused by a borrow costs time only.
!
!	NOTE(review): the screen can miss a byte that exceeds bound-1 by
!	more than 128, because bit 7 of that difference is then clear.
!	Presumably sieve values never get that large; confirm.
!
!	Register use: only %o0-%o5 and %g1-%g4.  %g5-%g7 are avoided
!	because the 32-bit SPARC ABI reserves them for system software.
!
#define array %o0
#define start %o1
#define bound %o2
#define parray %o3
#define test3  %o4
#define test4  %o5
#define lbound %g1
#define test1  %g2
#define test2  %g3
#define x80808080 %g4
		/* test1-test2 and test3-test4 must each be an even-odd */
		/* register pair, as required by ldd */

	.globl	first_ge
first_ge:				! first_ge(array, start, bound)
	and	bound,255,bound
	sub	bound,1,lbound		! bound-1
	sll	lbound,8,test1
	or	test1,lbound,lbound	! 257*(bound-1)
	sll	lbound,16,test2
	or	test2,lbound,lbound	! lbound = 65537*257*(bound - 1)
					!	 = 0x01010101 * (bound-1)
	add	array,start,parray
	set	0x80808080,x80808080
1:
	andcc	parray,15,test1
	be	3f			! Branch if multiple of 16
	.empty				! Delay slot is the ldub at label 2;
					! its result is discarded when the
					! branch is taken

2:					! Byte search mode
	ldub	[parray],test1
	cmp	test1,bound
	blt,a	1b			! Branch if value too small
	inc	parray			! Increment address
					! (delay, annulled if not taken)

	retl
	sub	parray,array,%o0  ! Function value = parray - array (delay)

3:					! Word search mode, parray % 16 = 0
	ldd	[parray],test1		! Load test1-test4
	ldd	[parray+8],test3
	sub	lbound,test1,test1
	sub	lbound,test2,test2
	sub	lbound,test3,test3
	sub	lbound,test4,test4
	or	test2,test1,test1
	or	test4,test3,test3
	or	test3,test1,test1
	andcc	test1,x80808080,test1
	be,a	3b			! Branch if nothing good found
	inc	16,parray		! and increment address
					! (delay, annulled if not taken)

	ba,a	2b			! Return to byte mode

#undef array
#undef start
#undef bound
#undef parray
#undef x80808080
#undef lbound
#undef test1
#undef test2
#undef test3
#undef test4

!
!
!		sieveinit(lng, value)			(leaf)
/*
 *	sieveinit(lng, value)			(leaf)
 *
 *	Fill memory, starting at the address held in the word at the
 *	external symbol sieve, with copies of the 32-bit word value.
 *	All stores are 8-byte std stores of the pair dvalue1/dvalue2,
 *	both of which are copies of value.
 *
 *	lng is a byte count.  Full 128-byte chunks are written by the
 *	unrolled loop at label 1; the rest is written 8 bytes at a time
 *	at label 3 while the remaining count is positive.  When lng is
 *	not a multiple of 8 the last store therefore runs up to 7 bytes
 *	beyond lng.  Nothing is written when lng <= 0.
 *
 *	NOTE(review): std needs an 8-byte aligned address, so this
 *	assumes the sieve buffer is so aligned and padded; confirm.
 *	NOTE(review): presumably the caller replicates the fill byte
 *	into all four bytes of value; confirm against mpqs.c.
 */
#define lng %o0
#define value %o1
#define dvalue1 %o2
#define dvalue2 %o3
	/* dvalue1 - dvalue2 must be even-odd pair */
#define nxtadd	%o4

	.globl	sieveinit
sieveinit:				! sieveinit(lng, value)
	mov	value,dvalue1
	mov	value,dvalue2
	subcc	lng,128,lng		! Decrement by 128 bytes
	set	sieve,nxtadd
	bl	2f			! If fewer than 128 to do
	ld	[nxtadd],nxtadd		! nxtadd = sieve (delay)
1:
	/* 128 bytes per pass; the subcc in the middle sets the condition */
	/* codes for the bge below (std leaves them alone) */
	std	dvalue1,[nxtadd+  0];	std	dvalue1,[nxtadd+  8]
	std	dvalue1,[nxtadd+ 16];	std	dvalue1,[nxtadd+ 24]
	std	dvalue1,[nxtadd+ 32];	std	dvalue1,[nxtadd+ 40]
	std	dvalue1,[nxtadd+ 48];	std	dvalue1,[nxtadd+ 56]
	subcc	lng,128,lng
	std	dvalue1,[nxtadd+ 64];	std	dvalue1,[nxtadd+ 72]
	std	dvalue1,[nxtadd+ 80];	std	dvalue1,[nxtadd+ 88]
	std	dvalue1,[nxtadd+ 96];	std	dvalue1,[nxtadd+104]
	std	dvalue1,[nxtadd+112];	std	dvalue1,[nxtadd+120]
	bge	1b			! if at least 128 to go
	add	nxtadd,128,nxtadd	! (delay)
2:	
	addcc	lng,128,lng		! Restore remaining byte count
	ble	4f			! Exit if nothing to clear
	.empty
	/* The delay slot of the ble above is the subcc at label 3. */
	/* When the branch is taken that subcc only changes lng, */
	/* which is not used again before the return. */

3:					! Finish eight bytes at a time
	subcc	lng,8,lng
	std	dvalue1,[nxtadd]
	bg	3b			! If more bytes to go
	add	nxtadd,8,nxtadd		! (delay)
4:
	retl				! Return from leaf
	nop
#undef lng
#undef value
#undef dvalue1
#undef dvalue2
#undef nxtadd

