// file kernel/n/alpha/karatsuba.S: Karatsuba multiplication
/*-----------------------------------------------------------------------+
 |  Copyright 2005-2006, Michel Quercia (michel.quercia@prepas.org)      |
 |                                                                       |
 |  This file is part of Numerix. Numerix is free software; you can      |
 |  redistribute it and/or modify it under the terms of the GNU Lesser   |
 |  General Public License as published by the Free Software Foundation; |
 |  either version 2.1 of the License, or (at your option) any later     |
 |  version.                                                             |
 |                                                                       |
 |  The Numerix Library is distributed in the hope that it will be       |
 |  useful, but WITHOUT ANY WARRANTY; without even the implied warranty  |
 |  of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU  |
 |  Lesser General Public License for more details.                      |
 |                                                                       |
 |  You should have received a copy of the GNU Lesser General Public     |
 |  License along with the GNU MP Library; see the file COPYING. If not, |
 |  write to the Free Software Foundation, Inc., 59 Temple Place -       |
 |  Suite 330, Boston, MA 02111-1307, USA.                               |
 +-----------------------------------------------------------------------+
 |                                                                       |
 |                     Multiplication de Karatsuba                       |
 |                                                                       |
 +-----------------------------------------------------------------------*/

                            # +------------------+
                            # |  Multiplication  |
                            # +------------------+
        

   #  void xn(karamul)(chiffre *a, long la, chiffre *b, long lb, chiffre *c)
   #
   #  input:
   #  a = natural number of length la
   #  b = natural number of length lb
   #  c = natural number of length la+lb, not overlapping a or b
   #  constraints: 0 < lb <= la
   #
   #  output:
   #  c <- a*b

#ifdef assembly_sn_karamul
#define L(x) .Lsn_karamul_##x

        # Register arguments on entry, as used below:
        #   r16 = a, r17 = la, r18 = b, r19 = lb, r20 = c
        # Strategy:
        #   lb <= karamul_lim     => branch to the quadratic routine
        #   lb >  ceil(la/2)      => Karatsuba split (three recursive products)
        #   lb <= ceil(la/2)      => a is cut into slices of lb digits
        .align 5
#ifdef debug_karamul
        .globl sn_karamul_buggy
        .ent   sn_karamul_buggy
sn_karamul_buggy:
        .frame $30,0,$26,0
        .prologue 1
	ldgp   $gp,  0($27)
#else
        .globl sn_karamul
        .ent   sn_karamul
sn_karamul:
        .frame $30,0,$26,0
        .prologue 1
	ldgp   $gp,  0($27)
L(nogp):
#endif

	cmpule $19,  karamul_lim, $0 # small multiplication ?
	bne    $0,   .Lsn_mul_n2_nogp # => quadratic (n^2) algorithm
	addq   $17,  1,    $0   # r0 <- p = ceil(la/2)
	srl    $0,   1,    $0
	subq   $19,  $0,   $1   # r1 <- r = lb - p
	ble    $1,   L(tranches)# lb <= p: cut a into slices

	# here lb > ceil(la/2): Karatsuba split
	#   a = a0 + B^p*a1 with a0 of p digits and a1 of q digits
	#   b = b0 + B^p*b1 with b0 of p digits and b1 of r digits
	# frame layout (offsets from r30): ra, x, r, q, p, c, b, a in the
	# first 8 quadwords, followed by d, a buffer of 2p digits
	# local variables
	#define _d_  64($30)
        #define _a_  56($30)
        #define _b_  48($30)
        #define _c_  40($30)
	#define _p_  32($30)
	#define _q_  24($30)
	#define _r_  16($30)
	#define _x_   8($30)
	#define _ra_  0($30)

	subq   $17,  $0,   $2 	# r2 <- q = floor(la/2)
	sll    $0,   4,    $3   # reserve 2p+8 digits on the stack
	addq   $3,   64,   $3
	subq   $30,  $3,   $30
	stq    $16,  _a_        # save the parameters
	stq    $18,  _b_
	stq    $20,  _c_
	stq    $0,   _p_
	stq    $2,   _q_
	stq    $1,   _r_
	stq    $26,  _ra_

	# compute |a0-a1| and |b0-b1| into c
	bis    $0,   $0,   $17  # r17 <- p
	s8addq $0,   $16,  $18	# r18 <- &a1
	bis    $2,   $2,   $19  # r19 <- q
	bsr    $26,  .Lsn_asub_nogp # c[0..p-1] <- |a0-a1|
	stq    $0,   _x_        # x <- sgn(a0-a1)
	
	ldq    $16,  _b_
	ldq    $17,  _p_
	s8addq $17,  $16,  $18  # r18 <- &b1
	ldq    $19,  _r_
	ldq    $20,  _c_
	s8addq $17,  $20,  $20	# r20 <- &c[p]
	bsr    $26,  .Lsn_asub_nogp # c[p..2p-1] <- |b0-b1|
	ldq    $1,   _x_
	xor    $0,   $1,   $0
	stq    $0,   _x_        # x <- sgn((a0-a1)*(b0-b1))
	
	# compute |a0-a1|*|b0-b1|, a0*b0, a1*b1
	ldq    $16,  _c_
	ldq    $17,  _p_
	s8addq $17,  $16,  $18	# r18 <- &c[p]
	bis    $17,  $17,  $19  # r19 <- p
	lda    $20,  _d_
	bsr    $26,  L(nogp)    # d <- |a0-a1|*|b0-b1|
	
	ldq    $16,  _a_	# r16 <- &a0
	ldq    $17,  _p_
	ldq    $18,  _b_	# r18 <- &b0
	bis    $17,  $17,  $19  # r19 <- p
	ldq    $20,  _c_
	bsr    $26,  L(nogp)    # c[0..2p-1] <- a0*b0
	
	ldq    $17,  _p_
	ldq    $16,  _a_
	ldq    $18,  _b_
	ldq    $20,  _c_
	s8addq $17,  $16,  $16  # r16 <- &a1
	s8addq $17,  $18,  $18  # r18 <- &b1
	s8addq $17,  $20,  $20  # r20 <- &c[2p]
	s8addq $17,  $20,  $20
	ldq    $17,  _q_
	ldq    $19,  _r_
	bsr    $26,  L(nogp)    # c[2p..2p+q+r-1] <- a1*b1

        # landing point for karasqr, which branches here with the same
        # frame layout (a, b, c, p, q, r, x, ra and d filled in)
.Lsn_kara_aux:

	# prepare the cross addition/subtraction
	#   c is viewed as c0:c1:c2:c3, where c0, c1, c2 have p digits
	#   and c3 has q+r-p digits; d is viewed as d0:d1, p digits each
	ldq    $22,  _p_
	ldq    $23,  _q_
	ldq    $24,  _r_
	addq   $23,  $24,  $23
	subq   $23,  $22,  $23  # r23 <- q+r-p  (digits handled with c3)
	subq   $22,  $23,  $24  # r24 <- 2p-q-r (digits handled without c3)
	ldq    $16,  _c_	# r16 <- &c0
	s8addq $22,  $16,  $17 	# r17 <- &c1
	s8addq $22,  $17,  $18  # r18 <- &c2
	s8addq $22,  $18,  $19  # r19 <- &c3
	lda    $20,  _d_        # r20 <- &d0
	s8addq $22,  $20,  $21  # r21 <- &d1
	bis    $31,  $31,  $0   # r0 <- 0 (carry for c1)
	bis    $31,  $31,  $1   # r1 <- 0 (carry for c2)
	ldq    $4,   _x_
	bne    $4,   L(cross_add) # x != 0: the product d must be added

	# c1:c2 <- (c0+c1+c2-d0):(c1+c2+c3-d1)
	# r0 and r1 are signed multi-valued carries; sra by 2 extracts
	# their sign (-1 or 0) once the low part has been added in
	beq    $23,  2f
	.align 5
1:
	ldq    $3,   0($16)	# r3 <- c0[i]
	ldq    $4,   0($17)	# r4 <- c1[i]
	ldq    $5,   0($18)	# r5 <- c2[i]
	ldq    $6,   0($19)	# r6 <- c3[i]
	ldq    $7,   0($20)	# r7 <- d0[i]
	ldq    $8,   0($21)	# r8 <- d1[i]

	addq   $4,   $5,   $4	# r4 <- c1[i] + c2[i]
	cmpult $4,   $5,   $5   # r5 <- carry
	
	addq   $3,   $0,   $3	# r3 <- c0[i] + carry
	cmpult $3,   $0,   $2
	sra    $0,   2,    $0
	addq   $0,   $2,   $0
	addq   $0,   $5,   $0
	subq   $3,   $7,   $7	# r7 <- c0[i] - d0[i] + carry
	cmpult $3,   $7,   $2   # r2 <- borrow
	subq   $0,   $2,   $0
	addq   $4,   $7,   $7	# r7 <- c0[i] + c1[i] + c2[i] - d0[i] + carry
	cmpult $7,   $4,   $2
	addq   $0,   $2,   $0

	addq   $6,   $1,   $6	# r6 <- c3[i] + carry
	cmpult $6,   $1,   $2
	sra    $1,   2,    $1
	addq   $1,   $2,   $1
	addq   $1,   $5,   $1
	subq   $6,   $8,   $8	# r8 <- c3[i] - d1[i] + carry
	cmpult $6,   $8,   $2   # r2 <- borrow
	subq   $1,   $2,   $1
	addq   $4,   $8,   $8	# r8 <- c1[i] + c2[i] + c3[i] - d1[i] + carry
	cmpult $8,   $4,   $2
	addq   $1,   $2,   $1

	stq    $7,   0($17)     # store c1[i]
	stq    $8,   0($18)     # store c2[i]
	subq   $23,  1,    $23  # next digits
	addq   $16,  8,    $16
	addq   $17,  8,    $17
	addq   $18,  8,    $18
	addq   $19,  8,    $19
	addq   $20,  8,    $20
	addq   $21,  8,    $21
	bne    $23,  1b

	# continue without c3
	beq    $24,  L(cross_done)
	.align 5
2:
	ldq    $3,   0($16)	# r3 <- c0[i]
	ldq    $4,   0($17)	# r4 <- c1[i]
	ldq    $5,   0($18)	# r5 <- c2[i]
	ldq    $7,   0($20)	# r7 <- d0[i]
	ldq    $8,   0($21)	# r8 <- d1[i]

	addq   $4,   $5,   $4	# r4 <- c1[i] + c2[i]
	cmpult $4,   $5,   $5   # r5 <- carry
	
	addq   $3,   $0,   $3	# r3 <- c0[i] + carry
	cmpult $3,   $0,   $2
	sra    $0,   2,    $0
	addq   $0,   $2,   $0
	addq   $0,   $5,   $0
	subq   $3,   $7,   $7	# r7 <- c0[i] - d0[i] + carry
	cmpult $3,   $7,   $2   # r2 <- borrow
	subq   $0,   $2,   $0
	addq   $4,   $7,   $7	# r7 <- c0[i] + c1[i] + c2[i] - d0[i] + carry
	cmpult $7,   $4,   $2
	addq   $0,   $2,   $0

	addq   $4,   $1,   $4	# r4 <- c1[i] + c2[i] + carry
	cmpult $4,   $1,   $2
	sra    $1,   2,    $1
	addq   $1,   $2,   $1
	addq   $1,   $5,   $1
	subq   $4,   $8,   $8	# r8 <- c1[i] + c2[i] - d1[i] + carry
	cmpult $4,   $8,   $2   # r2 <- borrow
	subq   $1,   $2,   $1

	stq    $7,   0($17)     # store c1[i]
	stq    $8,   0($18)     # store c2[i]
	subq   $24,  1,    $24  # next digits
	addq   $16,  8,    $16
	addq   $17,  8,    $17
	addq   $18,  8,    $18
	addq   $20,  8,    $20
	addq   $21,  8,    $21
	bne    $24,  2b
	br     $31,  L(cross_done)

	# c1:c2 <- (c0+c1+c2+d0):(c1+c2+c3+d1)
	.align 5
L(cross_add):
	beq    $23,  2f
	.align 5
1:
	ldq    $3,   0($16)	# r3 <- c0[i]
	ldq    $4,   0($17)	# r4 <- c1[i]
	ldq    $5,   0($18)	# r5 <- c2[i]
	ldq    $6,   0($19)	# r6 <- c3[i]
	ldq    $7,   0($20)	# r7 <- d0[i]
	ldq    $8,   0($21)	# r8 <- d1[i]

	addq   $4,   $5,   $4	# r4 <- c1[i] + c2[i]
	cmpult $4,   $5,   $5   # r5 <- carry
	
	addq   $3,   $0,   $3	# r3 <- c0[i] + carry
	cmpult $3,   $0,   $2
	sra    $0,   2,    $0
	addq   $0,   $2,   $0
	addq   $0,   $5,   $0
	addq   $3,   $7,   $7	# r7 <- c0[i] + d0[i] + carry
	cmpult $7,   $3,   $2
	addq   $0,   $2,   $0
	addq   $4,   $7,   $7	# r7 <- c0[i] + c1[i] + c2[i] + d0[i] + carry
	cmpult $7,   $4,   $2
	addq   $0,   $2,   $0

	addq   $6,   $1,   $6	# r6 <- c3[i] + carry
	cmpult $6,   $1,   $2
	sra    $1,   2,    $1
	addq   $1,   $2,   $1
	addq   $1,   $5,   $1
	addq   $6,   $8,   $8	# r8 <- c3[i] + d1[i] + carry
	cmpult $8,   $6,   $2
	addq   $1,   $2,   $1
	addq   $4,   $8,   $8	# r8 <- c1[i] + c2[i] + c3[i] + d1[i] + carry
	cmpult $8,   $4,   $2
	addq   $1,   $2,   $1

	stq    $7,   0($17)     # store c1[i]
	stq    $8,   0($18)     # store c2[i]
	subq   $23,  1,    $23  # next digits
	addq   $16,  8,    $16
	addq   $17,  8,    $17
	addq   $18,  8,    $18
	addq   $19,  8,    $19
	addq   $20,  8,    $20
	addq   $21,  8,    $21
	bne    $23,  1b

	# continue without c3
	beq    $24,  L(cross_done)
	.align 5
2:
	ldq    $3,   0($16)	# r3 <- c0[i]
	ldq    $4,   0($17)	# r4 <- c1[i]
	ldq    $5,   0($18)	# r5 <- c2[i]
	ldq    $7,   0($20)	# r7 <- d0[i]
	ldq    $8,   0($21)	# r8 <- d1[i]

	addq   $4,   $5,   $4	# r4 <- c1[i] + c2[i]
	cmpult $4,   $5,   $5   # r5 <- carry
	
	addq   $3,   $0,   $3	# r3 <- c0[i] + carry
	cmpult $3,   $0,   $2
	sra    $0,   2,    $0
	addq   $0,   $2,   $0
	addq   $0,   $5,   $0
	addq   $3,   $7,   $7	# r7 <- c0[i] + d0[i] + carry
	cmpult $7,   $3,   $2
	addq   $0,   $2,   $0
	addq   $4,   $7,   $7	# r7 <- c0[i] + c1[i] + c2[i] + d0[i] + carry
	cmpult $7,   $4,   $2
	addq   $0,   $2,   $0

	addq   $4,   $1,   $4	# r4 <- c1[i] + c2[i] + carry
	cmpult $4,   $1,   $2
	sra    $1,   2,    $1
	addq   $1,   $2,   $1
	addq   $1,   $5,   $1
	addq   $4,   $8,   $8	# r8 <- c1[i] + c2[i] + d1[i] + carry
	cmpult $8,   $4,   $2
	addq   $1,   $2,   $1

	stq    $7,   0($17)     # store c1[i]
	stq    $8,   0($18)     # store c2[i]
	subq   $24,  1,    $24  # next digits
	addq   $16,  8,    $16
	addq   $17,  8,    $17
	addq   $18,  8,    $18
	addq   $20,  8,    $20
	addq   $21,  8,    $21
	bne    $24,  2b

L(cross_done):
	# here r17 = &c2, r18 = &c3, r22 = p
	# propagate r0, the carry out of the c1 column, through c2
	beq    $0,   L(carry_c3)
1:
	subq   $22,  1,    $22
	ldq    $2,   0($17)
	addq   $2,   $0,   $2
	cmpult $2,   $0,   $3
	sra    $0,   2,    $0
	addq   $0,   $3,   $0
	stq    $2,   0($17)
	addq   $17,  8,    $17
	beq    $0,   L(carry_c3) # carry absorbed inside c2
	bne    $22,  1b
	addq   $1,   $0,   $1   # carry left c2: merge it into r1

	# c3 <- c3 + r1
	# Every path tests r1 before touching c3: when q+r-p = 0 there is
	# no c3 digit at all and r18 points just past the end of c, so c3
	# must not be accessed while the carry is zero.
L(carry_c3):
	beq    $1,   L(done)
2:
	ldq    $0,   0($18)
	addq   $0,   $1,   $0
	cmpult $0,   $1,   $1
	stq    $0,   0($18)
	addq   $18,  8,    $18
	bne    $1,   2b

	# release the stack frame
	# r21 started at &d1 and advanced by p digits, so it now holds
	# the address just past d, i.e. r30 + 64 + 16p
L(done):
	ldq    $26,  _ra_
	lda    $30,  0($21)
	ret    $31,  ($26),1
	
        #undef  _a_
        #undef  _b_
        #undef  _c_
	#undef  _d_
	#undef  _p_
	#undef  _q_
	#undef  _r_
	#undef  _x_
	#undef  _ra_

        # here lb <= ceil(la/2): a is cut into slices of length lb
        # frame layout (offsets from r30): move, add, ra, l, c, lb, b,
        # la, a, d in the first 10 quadwords, then the save area that
        # starts at _sp_
        # local variables
	#define _sp_    80($30)
        #define _d_     72($30)
        #define _a_     64($30)
        #define _la_    56($30)
        #define _b_     48($30)
        #define _lb_    40($30)
        #define _c_  	32($30)
	#define _l_  	24($30)
        #define _ra_ 	16($30)
	#define _add_  	 8($30)
	#define _move_ 	 0($30)
	
        .align 5
L(tranches):
	s8addq $19,  88,   $1 	# reserve lb+10 digits on the stack
	bic    $1,   15,   $1   # rounded to an even digit count
	subq   $30,  $1,   $30
	stq    $16,  _a_        # save the parameters
	stq    $17,  _la_
	stq    $18,  _b_
	stq    $19,  _lb_
	stq    $31,  _l_
	stq    $26,  _ra_

	# prepare the entry into the unrolled loops
	# (sn_cpuploop and sn_addloop are defined outside this file; the
	# scaling below uses 8 bytes per step for the copy loop and
	# 32 bytes per step for the add loop)
	subq   $31,  $19,  $0	# r0 <- -lb
	and    $0,   31,   $0	# r0 <- (-lb) % 32
	lda    $1,   sn_cpuploop
	s8addq $0,   $1,   $1   # r1 <- jump address for move
	stq    $1,   _move_
	sll    $0,   3,    $0   # r0 <- 8*((-lb) % 32)
	lda    $1,   sn_addloop
	s4addq $0,   $1,   $1   # r1 <- jump address for add
	stq    $1,   _add_
	subq   $20,  $0,   $1   # r1 <- c aligned on a multiple of 32
	stq    $1,   _c_
	lda    $1,   _sp_
	subq   $1,   $0,   $1   # r1 <- d aligned on a multiple of 32
	stq    $1,   _d_
	
        # first multiplication: c <- a[0..lb-1]*b
	bis    $19,  $19,  $17
	bsr    $26,  L(nogp)
	br     $31,  3f

        # subsequent multiplications
        # on entry: r16 = c, r17 = lb, r18 = a, r19 = la (all advanced
        # past the slices already processed)
	.align 5
1:
	stq    $18,  _a_	# save the parameters
	stq    $19,  _la_
	stq    $16,  _c_
	cmpult $17,  $19,  $0	# l <- min(lb,la)
	cmovne $0,   $17,  $19
	stq    $19,  _l_

	# save c[0..lb-1] on the stack
	ldq    $27,  _move_
	ldq    $20,  _d_
	subq   $31,  $17,  $2	# r2 <- -lb
	jsr    $27,  ($27)      # d <- c[0..lb-1]

        # multiplication
	sll    $17,  3,   $0
	subq   $16,  $0,  $20   # r20 <- &c[0]
	ldq    $16,  _b_
	bsr    $26,  L(nogp)

	# add d
	ldq    $16,  _c_
	ldq    $18,  _d_
	ldq    $2,   _lb_
	ldq    $27,  _add_
	subq   $31,  $2,   $2   # r2 <- -lb
	bis    $16,  $16,  $20
	bis    $31,  $31,  $0   # r0 <- 0 (carry)
	jsr    $27,  ($27)	# perform the addition
	ldq    $2,   _l_
	jsr    $27,  sn_incloop # propagate the carry

3:
	ldq    $18,  _a_	# reload the parameters
	ldq    $19,  _la_
	ldq    $17,  _lb_
	ldq    $16,  _c_
	s8addq $17,  $18,  $18  # a  += lb
	s8addq $17,  $16,  $16  # c  += lb
	subq   $19,  $17,  $19  # la -= lb
	bgt    $19,  1b

	addq   $17,  1,   $17   # release the stack frame:
	bic    $17,  1,   $17   # lb rounded up to even, plus the 10 locals
	ldq    $26,  _ra_
	lda    $30,  _sp_
	s8addq $17,  $30,  $30
	ret    $31,  ($26),1
	
        #undef  _sp_
        #undef  _a_
        #undef  _b_
        #undef  _c_
        #undef  _d_
        #undef  _la_
        #undef  _lb_
        #undef  _l_
	#undef  _ra_
	#undef _move_
	#undef _add_

#ifdef debug_karamul
	.end sn_karamul_buggy
#else
	.end sn_karamul
#endif
#undef L
#endif /* assembly_sn_karamul */
#if !defined(assembly_sn_karamul) || defined(debug_karamul)
	REPLACE(sn_karamul)
#endif

                                 # +---------+
                                 # |  Square |
                                 # +---------+

   #  void xn(karasqr)(chiffre *a, long la, chiffre *c)
   #
   #  input:
   #  a = natural number of length la
   #  c = natural number of length 2*la, not overlapping a
   #  constraints: 0 < la
   #
   #  output:
   #  c <- a^2

#ifdef assembly_sn_karasqr
#define L(x) .Lsn_karasqr_##x

        # Register arguments on entry, as used below:
        #   r16 = a, r17 = la, r18 = c
        # Operands of at most karasqr_lim digits go to the quadratic
        # routine. Larger ones are split as a = a0 + B^p*a1, the three
        # squares |a0-a1|^2, a0^2 and a1^2 are computed recursively, and
        # the recombination is done by the tail shared with karamul.
        .align 5
#ifdef debug_karamul
        .globl sn_karasqr_buggy
        .ent   sn_karasqr_buggy
sn_karasqr_buggy:
        .frame $30,0,$26,0
        .prologue 1
	ldgp   $gp,  0($27)
#else
        .globl sn_karasqr
        .ent   sn_karasqr
sn_karasqr:
        .frame $30,0,$26,0
        .prologue 1
	ldgp   $gp,  0($27)
L(nogp):
#endif

        # small square => quadratic (n^2) algorithm
	cmpule $17,  karasqr_lim, $0
	bne    $0,   .Lsn_sqr_n2_nogp

	# Karatsuba split
	# the frame uses the same slot offsets as the one built by
	# karamul, because the code at .Lsn_kara_aux reads these slots
	# local variables
	#define _d_  64($30)
        #define _a_  56($30)
        #define _b_  48($30)
        #define _c_  40($30)
	#define _p_  32($30)
	#define _q_  24($30)
	#define _r_  16($30)
	#define _x_   8($30)
	#define _ra_  0($30)

	addq   $17,  1,    $0   # r0 <- p = ceil(la/2)
	srl    $0,   1,    $0
	subq   $17,  $0,   $2 	# r2 <- q = floor(la/2)
	sll    $0,   4,    $3   # reserve 2p+8 digits on the stack
	addq   $3,   64,   $3
	subq   $30,  $3,   $30
	stq    $16,  _a_        # save the parameters
	stq    $16,  _b_        # b is a itself
	stq    $18,  _c_
	stq    $0,   _p_
	stq    $2,   _q_
	stq    $2,   _r_        # r = q since both operands are a
	stq    $26,  _ra_

	# compute |a0-a1| into c
	bis    $18,  $18,  $20  # r20 <- c
	bis    $0,   $0,   $17  # r17 <- p
	s8addq $0,   $16,  $18	# r18 <- &a1
	bis    $2,   $2,   $19  # r19 <- q
	bsr    $26,  .Lsn_asub_nogp # c[0..p-1] <- |a0-a1|
	stq    $31,  _x_        # x <- 0: a square is never negative, so
	                        # the shared tail must subtract d
	
	# compute |a0-a1|^2, a0^2, a1^2
	ldq    $16,  _c_
	ldq    $17,  _p_
	lda    $18,  _d_
	bsr    $26,  L(nogp)    # d <- |a0-a1|^2

	ldq    $16,  _a_	# r16 <- &a0
	ldq    $17,  _p_
	ldq    $18,  _c_
	bsr    $26,  L(nogp)    # c[0..2p-1] <- a0^2
	
	ldq    $17,  _p_
	ldq    $16,  _a_
	ldq    $18,  _c_
	s8addq $17,  $16,  $16  # r16 <- &a1
	s8addq $17,  $18,  $18  # r18 <- &c[2p]
	s8addq $17,  $18,  $18
	ldq    $17,  _q_
	bsr    $26,  L(nogp)    # c[2p..2p+q+r-1] <- a1^2

	# continue with the karamul tail, which recombines the three
	# products, releases the frame and returns through the saved ra
	br     $31,  .Lsn_kara_aux

        #undef  _a_
        #undef  _b_
        #undef  _c_
	#undef  _d_
	#undef  _p_
	#undef  _q_
	#undef  _r_
	#undef  _x_
	#undef  _ra_

#ifdef debug_karamul
	.end sn_karasqr_buggy
#else
	.end sn_karasqr
#endif
#undef L
#endif /* assembly_sn_karasqr */
#if !defined(assembly_sn_karasqr) || defined(debug_karamul)
	REPLACE(sn_karasqr)
#endif
