// file kernel/n/ppc32/karatsuba.S: Karatsuba multiplication
/*-----------------------------------------------------------------------+
 |  Copyright 2005-2006, Michel Quercia (michel.quercia@prepas.org)      |
 |                                                                       |
 |  This file is part of Numerix. Numerix is free software; you can      |
 |  redistribute it and/or modify it under the terms of the GNU Lesser   |
 |  General Public License as published by the Free Software Foundation; |
 |  either version 2.1 of the License, or (at your option) any later     |
 |  version.                                                             |
 |                                                                       |
 |  The Numerix Library is distributed in the hope that it will be       |
 |  useful, but WITHOUT ANY WARRANTY; without even the implied warranty  |
 |  of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU  |
 |  Lesser General Public License for more details.                      |
 |                                                                       |
 |  You should have received a copy of the GNU Lesser General Public     |
 |  License along with the GNU MP Library; see the file COPYING. If not, |
 |  write to the Free Software Foundation, Inc., 59 Temple Place -       |
 |  Suite 330, Boston, MA 02111-1307, USA.                               |
 +-----------------------------------------------------------------------+
 |                                                                       |
 |                     Multiplication de Karatsuba                       |
 |                                                                       |
 +-----------------------------------------------------------------------*/

                            ; +------------------+
                            ; |  Multiplication  |
                            ; +------------------+
        

   ;  void xn(karamul)(chiffre *a, long la, chiffre *b, long lb, chiffre *c)
   ;
   ;  input:
   ;  a = natural number of length la
   ;  b = natural number of length lb
   ;  c = natural number of length la+lb, distinct from a and b
   ;  constraints: 0 < lb <= la
   ;
   ;  output:
   ;  c <- a*b

#ifdef assembly_sn_karamul
#define L(x) Lsn_karamul_##x

; Register interface, as read by the code below:
;   r3 = a,  r4 = la,  r5 = b,  r6 = lb,  r7 = c
; Three cases are distinguished:
;   lb <= karamul_lim : branch to Lsn_mul_n2 (plain branch, lr untouched)
;   lb <= ceil(la/2)  : cut a into slices of lb chiffres
;   otherwise         : Karatsuba split of a and b at p = ceil(la/2)
; NOTE(review): in the debug_karamul build only the _buggy symbol is defined
; here, so the recursive calls to Lsn_karamul must be resolved elsewhere,
; presumably by the REPLACE macro at the end of this section; confirm.
#ifdef debug_karamul
.globl _sn_karamul_buggy
_sn_karamul_buggy:
#else
.globl _sn_karamul
_sn_karamul:
Lsn_karamul:
#endif

	cmpwi  cr0,   r6,   karamul_lim ; small multiplication ?
	ble    Lsn_mul_n2       ; => quadratic algorithm
	
	addi   r8,   r4,   1    ; r8 <- p = ceil(la/2)
	srawi  r8,   r8,   1
	subf.  r9,   r8,   r6   ; r9 <- r = lb - p
	ble    L(tranches)      ; if lb <= p, cut a into slices

	; here lb > ceil(la/2) : Karatsuba split
	;   a = a0 + BASE^p*a1, a0 of p chiffres, a1 of q = la - p chiffres
	;   b = b0 + BASE^p*b1, b0 of p chiffres, b1 of r = lb - p chiffres
	; local variables
	;   d  : scratch area of 2p chiffres, at offset 36 in the new frame
	;   ra : stack slot for the return address, at offset 32
	;   x  : sign flag, zero => the product of differences is subtracted
	;        in the final step, nonzero => it is added
	#define _d_  36(r1)
        #define _a_  r31
        #define _b_  r30
        #define _c_  r29
	#define _p_  r28
	#define _q_  r27
	#define _r_  r26
	#define _x_  r25
	#define _ra_ 32(r1)

	subf   r10,  r8,   r4 	; r10 <- q = floor(la/2)
	stmw   r25,  4(r1)	; save r25-r31 at 4(r1)..31(r1), before r1 moves
	slwi   r11,  r8,    3	; reserve 2p chiffres + lr + stack frame,
	addi   r11,  r11,  48
	clrrwi r11,  r11,   4	; rounded to a multiple of 16 bytes
	neg    r11,  r11
	stwux  r1,   r1,  r11	; allocate the frame, old r1 stored at 0(r1)
	mflr   r0
	stw    r0,   _ra_	; return address goes into the new frame

	mr     _a_,  r3         ; keep the parameters
	mr     _b_,  r5
	mr     _c_,  r7
	mr     _p_,  r8
	mr     _q_,  r10
	mr     _r_,  r9

	; compute |a0-a1| and |b0-b1| into c
	; r3 still holds a from the entry, hence the disabled move below
/*	mr     r3,   _a_ */
	mr     r4,   _p_
	slwi   r5,   _p_,   2	; r5 <- &a1
	add    r5,   r5,   _a_
	mr     r6,   _q_
	mr     r7,   _c_
	bl     Lsn_asub		; c[0..p-1] <- |a0-a1|
	; NOTE(review): the sign is read from r2 after Lsn_asub, both here and
	; after the second call; confirm this against Lsn_asub
	mr     _x_,  r2         ; x <- sgn(a0-a1)
	mr     r3,   _b_
	mr     r4,   _p_
	slwi   r5,   _p_,   2	; r5 <- 4p
	mr     r6,   _r_
	add    r7,   r5,   _c_	; r7 <- &c[p]
	add    r5,   r5,   _b_	; r5 <- &b1
	bl     Lsn_asub		; c[p..2p-1] <- |b0-b1|
	xor    _x_,  _x_,  r2   ; x <- sgn((a0-a1)*(b0-b1))
	
	; compute |a0-a1|*|b0-b1|, a0*b0, a1*b1
	; the first product must be taken before c is overwritten by a0*b0
	mr     r3,   _c_
	mr     r4,   _p_
	slwi   r5,   _p_,   2	; r5 <- 4p
	add    r5,   r5,   _c_	; r5 <- &c[p]
	mr     r6,   _p_
	la     r7,   _d_
	bl     Lsn_karamul      ; d <- |a0-a1|*|b0-b1|
	
	mr     r3,   _a_
	mr     r4,   _p_
	mr     r5,   _b_
	mr     r6,   _p_
	mr     r7,   _c_
	bl     Lsn_karamul      ; c[0..2p-1] <- a0*b0
	
	slwi   r4,   _p_,   2	; r4 <- 4p
	add    r3,   _a_,  r4	; r3 <- &a1
	add    r5,   _b_,  r4	; r5 <- &b1
	add    r7,   _c_,  r4	; r7 <- &c[2p]
	add    r7,   r7,   r4
	mr     r4,   _q_
	mr     r6,   _r_
	bl     Lsn_karamul      ; c[2p..2p+q+r-1] <- a1*b1
	add    _q_,  _q_,  _r_	; q <- q+r, length of a1*b1, used by the tail

        ; landing point for karasqr
        ; c += (a0b0 + a1b1)*BASE^p
        ;
        ; State expected on arrival, set up by karamul or by karasqr:
        ;   c[0..2p-1]         = low product  (a0*b0)
        ;   c[2p..2p+q+r-1]    = high product (a1*b1), where q+r denotes
        ;                        the value now held in register q
        ;   d, 2p chiffres     = product of the absolute differences
        ;   x                  = zero to subtract d, nonzero to add it
        ;   32(r1)             = return address
        ;   r25-r31            = saved at offset 4 of the previous frame
        ; From here on registers r and b are reused to hold carries.
        ; NOTE(review): this label sits inside the assembly_sn_karamul
        ; conditional although karasqr branches to it; confirm that both
        ; assembly versions are always enabled together.
Lsn_kara_aux:
	mr     r4,   _p_
	slwi   r6,   _p_,   2	; r6 <- 4p
	add    r5,   _c_,  r6	; r5 <- &c[p]
	add    r3,   r5,   r6	; r3 <- &c[2p]
	mr     r6,   _p_
	bl     Lsn_inc		; c[2p..3p-1] += c[p..2p-1]
	mr     _r_,  r3 	; r <- carry out of c[3p-1]

	mr     r3,   _c_
	mr     r4,   _p_
	slwi   r6,   _p_,   2	; r6 <- 4p
	add    r7,   _c_,  r6	; r7 <- &c[p]
	add    r5,   r7,   r6	; r5 <- &c[2p]
	mr     r6,   _p_
	bl     Lsn_add		; c[p..2p-1] <- c[0..p-1] + c[2p..3p-1]
	add   _b_,   r3,  _r_	; b <- carry into c[2p] (this carry + r)

	mr     r4,   _q_	; r4 <- q+r
	subf   r6,   _p_,  _q_	; r6 <- q+r-p
	slwi   r7,   _p_,   2	; r7 <- 4p
	add    r3,   _c_,  r7	; r3 <- &c[2p]
	add    r3,   r3,   r7
	add    r5,   r3,   r7	; r5 <- &c[3p]
	bl     Lsn_inc		;  c[2p..2p+q+r-1] += c[3p..2p+q+r-1]
	; the carry returned by this call is not used

        ; propagate the carries
        ; Each loop adds the pending carry to one chiffre: addc adds it,
        ; li clears the register while leaving CA alone, addze. turns CA
        ; into the new carry and sets cr0. bdnzf eq keeps looping while
        ; the count is not exhausted and the carry is still nonzero.
	slwi  r6,    _p_,   2	; r6 <- 4p
	add   r3,    _c_,  r6	; r3 <- &c[2p-1]
	add   r3,    r3,   r6
	subi  r3,    r3,   4
	add   r5,    r3,   r6	; r5 <- &c[3p-1]
	subf. r4,    _p_,  _q_	; r4 <- q+r-p
	beq   2f		; no chiffre above c[3p-1] : skip the first loop
	mtctr r4
1:
	lwz   r4,   4(r5)	; propagate carry r starting at c[3p]
	addc  r4,   r4,   _r_
	li    _r_,  0
	stwu  r4,   4(r5)
	addze. _r_, _r_
	bdnzf  eq,  1b
2:
	mtctr _q_
1:
	lwz   r4,   4(r3)	; propagate carry b starting at c[2p]
	addc  r4,   r4,   _b_
	li    _b_,  0
	stwu  r4,   4(r3)
	addze. _b_, _b_
	bdnzf  eq,  1b
	
        ; c[p..2p+q+r-1] -= (a0 - a1)*(b0 - b1)
        ; r6 = 2p serves twice: added to c two times it yields the byte
        ; offset 4p of c[p], and it is also the length of d for the call
	add   r4,    _q_,  _p_	; r4 <- p+q+r
	slwi  r6,    _p_,   1	; r6 <- 2p
	add   r3,    _c_,  r6	; r3 <- &c[p]
	add   r3,    r3,   r6
	la    r5,    _d_
	and.  _x_,   _x_,  _x_	; test the sign flag
	bne   1f
	bl    Lsn_dec		; x = 0 : subtract d
	b     2f
1:
	bl    Lsn_inc		; x != 0 : add d
2:
	lwz   r0,    _ra_	; reload the return address
	mtlr  r0
	lwz   r1,    0(r1)	; pop the stack frame
	lmw   r25,   4(r1)	; restore r25-r31
	blr
	
        #undef  _a_
        #undef  _b_
        #undef  _c_
	#undef  _d_
	#undef  _p_
	#undef  _q_
	#undef  _r_
	#undef  _x_
	#undef  _ra_

        ; here lb <= ceil(la/2) : cut a into slices of length lb
        ; The code below is copied word for word into toommul.
        ; Remember to carry over any update !
        ;
        ; On arrival the registers still hold the entry values:
        ;   r3 = a,  r4 = la,  r5 = b,  r6 = lb,  r7 = c
L(tranches):

        # local variables
        ;   d  : scratch area of lb chiffres, at offset 32 in the new frame
        ;   ra : return address, kept in a register rather than on the stack
        #define _d_  32(r1)
        #define _a_  r31
        #define _b_  r30
        #define _c_  r29
        #define _la_ r28
        #define _lb_ r27
        #define _ra_ r26

	stmw   r26, 4(r1)	; save r26-r31 at 4(r1)..27(r1), before r1 moves
	mflr   _ra_		; ra <- return address

	slwi   r8,   r6,   2	; reserve lb chiffres + stack frame, rounded
	addi   r8,   r8,  44	; to a multiple of 16 bytes
	clrrwi r8,   r8,   4
	neg    r8,   r8
	stwux  r1,   r1,  r8	; allocate the frame, old r1 stored at 0(r1)
	mr     _a_,  r3
	mr     _b_,  r5
	mr     _c_,  r7
	mr     _la_, r4
	mr     _lb_, r6

        ; first multiplication : c <- a[0..(la % lb)-1]*b
        ; b is passed as the first operand because it is the longer one
	mr     r4,   _lb_
	divwu  r8,   _la_, _lb_	; r6 <- l = la % lb
	mullw  r8,   r8,   _lb_
	subf.  r6,   r8,   _la_
	bne    1f		; if la is a multiple of lb, r6 <- lb
	mr     r6,   _lb_
1:
	mr     r3,   _b_
	mr     r5,   _a_
	; r7 still holds c from the entry, hence the disabled move below
/*	mr     r7,   _c_ */
	slwi   r8,   r6,   2
	add    _a_,  _a_,  r8	; a  += l
	add    _c_,  _c_,  r8	; c  += l
	subf   _la_, r6,   _la_	; la -= l, now a multiple of lb
	bl     Lsn_karamul

        ; following multiplications
        ; c[0..lb-1] holds the upper lb chiffres of the previous product.
        ; They are copied to d because the next product overwrites
        ; c[0..2lb-1], then added back afterwards.
        ; The body runs before la is tested, so at least one full slice
        ; must remain here. NOTE(review): this holds whenever la > lb.
L(loop):
	mr     r4,   _lb_
	slwi   r8,   _lb_,  2	; r8 <- 4*lb
	mr     r3,   _a_
	add    _a_,  _a_,  r8	; a += lb
	mr     r5,   _b_
	mr     r6,   _lb_
	mr     r7,   _c_
	subi   r10,  _c_,   4	; r10 <- &c[-1]
	la     r11,  28(r1)	; r11 <- &d[-1]
	mtctr  r4		; copy lb chiffres
1:
	lwzu   r8,   4(r10)	; d <- c[0..lb-1]
	stwu   r8,   4(r11)
	bdnz   1b
	bl     Lsn_karamul	;  c[0..2lb-1] <- a[0..lb-1]*b

	mr     r3,   _c_
	la     r5,   _d_
	mr     r6,   _lb_
	slwi   r4,   r6,   1	; r4 <- 2*lb, also the length passed to Lsn_inc
	add    _c_,  _c_,  r4	; c += lb (two additions of 2*lb bytes)
	add    _c_,  _c_,  r4
	bl     Lsn_inc		; c <- c + d
	subf.  _la_, _lb_, _la_ ;  la -= lb
        bne    L(loop)

        ; done
	mtlr  _ra_
	lwz   r1,    0(r1)	; pop the stack frame
	lmw   r26,   4(r1)	; restore r26-r31
	blr

        #undef  _a_
        #undef  _b_
        #undef  _c_
        #undef  _d_
        #undef  _la_
        #undef  _lb_
	#undef  _ra_

#undef L
#endif /* assembly_sn_karamul */
; The REPLACE macro is defined outside this file. NOTE(review): presumably
; it supplies sn_karamul when the assembly version is disabled or is being
; debugged; confirm against its definition.
#if !defined(assembly_sn_karamul) || defined(debug_karamul)
REPLACE(sn_karamul)
#endif

                                 ; +----------+
                                 ; |  Square  |
                                 ; +----------+

   ;  void xn(karasqr)(chiffre *a, long la, chiffre *c)
   ;
   ;  input:
   ;  a = natural number of length la
   ;  c = natural number of length 2*la, distinct from a
   ;  constraints: 0 < la
   ;
   ;  output:
   ;  c <- a^2

#ifdef assembly_sn_karasqr
#define L(x) Lsn_karasqr_##x

; Register interface, as read by the code below:
;   r3 = a,  r4 = la,  r5 = c
; NOTE(review): the debug switch tested in this section is debug_karamul,
; the same one as for the multiplication; confirm that this is intended.
#ifdef debug_karamul
.globl _sn_karasqr_buggy
_sn_karasqr_buggy:
#else
.globl _sn_karasqr
_sn_karasqr:
Lsn_karasqr:
#endif

        ; small square => quadratic algorithm (plain branch, lr untouched)
	cmpwi  cr0,   r4,   karasqr_lim
	ble    Lsn_sqr_n2
	
	; Karatsuba split
	;   a = a0 + BASE^p*a1, a0 of p chiffres, a1 of r = la - p chiffres
	; local variables
	; Same registers and frame offsets as in karamul, since the routine
	; finishes by branching into the shared tail Lsn_kara_aux.
	#define _d_  36(r1)
        #define _a_  r31
        #define _b_  r30
        #define _c_  r29
	#define _p_  r28
	#define _q_  r27
	#define _r_  r26
	#define _x_  r25
	#define _ra_ 32(r1)

	stmw   r25,  4(r1)	; save r25-r31 at 4(r1)..31(r1), before r1 moves
	srawi  _r_,  r4,   1    ; r <- floor(la/2)
	subf   _p_,  _r_,  r4   ; p <- ceil(la/2)
	slwi   r11,  _p_,   3	; reserve 2p chiffres + lr + stack frame,
	addi   r11,  r11,  48
	clrrwi r11,  r11,   4	; rounded to a multiple of 16 bytes
	neg    r11,  r11
	stwux  r1,   r1,  r11	; allocate the frame, old r1 stored at 0(r1)
	mflr   r0
	stw    r0,   _ra_	; return address goes into the new frame

	mr     _a_,  r3         ; keep the parameters
	mr     _c_,  r5
	slwi   _q_,  _r_,   1	; q <- 2r, length of a1^2, used by the tail

	; compute |a0-a1| into c
	; r3 still holds a from the entry, hence the disabled move below
/*	mr     r3,   _a_ */
	mr     r4,   _p_
	slwi   r5,   _p_,   2	; r5 <- &a1
	add    r5,   r5,   _a_
	mr     r6,   _r_
	mr     r7,   _c_
	bl     Lsn_asub		; c[0..p-1] <- |a0-a1|
	li     _x_,  0          ; x <- sgn((a0-a1)^2), never negative
	
	; compute (a0-a1)^2, a0^2, a1^2
	; the first square must be taken before c is overwritten by a0^2
	mr     r3,   _c_
	mr     r4,   _p_
	la     r5,   _d_
	bl     Lsn_karasqr      ; d <- (a0-a1)^2
	
	mr     r3,   _a_
	mr     r4,   _p_
	mr     r5,   _c_
	bl     Lsn_karasqr      ; c[0..2p-1] <- a0^2
	
	slwi   r4,   _p_,   2	; r4 <- 4p
	add    r3,   _a_,  r4	; r3 <- &a1
	add    r5,   _c_,  r4	; r5 <- &c[2p]
	add    r5,   r5,   r4
	mr     r4,   _r_
	bl     Lsn_karasqr      ; c[2p..2p+2r-1] <- a1^2
	b      Lsn_kara_aux	; finish in the tail shared with karamul

        #undef  _a_
        #undef  _b_
        #undef  _c_
	#undef  _d_
	#undef  _p_
	#undef  _q_
	#undef  _r_
	#undef  _x_
	#undef  _ra_

#undef L
#endif /* assembly_sn_karasqr */
; The REPLACE macro is defined outside this file. NOTE(review): presumably
; it supplies sn_karasqr when the assembly version is disabled or is being
; debugged; confirm against its definition.
#if !defined(assembly_sn_karasqr) || defined(debug_karamul)
REPLACE(sn_karasqr)
#endif
