// Copyright 2019 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// Based on CRYPTOGAMS code with the following comment:
// # ====================================================================
// # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
// # project. The module is, however, dual licensed under OpenSSL and
// # CRYPTOGAMS licenses depending on where you obtain it. For further
// # details see http://www.openssl.org/~appro/cryptogams/.
// # ====================================================================

// Code for the perl script that generates the ppc64 assembler
// can be found in the cryptogams repository at the link below. It is based on
// the original from openssl.

// https://github.com/dot-asm/cryptogams/commit/a60f5b50ed908e91

// The differences in this and the original implementation are
// due to the calling conventions and initialization of constants.
|
|
|
|
//go:build gc && !purego
|
|
// +build gc,!purego
|
|
|
|
#include "textflag.h"
|
|
|
|
// Register aliases used by chaCha20_ctr32_vsx.
//
// OUT, INP, LEN, KEY and CNT receive the five Go arguments
// (out, inp, len, key, counter) in that order.
// KEY is reused as a scratch byte register once the key has been
// loaded into vector registers; TMP is scratch for the byte-wise tail.
#define OUT	R3
#define INP	R4
#define LEN	R5
#define KEY	R6
#define CNT	R7
#define TMP	R15

// CONSTBASE addresses the consts<> table; BLOCKS holds len/64, the
// amount by which the caller's counter is advanced on return.
#define CONSTBASE	R16
#define BLOCKS	R17
|
|
|
|
// consts<> is the read-only constant pool for chaCha20_ctr32_vsx (0xa0 bytes).

// 0x00: the four 32-bit constant words 0x61707865, 0x3320646e, 0x79622d32,
// 0x6b206574 (the ASCII text "expand 32-byte k" when stored little-endian).
// Loaded once into V16 and added back to row 0 of every finished block.
DATA consts<>+0x00(SB)/8, $0x3320646e61707865
DATA consts<>+0x08(SB)/8, $0x6b20657479622d32

// 0x10-0x4f: carried over from the CRYPTOGAMS constant pool.
// NOTE(review): nothing in chaCha20_ctr32_vsx appears to read these entries.
DATA consts<>+0x10(SB)/8, $0x0000000000000001
DATA consts<>+0x18(SB)/8, $0x0000000000000000
DATA consts<>+0x20(SB)/8, $0x0000000000000004
DATA consts<>+0x28(SB)/8, $0x0000000000000000
DATA consts<>+0x30(SB)/8, $0x0a0b08090e0f0c0d
DATA consts<>+0x38(SB)/8, $0x0203000106070405
DATA consts<>+0x40(SB)/8, $0x090a0b080d0e0f0c
DATA consts<>+0x48(SB)/8, $0x0102030005060704

// 0x50-0x8f: each constant word replicated into all four 32-bit lanes,
// one 16-byte vector per word. Loaded into V0-V3 at the top of the outer loop.
DATA consts<>+0x50(SB)/8, $0x6170786561707865
DATA consts<>+0x58(SB)/8, $0x6170786561707865
DATA consts<>+0x60(SB)/8, $0x3320646e3320646e
DATA consts<>+0x68(SB)/8, $0x3320646e3320646e
DATA consts<>+0x70(SB)/8, $0x79622d3279622d32
DATA consts<>+0x78(SB)/8, $0x79622d3279622d32
DATA consts<>+0x80(SB)/8, $0x6b2065746b206574
DATA consts<>+0x88(SB)/8, $0x6b2065746b206574

// 0x90: the word values 0, 1, 2, 3 - added to the splatted block counter so
// that each lane works on a consecutive block.
DATA consts<>+0x90(SB)/8, $0x0000000100000000
DATA consts<>+0x98(SB)/8, $0x0000000300000002

GLOBL consts<>(SB), RODATA, $0xa0
|
|
|
|
// func chaCha20_ctr32_vsx(out, inp *byte, len int, key *[8]uint32, counter *uint32)
//
// XORs len bytes at inp with ChaCha20 key stream and writes the result to out.
//
// Four 64-byte blocks are computed per outer iteration: during the rounds
// each of V0-V15 holds one state word for four consecutive blocks (one block
// per 32-bit lane). The state is then transposed so that V0/V4/V8/V12,
// V1/V5/V9/V13, ... each hold the four rows of a single block.
//
// counter must point at four consecutive words (block counter followed by
// three nonce words); all four are loaded with a single 16-byte load.
// On return *counter has been advanced by len/64; a trailing partial block
// is processed but not counted.
//
// Long-lived registers:
//   V16       constant row
//   V17, V18  key
//   V19       {0, nonce0, nonce1, nonce2}
//   V26       per-lane block counters
//   V27-V30   rotate amounts inside the round loop, scratch elsewhere
//   R8-R10    byte offsets 16, 32, 48 for indexed loads and stores
//
// The 64-byte frame is used as a bounce buffer for a final partial block.
TEXT ·chaCha20_ctr32_vsx(SB),NOSPLIT,$64-40
	MOVD out+0(FP), OUT
	MOVD inp+8(FP), INP
	MOVD len+16(FP), LEN
	MOVD key+24(FP), KEY
	MOVD counter+32(FP), CNT

	// Nothing to do for an empty request. Without this guard a zero length
	// reaches tail_vsx with CTR = 0, and the decrement-then-test loop branch
	// there would wrap CTR around instead of terminating.
	CMP  LEN, $0
	BEQ  return_vsx

	// Addressing for constants
	MOVD $consts<>+0x00(SB), CONSTBASE
	MOVD $16, R8
	MOVD $32, R9
	MOVD $48, R10
	MOVD $64, R11
	SRD  $6, LEN, BLOCKS	// BLOCKS = len / 64

	// V16 = constant row
	LXVW4X (CONSTBASE)(R0), VS48
	ADD    $80, CONSTBASE	// CONSTBASE now addresses the splatted constants at consts+0x50

	// Load key into V17,V18
	LXVW4X (KEY)(R0), VS49
	LXVW4X (KEY)(R8), VS50

	// Load CNT, NONCE into V19
	LXVW4X (CNT)(R0), VS51

	// Clear V27
	VXOR V27, V27, V27

	// V28 = lane offsets {0, 1, 2, 3} from consts+0x90 (CONSTBASE + 64)
	LXVW4X (CONSTBASE)(R11), VS60

	// splat the counter word of V19 into every lane of V26
	VSPLTW $0, V19, V26

	// Shift the counter word out of V19 and shift a zero word back in, so
	// that V19 = {0, nonce0, nonce1, nonce2}; the counter lives in V26 only.
	VSLDOI $4, V19, V27, V19
	VSLDOI $12, V27, V19, V19

	// V26 = {ctr, ctr+1, ctr+2, ctr+3}
	VADDUWM V26, V28, V26

	// CTR = 10 iterations of loop_vsx (each is one column + one diagonal round)
	MOVD $10, R14
	MOVD R14, CTR

loop_outer_vsx:
	// V0, V1, V2, V3 = constant words, one word splatted per register
	LXVW4X (R0)(CONSTBASE), VS32
	LXVW4X (R8)(CONSTBASE), VS33
	LXVW4X (R9)(CONSTBASE), VS34
	LXVW4X (R10)(CONSTBASE), VS35

	// splat values from V17, V18 into V4-V11
	VSPLTW $0, V17, V4
	VSPLTW $1, V17, V5
	VSPLTW $2, V17, V6
	VSPLTW $3, V17, V7
	VSPLTW $0, V18, V8
	VSPLTW $1, V18, V9
	VSPLTW $2, V18, V10
	VSPLTW $3, V18, V11

	// V12 = copy of the per-lane block counters
	VOR V26, V26, V12

	// splat values from V19 -> V13, V14, V15
	VSPLTW $1, V19, V13
	VSPLTW $2, V19, V14
	VSPLTW $3, V19, V15

	// Splat the rotate amounts. VRLW only uses the low five bits of each
	// element, so -16 acts as a rotate by 16 (splat immediates are limited
	// to the range -16..15).
	VSPLTISW $-16, V27
	VSPLTISW $12, V28
	VSPLTISW $8, V29
	VSPLTISW $7, V30

loop_vsx:
	// ---- Column round: quarter rounds on (V0,V4,V8,V12) ... (V3,V7,V11,V15)

	// a += b
	VADDUWM V0, V4, V0
	VADDUWM V1, V5, V1
	VADDUWM V2, V6, V2
	VADDUWM V3, V7, V3

	// d ^= a
	VXOR V12, V0, V12
	VXOR V13, V1, V13
	VXOR V14, V2, V14
	VXOR V15, V3, V15

	// d <<<= 16
	VRLW V12, V27, V12
	VRLW V13, V27, V13
	VRLW V14, V27, V14
	VRLW V15, V27, V15

	// c += d
	VADDUWM V8, V12, V8
	VADDUWM V9, V13, V9
	VADDUWM V10, V14, V10
	VADDUWM V11, V15, V11

	// b ^= c
	VXOR V4, V8, V4
	VXOR V5, V9, V5
	VXOR V6, V10, V6
	VXOR V7, V11, V7

	// b <<<= 12
	VRLW V4, V28, V4
	VRLW V5, V28, V5
	VRLW V6, V28, V6
	VRLW V7, V28, V7

	// a += b
	VADDUWM V0, V4, V0
	VADDUWM V1, V5, V1
	VADDUWM V2, V6, V2
	VADDUWM V3, V7, V3

	// d ^= a
	VXOR V12, V0, V12
	VXOR V13, V1, V13
	VXOR V14, V2, V14
	VXOR V15, V3, V15

	// d <<<= 8
	VRLW V12, V29, V12
	VRLW V13, V29, V13
	VRLW V14, V29, V14
	VRLW V15, V29, V15

	// c += d
	VADDUWM V8, V12, V8
	VADDUWM V9, V13, V9
	VADDUWM V10, V14, V10
	VADDUWM V11, V15, V11

	// b ^= c
	VXOR V4, V8, V4
	VXOR V5, V9, V5
	VXOR V6, V10, V6
	VXOR V7, V11, V7

	// b <<<= 7
	VRLW V4, V30, V4
	VRLW V5, V30, V5
	VRLW V6, V30, V6
	VRLW V7, V30, V7

	// ---- Diagonal round: quarter rounds on (V0,V5,V10,V15), (V1,V6,V11,V12),
	// (V2,V7,V8,V13), (V3,V4,V9,V14)

	// a += b
	VADDUWM V0, V5, V0
	VADDUWM V1, V6, V1
	VADDUWM V2, V7, V2
	VADDUWM V3, V4, V3

	// d ^= a
	VXOR V15, V0, V15
	VXOR V12, V1, V12
	VXOR V13, V2, V13
	VXOR V14, V3, V14

	// d <<<= 16
	VRLW V15, V27, V15
	VRLW V12, V27, V12
	VRLW V13, V27, V13
	VRLW V14, V27, V14

	// c += d
	VADDUWM V10, V15, V10
	VADDUWM V11, V12, V11
	VADDUWM V8, V13, V8
	VADDUWM V9, V14, V9

	// b ^= c
	VXOR V5, V10, V5
	VXOR V6, V11, V6
	VXOR V7, V8, V7
	VXOR V4, V9, V4

	// b <<<= 12
	VRLW V5, V28, V5
	VRLW V6, V28, V6
	VRLW V7, V28, V7
	VRLW V4, V28, V4

	// a += b
	VADDUWM V0, V5, V0
	VADDUWM V1, V6, V1
	VADDUWM V2, V7, V2
	VADDUWM V3, V4, V3

	// d ^= a
	VXOR V15, V0, V15
	VXOR V12, V1, V12
	VXOR V13, V2, V13
	VXOR V14, V3, V14

	// d <<<= 8
	VRLW V15, V29, V15
	VRLW V12, V29, V12
	VRLW V13, V29, V13
	VRLW V14, V29, V14

	// c += d
	VADDUWM V10, V15, V10
	VADDUWM V11, V12, V11
	VADDUWM V8, V13, V8
	VADDUWM V9, V14, V9

	// b ^= c
	VXOR V5, V10, V5
	VXOR V6, V11, V6
	VXOR V7, V8, V7
	VXOR V4, V9, V4

	// b <<<= 7
	VRLW V5, V30, V5
	VRLW V6, V30, V6
	VRLW V7, V30, V7
	VRLW V4, V30, V4

	// BO=16: decrement CTR and loop while it is non-zero (the condition
	// operand is ignored for this form).
	BC   16, LT, loop_vsx

	// Add the per-lane block counters back into the counter word. The other
	// input words are added after the transpose, from V16-V19.
	VADDUWM V12, V26, V12

	// Transpose the 4x4 word matrices held in V0-V3, V4-V7, V8-V11 and
	// V12-V15 so that each register ends up holding one row of one block.
	// VMRGEW/VMRGOW are emitted as raw words; V27-V30 are scratch here.
	WORD $0x13600F8C		// VMRGEW V0, V1, V27
	WORD $0x13821F8C		// VMRGEW V2, V3, V28

	WORD $0x10000E8C		// VMRGOW V0, V1, V0
	WORD $0x10421E8C		// VMRGOW V2, V3, V2

	WORD $0x13A42F8C		// VMRGEW V4, V5, V29
	WORD $0x13C63F8C		// VMRGEW V6, V7, V30

	XXPERMDI VS32, VS34, $0, VS33
	XXPERMDI VS32, VS34, $3, VS35
	XXPERMDI VS59, VS60, $0, VS32
	XXPERMDI VS59, VS60, $3, VS34

	WORD $0x10842E8C		// VMRGOW V4, V5, V4
	WORD $0x10C63E8C		// VMRGOW V6, V7, V6

	WORD $0x13684F8C		// VMRGEW V8, V9, V27
	WORD $0x138A5F8C		// VMRGEW V10, V11, V28

	XXPERMDI VS36, VS38, $0, VS37
	XXPERMDI VS36, VS38, $3, VS39
	XXPERMDI VS61, VS62, $0, VS36
	XXPERMDI VS61, VS62, $3, VS38

	WORD $0x11084E8C		// VMRGOW V8, V9, V8
	WORD $0x114A5E8C		// VMRGOW V10, V11, V10

	WORD $0x13AC6F8C		// VMRGEW V12, V13, V29
	WORD $0x13CE7F8C		// VMRGEW V14, V15, V30

	XXPERMDI VS40, VS42, $0, VS41
	XXPERMDI VS40, VS42, $3, VS43
	XXPERMDI VS59, VS60, $0, VS40
	XXPERMDI VS59, VS60, $3, VS42

	WORD $0x118C6E8C		// VMRGOW V12, V13, V12
	WORD $0x11CE7E8C		// VMRGOW V14, V15, V14

	// Advance the per-lane counters by four blocks for the next outer pass.
	VSPLTISW $4, V27
	VADDUWM  V26, V27, V26

	XXPERMDI VS44, VS46, $0, VS45
	XXPERMDI VS44, VS46, $3, VS47
	XXPERMDI VS61, VS62, $0, VS44
	XXPERMDI VS61, VS62, $3, VS46

	// ---- Block 0: add the input state to rows V0, V4, V8, V12.
	VADDUWM V0, V16, V0
	VADDUWM V4, V17, V4
	VADDUWM V8, V18, V8
	VADDUWM V12, V19, V12

	// CR0 from this compare is consumed by BLT here and by the BEQ after the
	// stores below; the ADDs in between do not modify CR0.
	CMPU LEN, $64
	BLT  tail_vsx

	// Bottom of loop: out[0:64] = inp[0:64] ^ key stream
	LXVW4X (INP)(R0), VS59
	LXVW4X (INP)(R8), VS60
	LXVW4X (INP)(R9), VS61
	LXVW4X (INP)(R10), VS62

	VXOR V27, V0, V27
	VXOR V28, V4, V28
	VXOR V29, V8, V29
	VXOR V30, V12, V30

	STXVW4X VS59, (OUT)(R0)
	STXVW4X VS60, (OUT)(R8)
	ADD     $64, INP
	STXVW4X VS61, (OUT)(R9)
	ADD     $-64, LEN
	STXVW4X VS62, (OUT)(R10)
	ADD     $64, OUT
	BEQ     done_vsx		// LEN was exactly 64 before the subtraction

	// ---- Block 1: rows V1, V5, V9, V13, moved into V0, V4, V8, V12 so the
	// tail path always finds the current block in the same registers.
	VADDUWM V1, V16, V0
	VADDUWM V5, V17, V4
	VADDUWM V9, V18, V8
	VADDUWM V13, V19, V12

	CMPU LEN, $64
	BLT  tail_vsx

	LXVW4X (INP)(R0), VS59
	LXVW4X (INP)(R8), VS60
	LXVW4X (INP)(R9), VS61
	LXVW4X (INP)(R10), VS62

	VXOR V27, V0, V27
	VXOR V28, V4, V28
	VXOR V29, V8, V29
	VXOR V30, V12, V30

	STXVW4X VS59, (OUT)(R0)
	STXVW4X VS60, (OUT)(R8)
	ADD     $64, INP
	STXVW4X VS61, (OUT)(R9)
	ADD     $-64, LEN
	STXVW4X VS62, (OUT)(R10)
	ADD     $64, OUT
	BEQ     done_vsx

	// ---- Block 2: rows V2, V6, V10, V14.
	VADDUWM V2, V16, V0
	VADDUWM V6, V17, V4
	VADDUWM V10, V18, V8
	VADDUWM V14, V19, V12

	CMPU LEN, $64
	BLT  tail_vsx

	LXVW4X (INP)(R0), VS59
	LXVW4X (INP)(R8), VS60
	LXVW4X (INP)(R9), VS61
	LXVW4X (INP)(R10), VS62

	VXOR V27, V0, V27
	VXOR V28, V4, V28
	VXOR V29, V8, V29
	VXOR V30, V12, V30

	STXVW4X VS59, (OUT)(R0)
	STXVW4X VS60, (OUT)(R8)
	ADD     $64, INP
	STXVW4X VS61, (OUT)(R9)
	ADD     $-64, LEN
	STXVW4X VS62, (OUT)(R10)
	ADD     $64, OUT
	BEQ     done_vsx

	// ---- Block 3: rows V3, V7, V11, V15.
	VADDUWM V3, V16, V0
	VADDUWM V7, V17, V4
	VADDUWM V11, V18, V8
	VADDUWM V15, V19, V12

	CMPU LEN, $64
	BLT  tail_vsx

	LXVW4X (INP)(R0), VS59
	LXVW4X (INP)(R8), VS60
	LXVW4X (INP)(R9), VS61
	LXVW4X (INP)(R10), VS62

	VXOR V27, V0, V27
	VXOR V28, V4, V28
	VXOR V29, V8, V29
	VXOR V30, V12, V30

	STXVW4X VS59, (OUT)(R0)
	STXVW4X VS60, (OUT)(R8)
	ADD     $64, INP
	STXVW4X VS61, (OUT)(R9)
	ADD     $-64, LEN
	STXVW4X VS62, (OUT)(R10)
	ADD     $64, OUT

	// Re-arm the round counter, then go round again unless LEN was exactly
	// 64 before the last subtraction (CR0 still holds that CMPU result;
	// neither MOVD changes it).
	MOVD $10, R14
	MOVD R14, CTR
	BNE  loop_outer_vsx

done_vsx:
	// Increment counter by number of 64 byte blocks.
	// counter is a *uint32, so use 32-bit accesses: a 64-bit load/store
	// would also read and rewrite the word that follows the counter.
	MOVWZ (CNT), R14
	ADD   BLOCKS, R14
	MOVWZ R14, (CNT)

return_vsx:
	RET

tail_vsx:
	// Fewer than 64 bytes remain. R11 = start of the local frame, used as a
	// 64-byte buffer; CTR = number of bytes left to process.
	ADD  $32, R1, R11
	MOVD LEN, CTR

	// Save values on stack to copy from
	STXVW4X VS32, (R11)(R0)
	STXVW4X VS36, (R11)(R8)
	STXVW4X VS40, (R11)(R9)
	STXVW4X VS44, (R11)(R10)

	// Bias all three pointers by -1 because the update-form byte accesses
	// below advance the pointer before using it.
	ADD $-1, R11, R12
	ADD $-1, INP
	ADD $-1, OUT

looptail_vsx:
	// Copying the result to OUT
	// in bytes. KEY is free to use as scratch here: the key was loaded
	// into V17/V18 at entry.
	MOVBZU 1(R12), KEY
	MOVBZU 1(INP), TMP
	XOR    KEY, TMP, KEY
	MOVBU  KEY, 1(OUT)
	BC     16, LT, looptail_vsx

	// Clear the stack values: overwrite the saved key stream with the
	// contents of VS48 (V16, the constant row) so none of it is left in
	// the frame.
	STXVW4X VS48, (R11)(R0)
	STXVW4X VS48, (R11)(R8)
	STXVW4X VS48, (R11)(R9)
	STXVW4X VS48, (R11)(R10)
	BR      done_vsx