// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2022-2024 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
 */

#include <linux/linkage.h>
#include <asm/frame.h>

.section	.rodata, "a"
.align 16
/* Little-endian bytes of this value spell "expand 32-byte k" (ChaCha row 0). */
CONSTANTS:	.octa 0x6b20657479622d323320646e61707865
.text

/*
 * Very basic SSE2 implementation of ChaCha20. Produces a given positive number
 * of blocks of output with a nonce of 0, taking an input key and 8-byte
 * counter. Importantly does not spill to the stack. Its arguments are:
 *
 *	rdi: output bytes
 *	rsi: 32-byte key input
 *	rdx: 8-byte counter input/output
 *	rcx: number of 64-byte blocks to write to output
 *
 * Syntax: AT&T (GNU as), run through the C preprocessor.
 *
 * Preconditions:
 *	- rcx must be non-zero. The block loop decrements and tests it only at
 *	  the bottom, so a count of 0 would wrap around instead of returning.
 *	- rdi must have room for 64 * rcx bytes.
 *	- No alignment is required of rdi or rsi (accessed with movups) or of
 *	  rdx (accessed with movq).
 *
 * Effects:
 *	- Writes 64 * rcx bytes of keystream starting at rdi.
 *	- Stores the counter, advanced by rcx, back to the 8 bytes at rdx.
 *
 * Clobbers: rax, rcx, rdi, xmm0-xmm9, flags.
 *	On return xmm0-xmm4, xmm6 and xmm7 (working state, key, scratch) are
 *	zeroed; xmm5 (constants), xmm8 (counter row) and xmm9 (the value 1) are
 *	left as they were last written.
 */
SYM_FUNC_START(__arch_chacha20_blocks_nostack)

.set	output,		%rdi
.set	key,		%rsi
.set	counter,	%rdx
.set	nblocks,	%rcx
.set	i,		%al
/* xmm registers are *not* callee-save. */
.set	temp,		%xmm0
.set	state0,		%xmm1
.set	state1,		%xmm2
.set	state2,		%xmm3
.set	state3,		%xmm4
.set	copy0,		%xmm5
.set	copy1,		%xmm6
.set	copy2,		%xmm7
.set	copy3,		%xmm8
.set	one,		%xmm9

	/* copy0 = "expand 32-byte k" */
	movaps		CONSTANTS(%rip),copy0
	/* copy1,copy2 = key */
	movups		0x00(key),copy1
	movups		0x10(key),copy2
	/* copy3 = counter || zero nonce (movq clears the upper 64 bits) */
	movq		0x00(counter),copy3
	/* one = 1 || 0 */
	movq		$1,%rax
	movq		%rax,one

	/* One iteration per 64-byte output block; copy0..copy3 hold the input rows. */
.Lblock:
	/* state0,state1,state2,state3 = copy0,copy1,copy2,copy3 */
	movdqa		copy0,state0
	movdqa		copy1,state1
	movdqa		copy2,state2
	movdqa		copy3,state3

	/* 10 double rounds (column round + diagonal round) = 20 rounds */
	movb		$10,i
.Lpermute:
	/*
	 * Column round: each xmm register is one row of the 4x4 state, so the
	 * four quarter-rounds on the columns run in parallel across the lanes.
	 * SSE2 has no vector rotate, hence the shift-left/shift-right/or
	 * sequence through temp.
	 */

	/* state0 += state1, state3 = rotl32(state3 ^ state0, 16) */
	paddd		state1,state0
	pxor		state0,state3
	movdqa		state3,temp
	pslld		$16,temp
	psrld		$16,state3
	por		temp,state3

	/* state2 += state3, state1 = rotl32(state1 ^ state2, 12) */
	paddd		state3,state2
	pxor		state2,state1
	movdqa		state1,temp
	pslld		$12,temp
	psrld		$20,state1
	por		temp,state1

	/* state0 += state1, state3 = rotl32(state3 ^ state0, 8) */
	paddd		state1,state0
	pxor		state0,state3
	movdqa		state3,temp
	pslld		$8,temp
	psrld		$24,state3
	por		temp,state3

	/* state2 += state3, state1 = rotl32(state1 ^ state2, 7) */
	paddd		state3,state2
	pxor		state2,state1
	movdqa		state1,temp
	pslld		$7,temp
	psrld		$25,state1
	por		temp,state1

	/* Rotate rows 1..3 so that the diagonals line up as columns. */

	/* state1[0,1,2,3] = state1[1,2,3,0] */
	pshufd		$0x39,state1,state1
	/* state2[0,1,2,3] = state2[2,3,0,1] */
	pshufd		$0x4e,state2,state2
	/* state3[0,1,2,3] = state3[3,0,1,2] */
	pshufd		$0x93,state3,state3

	/* Diagonal round: same four steps on the rotated rows. */

	/* state0 += state1, state3 = rotl32(state3 ^ state0, 16) */
	paddd		state1,state0
	pxor		state0,state3
	movdqa		state3,temp
	pslld		$16,temp
	psrld		$16,state3
	por		temp,state3

	/* state2 += state3, state1 = rotl32(state1 ^ state2, 12) */
	paddd		state3,state2
	pxor		state2,state1
	movdqa		state1,temp
	pslld		$12,temp
	psrld		$20,state1
	por		temp,state1

	/* state0 += state1, state3 = rotl32(state3 ^ state0, 8) */
	paddd		state1,state0
	pxor		state0,state3
	movdqa		state3,temp
	pslld		$8,temp
	psrld		$24,state3
	por		temp,state3

	/* state2 += state3, state1 = rotl32(state1 ^ state2, 7) */
	paddd		state3,state2
	pxor		state2,state1
	movdqa		state1,temp
	pslld		$7,temp
	psrld		$25,state1
	por		temp,state1

	/* Undo the row rotation so the rows are back in column order. */

	/* state1[0,1,2,3] = state1[3,0,1,2] */
	pshufd		$0x93,state1,state1
	/* state2[0,1,2,3] = state2[2,3,0,1] */
	pshufd		$0x4e,state2,state2
	/* state3[0,1,2,3] = state3[1,2,3,0] */
	pshufd		$0x39,state3,state3

	decb		i
	jnz		.Lpermute

	/* Feed-forward: add the input rows back in and store the block. */

	/* output0 = state0 + copy0 */
	paddd		copy0,state0
	movups		state0,0x00(output)
	/* output1 = state1 + copy1 */
	paddd		copy1,state1
	movups		state1,0x10(output)
	/* output2 = state2 + copy2 */
	paddd		copy2,state2
	movups		state2,0x20(output)
	/* output3 = state3 + copy3 */
	paddd		copy3,state3
	movups		state3,0x30(output)

	/* ++copy3.counter (64-bit add on the low quadword; the nonce half gets +0) */
	paddq		one,copy3

	/* output += 64, --nblocks */
	addq		$64,output
	decq		nblocks
	jnz		.Lblock

	/* counter = copy3.counter */
	movq		copy3,0x00(counter)

	/* Zero out the potentially sensitive regs, in case nothing uses these again. */
	pxor		state0,state0
	pxor		state1,state1
	pxor		state2,state2
	pxor		state3,state3
	pxor		copy1,copy1
	pxor		copy2,copy2
	pxor		temp,temp

	ret
SYM_FUNC_END(__arch_chacha20_blocks_nostack)