// This code was imported from https://github.com/armfazh/rfc7748_precomputed

// Conventions shared by the macros below:
//  - x, y and z are memory operands. The macros paste a byte offset and
//    '+' in front of them (0+x, 8+x, ...), so each argument must stay a
//    valid operand with such a prefix.
//  - Every macro overwrites the registers listed under "Uses". Operands
//    are re-read or written after those registers change, so an argument
//    must not be addressed through any of them.
//  - Macros taking a flag b read it with TESTQ, i.e. as a register or
//    memory operand valid for that instruction.

// CHECK_BMI2ADX triggers bmi2adx if supported,
// otherwise it falls back to legacy code.
// It compares the byte ·hasBmi2Adx with zero: when it is zero control
// jumps to label and runs legacy, otherwise bmi2adx runs. Both paths end
// in RET, so the expansion terminates the enclosing routine.
// NOTE(review): label is emitted as a jump target, so it presumably has to
// be unique within the enclosing routine - confirm against callers.
// Statements in the body are separated by the escaped newlines, not by
// semicolons, so the line structure must be kept.
#define CHECK_BMI2ADX(label, legacy, bmi2adx) \
    CMPB ·hasBmi2Adx(SB), $0 \
    JE label \
    bmi2adx \
    RET \
    label: \
    legacy \
    RET

// cselect is a conditional move
// if b=1: it copies y into x;
// if b=0: x remains with the same value;
// if b<> 0,1: undefined.
// x and y are four 64-bit words each (offsets 0, 8, 16, 24). Every word
// of x is loaded and stored back regardless of b; only the CMOVQNE in
// between depends on the flag, so there is no branch on b.
// Uses: AX, DX, FLAGS
// Instr: x86_64, cmov
#define cselect(x,y,b) \
    TESTQ b, b /* ZF is clear when b is non-zero; MOVQ/CMOVQNE keep it */ \
    MOVQ 0+x, AX; MOVQ 0+y, DX; CMOVQNE DX, AX; MOVQ AX, 0+x; \
    MOVQ 8+x, AX; MOVQ 8+y, DX; CMOVQNE DX, AX; MOVQ AX, 8+x; \
    MOVQ 16+x, AX; MOVQ 16+y, DX; CMOVQNE DX, AX; MOVQ AX, 16+x; \
    MOVQ 24+x, AX; MOVQ 24+y, DX; CMOVQNE DX, AX; MOVQ AX, 24+x;

// cswap is a conditional swap
// if b=1: x,y <- y,x;
// if b=0: x,y remain with the same values;
// if b<> 0,1: undefined.
// x and y are four 64-bit words each (offsets 0, 8, 16, 24). For each
// word, R8 keeps a copy of the x word so that the pair of CMOVQNE can
// exchange AX and DX; both words are always written back.
// Uses: AX, DX, R8, FLAGS
// Instr: x86_64, cmov
#define cswap(x,y,b) \
    TESTQ b, b /* ZF is clear when b is non-zero; MOVQ/CMOVQNE keep it */ \
    MOVQ 0+x, AX; MOVQ AX, R8; MOVQ 0+y, DX; CMOVQNE DX, AX; CMOVQNE R8, DX; MOVQ AX, 0+x; MOVQ DX, 0+y; \
    MOVQ 8+x, AX; MOVQ AX, R8; MOVQ 8+y, DX; CMOVQNE DX, AX; CMOVQNE R8, DX; MOVQ AX, 8+x; MOVQ DX, 8+y; \
    MOVQ 16+x, AX; MOVQ AX, R8; MOVQ 16+y, DX; CMOVQNE DX, AX; CMOVQNE R8, DX; MOVQ AX, 16+x; MOVQ DX, 16+y; \
    MOVQ 24+x, AX; MOVQ AX, R8; MOVQ 24+y, DX; CMOVQNE DX, AX; CMOVQNE R8, DX; MOVQ AX, 24+x; MOVQ DX, 24+y;

// additionLeg adds x and y and stores in z
// x, y and z are four 64-bit words each (offsets 0, 8, 16, 24).
// The words are added with an ADDQ/ADCQ carry chain. When the chain
// carries out of the top word, 38 is added to the low word and the carry
// is propagated again; a carry out of that second pass adds 38 once more.
// NOTE(review): 38 is presumably 2^256 mod p for p = 2^255-19 - confirm.
// All loads of x and y happen before the first store to z.
// Uses: AX, DX, R8-R11, FLAGS
// Instr: x86_64, cmov
#define additionLeg(z,x,y) \
    MOVL $38, AX; \
    MOVL $0, DX; \
    MOVQ 0+x, R8; ADDQ 0+y, R8; \
    MOVQ 8+x, R9; ADCQ 8+y, R9; \
    MOVQ 16+x, R10; ADCQ 16+y, R10; \
    MOVQ 24+x, R11; ADCQ 24+y, R11; \
    CMOVQCS AX, DX; /* DX = 38 if the sum carried out, else 0 */ \
    ADDQ DX, R8; \
    ADCQ $0, R9; MOVQ R9, 8+z; \
    ADCQ $0, R10; MOVQ R10, 16+z; \
    ADCQ $0, R11; MOVQ R11, 24+z; \
    MOVL $0, DX; /* MOVL, not XORL: CF from the ADCQ above is still needed */ \
    CMOVQCS AX, DX; \
    ADDQ DX, R8; MOVQ R8, 0+z;

// additionAdx adds x and y and stores in z
// Same data flow as additionLeg, but the carry chains use ADCXQ, which
// adds CF only. Each XORL therefore serves two purposes: it zeroes the
// register and clears CF before the chain that follows.
// All loads of x and y happen before the first store to z.
// Uses: AX, DX, R8-R11, FLAGS
// Instr: x86_64, cmov, adx
#define additionAdx(z,x,y) \
    MOVL $38, AX; \
    XORL DX, DX; /* DX = 0 and CF = 0 */ \
    MOVQ 0+x, R8; ADCXQ 0+y, R8; \
    MOVQ 8+x, R9; ADCXQ 8+y, R9; \
    MOVQ 16+x, R10; ADCXQ 16+y, R10; \
    MOVQ 24+x, R11; ADCXQ 24+y, R11; \
    CMOVQCS AX, DX ; /* DX = 38 if the sum carried out, else 0 */ \
    XORL AX, AX; /* AX = 0 and CF = 0 */ \
    ADCXQ DX, R8; \
    ADCXQ AX, R9; MOVQ R9, 8+z; \
    ADCXQ AX, R10; MOVQ R10, 16+z; \
    ADCXQ AX, R11; MOVQ R11, 24+z; \
    MOVL $38, DX; /* MOVL keeps CF from the chain above */ \
    CMOVQCS DX, AX; \
    ADDQ AX, R8; MOVQ R8, 0+z;

// subtraction subtracts y from x and stores in z
// x, y and z are four 64-bit words each (offsets 0, 8, 16, 24).
// The words are subtracted with a SUBQ/SBBQ borrow chain. When the chain
// borrows out of the top word, 38 is subtracted from the low word and the
// borrow is propagated again; a borrow out of that second pass subtracts
// 38 once more.
// All loads of x and y happen before the first store to z.
// Uses: AX, DX, R8-R11, FLAGS
// Instr: x86_64, cmov
#define subtraction(z,x,y) \
    MOVL $38, AX; \
    MOVQ 0+x, R8; SUBQ 0+y, R8; \
    MOVQ 8+x, R9; SBBQ 8+y, R9; \
    MOVQ 16+x, R10; SBBQ 16+y, R10; \
    MOVQ 24+x, R11; SBBQ 24+y, R11; \
    MOVL $0, DX; /* MOVL, not XORL: the borrow in CF is still needed */ \
    CMOVQCS AX, DX; /* DX = 38 if the difference borrowed, else 0 */ \
    SUBQ DX, R8; \
    SBBQ $0, R9; MOVQ R9, 8+z; \
    SBBQ $0, R10; MOVQ R10, 16+z; \
    SBBQ $0, R11; MOVQ R11, 24+z; \
    MOVL $0, DX; \
    CMOVQCS AX, DX; \
    SUBQ DX, R8; MOVQ R8, 0+z;

// integerMulAdx multiplies x and y and stores in z
// x and y are four 64-bit words each; z receives eight 64-bit words
// (offsets 0..56). No reduction is performed here.
// The product is built one row per word of y: DX holds the current y word
// and MULXQ multiplies it by each word of x (low half into the second
// operand, high half into the third) without touching the flags. From the
// second row on, two independent carry chains run in parallel: ADCXQ (CF)
// adds the previous row's accumulators and ADOXQ (OF) adds the low halves
// of the new products. The accumulators alternate between R8-R11 and
// R12-R15 from row to row.
// Words of z are stored while x and y are still being read, so z
// overlapping x or y would change the later reads.
// Uses: AX, DX, R8-R15, FLAGS
// Instr: x86_64, bmi2, adx
#define integerMulAdx(z,x,y) \
    MOVQ 0+y, DX; XORL AX, AX; /* row 0: DX = y[0]; XORL clears CF and OF */ \
    MULXQ 0+x, AX, R8; MOVQ AX, 0+z; \
    MULXQ 8+x, AX, R9; ADCXQ AX, R8; \
    MULXQ 16+x, AX, R10; ADCXQ AX, R9; \
    MULXQ 24+x, AX, R11; ADCXQ AX, R10; \
    MOVL $0, AX;;;;;;;;; ADCXQ AX, R11; \
    MOVQ 8+y, DX; XORL AX, AX; /* row 1: DX = y[1]; XORL clears CF and OF */ \
    MULXQ 0+x, AX, R12; ADCXQ R8, AX; MOVQ AX, 8+z; \
    MULXQ 8+x, AX, R13; ADCXQ R9, R12; ADOXQ AX, R12; \
    MULXQ 16+x, AX, R14; ADCXQ R10, R13; ADOXQ AX, R13; \
    MULXQ 24+x, AX, R15; ADCXQ R11, R14; ADOXQ AX, R14; \
    MOVL $0, AX;;;;;;;;; ADCXQ AX, R15; ADOXQ AX, R15; \
    MOVQ 16+y, DX; XORL AX, AX; /* row 2: DX = y[2]; XORL clears CF and OF */ \
    MULXQ 0+x, AX, R8; ADCXQ R12, AX; MOVQ AX, 16+z; \
    MULXQ 8+x, AX, R9; ADCXQ R13, R8; ADOXQ AX, R8; \
    MULXQ 16+x, AX, R10; ADCXQ R14, R9; ADOXQ AX, R9; \
    MULXQ 24+x, AX, R11; ADCXQ R15, R10; ADOXQ AX, R10; \
    MOVL $0, AX;;;;;;;;; ADCXQ AX, R11; ADOXQ AX, R11; \
    MOVQ 24+y, DX; XORL AX, AX; /* row 3: DX = y[3]; XORL clears CF and OF */ \
    MULXQ 0+x, AX, R12; ADCXQ R8, AX; MOVQ AX, 24+z; \
    MULXQ 8+x, AX, R13; ADCXQ R9, R12; ADOXQ AX, R12; MOVQ R12, 32+z; \
    MULXQ 16+x, AX, R14; ADCXQ R10, R13; ADOXQ AX, R13; MOVQ R13, 40+z; \
    MULXQ 24+x, AX, R15; ADCXQ R11, R14; ADOXQ AX, R14; MOVQ R14, 48+z; \
    MOVL $0, AX;;;;;;;;; ADCXQ AX, R15; ADOXQ AX, R15; MOVQ R15, 56+z;

// integerMulLeg multiplies x and y and stores in z
// x and y are four 64-bit words each; z receives eight 64-bit words
// (offsets 0..56). No reduction is performed here.
// The product is built one row per word of y: R8 holds the current y word
// and MULQ R8 leaves each 128-bit partial product in DX:AX. The upper
// words of every row are parked in z and read back (ADCQ n+z, ...) when
// the next row is added; R15 carries the word that becomes final once the
// next row's lowest partial product has been added to it.
// Words of z are stored while x and y are still being read, so z
// overlapping x or y would change the later reads.
// Uses: AX, DX, R8-R15, FLAGS
// Instr: x86_64
#define integerMulLeg(z,x,y) \
    MOVQ 0+y, R8; /* row 0: R8 = y[0] */ \
    MOVQ 0+x, AX; MULQ R8; MOVQ AX, 0+z; MOVQ DX, R15; \
    MOVQ 8+x, AX; MULQ R8; MOVQ AX, R13; MOVQ DX, R10; \
    MOVQ 16+x, AX; MULQ R8; MOVQ AX, R14; MOVQ DX, R11; \
    MOVQ 24+x, AX; MULQ R8; \
    ADDQ R13, R15; \
    ADCQ R14, R10; MOVQ R10, 16+z; \
    ADCQ AX, R11; MOVQ R11, 24+z; \
    ADCQ $0, DX; MOVQ DX, 32+z; \
    MOVQ 8+y, R8; /* row 1: R8 = y[1] */ \
    MOVQ 0+x, AX; MULQ R8; MOVQ AX, R12; MOVQ DX, R9; \
    MOVQ 8+x, AX; MULQ R8; MOVQ AX, R13; MOVQ DX, R10; \
    MOVQ 16+x, AX; MULQ R8; MOVQ AX, R14; MOVQ DX, R11; \
    MOVQ 24+x, AX; MULQ R8; \
    ADDQ R12, R15; MOVQ R15, 8+z; \
    ADCQ R13, R9; \
    ADCQ R14, R10; \
    ADCQ AX, R11; \
    ADCQ $0, DX; \
    ADCQ 16+z, R9; MOVQ R9, R15; \
    ADCQ 24+z, R10; MOVQ R10, 24+z; \
    ADCQ 32+z, R11; MOVQ R11, 32+z; \
    ADCQ $0, DX; MOVQ DX, 40+z; \
    MOVQ 16+y, R8; /* row 2: R8 = y[2] */ \
    MOVQ 0+x, AX; MULQ R8; MOVQ AX, R12; MOVQ DX, R9; \
    MOVQ 8+x, AX; MULQ R8; MOVQ AX, R13; MOVQ DX, R10; \
    MOVQ 16+x, AX; MULQ R8; MOVQ AX, R14; MOVQ DX, R11; \
    MOVQ 24+x, AX; MULQ R8; \
    ADDQ R12, R15; MOVQ R15, 16+z; \
    ADCQ R13, R9; \
    ADCQ R14, R10; \
    ADCQ AX, R11; \
    ADCQ $0, DX; \
    ADCQ 24+z, R9; MOVQ R9, R15; \
    ADCQ 32+z, R10; MOVQ R10, 32+z; \
    ADCQ 40+z, R11; MOVQ R11, 40+z; \
    ADCQ $0, DX; MOVQ DX, 48+z; \
    MOVQ 24+y, R8; /* row 3: R8 = y[3] */ \
    MOVQ 0+x, AX; MULQ R8; MOVQ AX, R12; MOVQ DX, R9; \
    MOVQ 8+x, AX; MULQ R8; MOVQ AX, R13; MOVQ DX, R10; \
    MOVQ 16+x, AX; MULQ R8; MOVQ AX, R14; MOVQ DX, R11; \
    MOVQ 24+x, AX; MULQ R8; \
    ADDQ R12, R15; MOVQ R15, 24+z; \
    ADCQ R13, R9; \
    ADCQ R14, R10; \
    ADCQ AX, R11; \
    ADCQ $0, DX; \
    ADCQ 32+z, R9; MOVQ R9, 32+z; \
    ADCQ 40+z, R10; MOVQ R10, 40+z; \
    ADCQ 48+z, R11; MOVQ R11, 48+z; \
    ADCQ $0, DX; MOVQ DX, 56+z;

// integerSqrLeg squares x and stores in z
// x is four 64-bit words (A[0]..A[3] in the comments below); z receives
// eight 64-bit words (offsets 0..56). No reduction is performed here.
// The macro works in four steps, separated by the empty continuation lines:
//   1. compute the cross products A[i]*A[j] for i<j and sum them
//      into R9..R15;
//   2. double that sum with a chain of SHLQ $1 shifts, storing words 1..7
//      of z;
//   3. compute the squares A[i]^2;
//   4. add the squares to the doubled cross products already stored in z.
// Words of z are stored in step 2 and x is read again in step 3, so z
// overlapping x would change the later reads.
// Uses: AX, CX, DX, R8-R15, FLAGS
// Instr: x86_64
#define integerSqrLeg(z,x) \
    MOVQ 0+x, R8; \
    MOVQ 8+x, AX; MULQ R8; MOVQ AX, R9; MOVQ DX, R10; /* A[0]*A[1] */ \
    MOVQ 16+x, AX; MULQ R8; MOVQ AX, R14; MOVQ DX, R11; /* A[0]*A[2] */ \
    MOVQ 24+x, AX; MULQ R8; MOVQ AX, R15; MOVQ DX, R12; /* A[0]*A[3] */ \
    MOVQ 24+x, R8; \
    MOVQ 8+x, AX; MULQ R8; MOVQ AX, CX; MOVQ DX, R13; /* A[3]*A[1] */ \
    MOVQ 16+x, AX; MULQ R8; /* A[3]*A[2] */ \
    \
    ADDQ R14, R10;\
    ADCQ R15, R11; MOVL $0, R15;\
    ADCQ CX, R12;\
    ADCQ AX, R13;\
    ADCQ $0, DX; MOVQ DX, R14;\
    MOVQ 8+x, AX; MULQ 16+x;\
    \
    ADDQ AX, R11;\
    ADCQ DX, R12;\
    ADCQ $0, R13;\
    ADCQ $0, R14;\
    ADCQ $0, R15;\
    \
    SHLQ $1, R14, R15; MOVQ R15, 56+z;\
    SHLQ $1, R13, R14; MOVQ R14, 48+z;\
    SHLQ $1, R12, R13; MOVQ R13, 40+z;\
    SHLQ $1, R11, R12; MOVQ R12, 32+z;\
    SHLQ $1, R10, R11; MOVQ R11, 24+z;\
    SHLQ $1, R9, R10; MOVQ R10, 16+z;\
    SHLQ $1, R9; MOVQ R9, 8+z;\
    \
    MOVQ 0+x,AX; MULQ AX; MOVQ AX, 0+z; MOVQ DX, R9;\
    MOVQ 8+x,AX; MULQ AX; MOVQ AX, R10; MOVQ DX, R11;\
    MOVQ 16+x,AX; MULQ AX; MOVQ AX, R12; MOVQ DX, R13;\
    MOVQ 24+x,AX; MULQ AX; MOVQ AX, R14; MOVQ DX, R15;\
    \
    ADDQ 8+z, R9; MOVQ R9, 8+z;\
    ADCQ 16+z, R10; MOVQ R10, 16+z;\
    ADCQ 24+z, R11; MOVQ R11, 24+z;\
    ADCQ 32+z, R12; MOVQ R12, 32+z;\
    ADCQ 40+z, R13; MOVQ R13, 40+z;\
    ADCQ 48+z, R14; MOVQ R14, 48+z;\
    ADCQ 56+z, R15; MOVQ R15, 56+z;

// integerSqrAdx squares x and stores in z
// x is four 64-bit words (A[0]..A[3] in the comments below); z receives
// eight 64-bit words (offsets 0..56). No reduction is performed here.
// The cross products A[i]*A[j] for i<j are accumulated in R8..R14 with an
// ADCXQ chain. After the second XORL (which clears CF and OF) two chains
// run interleaved: ADOXQ (OF) adds A[2]*A[1] into the accumulators while
// ADCXQ reg, reg (CF) doubles them. Finally each square A[i]^2 is computed
// with MULXQ, which leaves the flags alone, and added with an ADDQ/ADCQ
// chain as the words are stored.
// Words of z are stored while x is still being read, so z overlapping x
// would change the later reads.
// Uses: AX, CX, DX, R8-R15, FLAGS
// Instr: x86_64, bmi2, adx
#define integerSqrAdx(z,x) \
    MOVQ 0+x, DX; /* A[0] */ \
    MULXQ 8+x, R8, R14; /* A[1]*A[0] */ XORL R15, R15; \
    MULXQ 16+x, R9, R10; /* A[2]*A[0] */ ADCXQ R14, R9; \
    MULXQ 24+x, AX, CX; /* A[3]*A[0] */ ADCXQ AX, R10; \
    MOVQ 24+x, DX; /* A[3] */ \
    MULXQ 8+x, R11, R12; /* A[1]*A[3] */ ADCXQ CX, R11; \
    MULXQ 16+x, AX, R13; /* A[2]*A[3] */ ADCXQ AX, R12; \
    MOVQ 8+x, DX; /* A[1] */ ADCXQ R15, R13; \
    MULXQ 16+x, AX, CX; /* A[2]*A[1] */ MOVL $0, R14; \
    ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ADCXQ R15, R14; \
    XORL R15, R15; \
    ADOXQ AX, R10; ADCXQ R8, R8; \
    ADOXQ CX, R11; ADCXQ R9, R9; \
    ADOXQ R15, R12; ADCXQ R10, R10; \
    ADOXQ R15, R13; ADCXQ R11, R11; \
    ADOXQ R15, R14; ADCXQ R12, R12; \
    ;;;;;;;;;;;;;;; ADCXQ R13, R13; \
    ;;;;;;;;;;;;;;; ADCXQ R14, R14; \
    MOVQ 0+x, DX; MULXQ DX, AX, CX; /* A[0]^2 */ \
    ;;;;;;;;;;;;;;; MOVQ AX, 0+z; \
    ADDQ CX, R8; MOVQ R8, 8+z; \
    MOVQ 8+x, DX; MULXQ DX, AX, CX; /* A[1]^2 */ \
    ADCQ AX, R9; MOVQ R9, 16+z; \
    ADCQ CX, R10; MOVQ R10, 24+z; \
    MOVQ 16+x, DX; MULXQ DX, AX, CX; /* A[2]^2 */ \
    ADCQ AX, R11; MOVQ R11, 32+z; \
    ADCQ CX, R12; MOVQ R12, 40+z; \
    MOVQ 24+x, DX; MULXQ DX, AX, CX; /* A[3]^2 */ \
    ADCQ AX, R13; MOVQ R13, 48+z; \
    ADCQ CX, R14; MOVQ R14, 56+z;

// reduceFromDouble finds z congruent to x modulo p such that 0