// Source: mirror of https://github.com/gogits/gogs.git (249 lines, 7.9 KiB)
// Copyright 2016 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// +build s390x

#include "textflag.h"

// Vector register range containing CRC-32 constants.
//
// vectorizedBody fills these six registers with one VLM from the constant
// pool addressed by R5, so they must remain a contiguous range (V9..V14) in
// the same order as the 16-byte entries of the pool:
//
//	CONST_PERM_LE2BE  pool bytes  0-15  byte-permute (LE-to-BE) mask
//	CONST_R2R1        pool bytes 16-31  R2 (leftmost), R1 (rightmost)
//	CONST_R4R3        pool bytes 32-47  R4 (leftmost), R3 (rightmost)
//	CONST_R5          pool bytes 48-63  0  (leftmost), R5 (rightmost)
//	CONST_RU_POLY     pool bytes 64-79  0  (leftmost), u' (rightmost)
//	CONST_CRC_POLY    pool bytes 80-95  0  (leftmost), P'(x) << 1 (rightmost)
#define CONST_PERM_LE2BE V9
#define CONST_R2R1       V10
#define CONST_R4R3       V11
#define CONST_R5         V12
#define CONST_RU_POLY    V13
#define CONST_CRC_POLY   V14

// The CRC-32 constant block contains reduction constants to fold and
// process particular chunks of the input data stream in parallel.
//
// Note that the constant definitions below are extended in order to compute
// intermediate results with a single VECTOR GALOIS FIELD MULTIPLY instruction.
// The rightmost doubleword can be 0 to prevent contribution to the result or
// can be multiplied by 1 to perform an XOR without the need for a separate
// VECTOR EXCLUSIVE OR instruction.
//
// The polynomials used are bit-reflected:
//
//            IEEE: P'(x) = 0x0edb88320
//      Castagnoli: P'(x) = 0x082f63b78

// IEEE polynomial constants.
//
// The pool is laid out as six 16-byte vectors, in the order in which the
// VLM at the top of vectorizedBody loads them into V9..V14. Each vector is
// written as two doublewords, leftmost doubleword first.
DATA ·crcleconskp+0(SB)/8, $0x0F0E0D0C0B0A0908  // LE-to-BE mask
DATA ·crcleconskp+8(SB)/8, $0x0706050403020100
DATA ·crcleconskp+16(SB)/8, $0x00000001c6e41596 // R2
DATA ·crcleconskp+24(SB)/8, $0x0000000154442bd4 // R1
DATA ·crcleconskp+32(SB)/8, $0x00000000ccaa009e // R4
DATA ·crcleconskp+40(SB)/8, $0x00000001751997d0 // R3
DATA ·crcleconskp+48(SB)/8, $0x0000000000000000
DATA ·crcleconskp+56(SB)/8, $0x0000000163cd6124 // R5
DATA ·crcleconskp+64(SB)/8, $0x0000000000000000
DATA ·crcleconskp+72(SB)/8, $0x00000001F7011641 // u'
DATA ·crcleconskp+80(SB)/8, $0x0000000000000000
DATA ·crcleconskp+88(SB)/8, $0x00000001DB710641 // P'(x) << 1

// The DATA directives above initialize bytes 0-95; the declared size of 144
// is kept unchanged so the symbol's size stays as it was.
GLOBL ·crcleconskp(SB), RODATA, $144

// Castagnoli polynomial constants.
//
// The pool is laid out as six 16-byte vectors, in the order in which the
// VLM at the top of vectorizedBody loads them into V9..V14. Each vector is
// written as two doublewords, leftmost doubleword first.
DATA ·crccleconskp+0(SB)/8, $0x0F0E0D0C0B0A0908  // LE-to-BE mask
DATA ·crccleconskp+8(SB)/8, $0x0706050403020100
DATA ·crccleconskp+16(SB)/8, $0x000000009e4addf8 // R2
DATA ·crccleconskp+24(SB)/8, $0x00000000740eef02 // R1
DATA ·crccleconskp+32(SB)/8, $0x000000014cd00bd6 // R4
DATA ·crccleconskp+40(SB)/8, $0x00000000f20c0dfe // R3
DATA ·crccleconskp+48(SB)/8, $0x0000000000000000
DATA ·crccleconskp+56(SB)/8, $0x00000000dd45aab8 // R5
DATA ·crccleconskp+64(SB)/8, $0x0000000000000000
DATA ·crccleconskp+72(SB)/8, $0x00000000dea713f1 // u'
DATA ·crccleconskp+80(SB)/8, $0x0000000000000000
DATA ·crccleconskp+88(SB)/8, $0x0000000105ec76f0 // P'(x) << 1

// The DATA directives above initialize bytes 0-95; the declared size of 144
// is kept unchanged so the symbol's size stays as it was.
GLOBL ·crccleconskp(SB), RODATA, $144

// func hasVectorFacility() bool
//
// hasVectorFacility reports whether the vector instructions used by the
// CRC-32 routines in this file may be executed. It works in two steps:
//
//  1. STFLE stores the facility list into a 24-byte, zeroed stack buffer and
//     the bit selected by mask 0x40 in byte 16 of that list (facility bit
//     129) is tested.
//  2. The value 0xF is written to and read back from byte 0 of V16; the
//     result is true only if the value survives the round trip.
//
// Frame: 24 bytes of locals (the facility list), 1 byte of result.
// R0 is used to pass the doubleword count to STFLE and is zeroed again
// afterwards; R1 and V16 are clobbered.
TEXT ·hasVectorFacility(SB), NOSPLIT, $24-1
	MOVD	$x-24(SP), R1     // R1 = address of the 24-byte buffer
	XC	$24, 0(R1), 0(R1) // clear the storage
	MOVD	$2, R0            // R0 is the number of double words stored -1
	WORD	$0xB2B01000       // STFLE 0(R1)
	XOR	R0, R0            // reset the value of R0
	MOVBZ	z-8(SP), R1       // byte 16 of the buffer (-24 + 16 = -8)
	AND	$0x40, R1
	BEQ	novector

vectorinstalled:
	// check if the vector instruction has been enabled
	VLEIB	$0, $0xF, V16
	VLGVB	$0, V16, R1
	CMPBNE	R1, $0xF, novector
	MOVB	$1, ret+0(FP) // have vx
	RET

novector:
	MOVB	$0, ret+0(FP) // no vx
	RET

// The CRC-32 function(s) use these calling conventions:
//
// Parameters:
//
//      R2:    Initial CRC value, typically ~0; and final CRC (return) value.
//      R3:    Input buffer pointer, performance might be improved if the
//             buffer is on a doubleword boundary.
//      R4:    Length of the buffer, must be 64 bytes or greater.
//
// Register usage:
//
//      R5:     CRC-32 constant pool base pointer.
//      V0:     Initial CRC value and intermediate constants and results.
//      V1..V4: Data for CRC computation.
//      V5..V8: Next data chunks that are fetched from the input buffer.
//
//      V9..V14: CRC-32 constants.

// func vectorizedIEEE(crc uint32, p []byte) uint32
//
// vectorizedIEEE loads its arguments into the registers vectorizedBody
// expects (R2 = crc, R3 = data pointer, R4 = length), points R5 at the IEEE
// constant pool and branches to vectorizedBody. Because the transfer is a
// branch rather than a call, vectorizedBody stores the result into this
// function's ret+32(FP) and returns directly to the Go caller.
TEXT ·vectorizedIEEE(SB), NOSPLIT, $0
	MOVWZ	crc+0(FP), R2    // R2 stores the CRC value
	MOVD	p+8(FP), R3      // data pointer
	MOVD	p_len+16(FP), R4 // len(p)

	// R5: crc-32 constant pool base pointer, constant is used to reduce crc
	MOVD	$·crcleconskp(SB), R5
	BR	vectorizedBody<>(SB)

// func vectorizedCastagnoli(crc uint32, p []byte) uint32
//
// vectorizedCastagnoli loads its arguments into the registers vectorizedBody
// expects (R2 = crc, R3 = data pointer, R4 = length), points R5 at the
// Castagnoli constant pool and branches to vectorizedBody. Because the
// transfer is a branch rather than a call, vectorizedBody stores the result
// into this function's ret+32(FP) and returns directly to the Go caller.
TEXT ·vectorizedCastagnoli(SB), NOSPLIT, $0
	MOVWZ	crc+0(FP), R2    // R2 stores the CRC value
	MOVD	p+8(FP), R3      // data pointer
	MOVD	p_len+16(FP), R4 // len(p)

	// R5: crc-32 constant pool base pointer, constant is used to reduce crc
	MOVD	$·crccleconskp(SB), R5
	BR	vectorizedBody<>(SB)

// vectorizedBody is the shared implementation behind vectorizedIEEE and
// vectorizedCastagnoli. Both reach it with BR, so it runs with the entry
// point's argument frame and writes the result to that frame's ret+32(FP).
//
// In:
//	R2: CRC value (inverted on entry and again before being returned)
//	R3: input buffer pointer
//	R4: length of the buffer in bytes; a length below 64 crashes on purpose
//	R5: base of the 96-byte constant pool loaded into V9..V14
// Out:
//	ret+32(FP): resulting CRC
// Clobbers R2, R3, R4 and V0..V14.
//
// The buffer is consumed in 64-byte and then 16-byte steps; once fewer than
// 16 bytes remain the routine goes straight to the final fold and those
// bytes are not read.
// NOTE(review): presumably the Go caller only passes lengths that are a
// multiple of 16 and handles any tail itself - confirm against the caller.
TEXT vectorizedBody<>(SB), NOSPLIT, $0
	XOR	$0xffffffff, R2 // NOTW R2
	VLM	0(R5), CONST_PERM_LE2BE, CONST_CRC_POLY

	// Load the initial CRC value into the rightmost word of V0
	VZERO	V0
	VLVGF	$3, R2, V0

	// Crash if the input size is less than 64-bytes.
	CMP	R4, $64
	BLT	crash

	// Load a 64-byte data chunk and XOR with CRC
	VLM	0(R3), V1, V4 // 64-bytes into V1..V4

	// Reflect the data if the CRC operation is in the bit-reflected domain
	VPERM	V1, V1, CONST_PERM_LE2BE, V1
	VPERM	V2, V2, CONST_PERM_LE2BE, V2
	VPERM	V3, V3, CONST_PERM_LE2BE, V3
	VPERM	V4, V4, CONST_PERM_LE2BE, V4

	VX	V0, V1, V1 // V1 ^= CRC
	ADD	$64, R3    // BUF = BUF + 64
	ADD	$-64, R4   // LEN = LEN - 64

	// Check remaining buffer size and jump to proper folding method
	CMP	R4, $64
	BLT	less_than_64bytes

fold_64bytes_loop:
	// Load the next 64-byte data chunk into V5 to V8
	VLM	0(R3), V5, V8
	VPERM	V5, V5, CONST_PERM_LE2BE, V5
	VPERM	V6, V6, CONST_PERM_LE2BE, V6
	VPERM	V7, V7, CONST_PERM_LE2BE, V7
	VPERM	V8, V8, CONST_PERM_LE2BE, V8

	// Perform a GF(2) multiplication of the doublewords in V1 with
	// the reduction constants in CONST_R2R1. The intermediate result is
	// then folded (accumulated) with the next data chunk in V5 and
	// stored in V1. Repeat this step for the register contents
	// in V2, V3, and V4 respectively.
	VGFMAG	CONST_R2R1, V1, V5, V1
	VGFMAG	CONST_R2R1, V2, V6, V2
	VGFMAG	CONST_R2R1, V3, V7, V3
	VGFMAG	CONST_R2R1, V4, V8, V4

	// Adjust buffer pointer and length for next loop
	ADD	$64, R3  // BUF = BUF + 64
	ADD	$-64, R4 // LEN = LEN - 64

	CMP	R4, $64
	BGE	fold_64bytes_loop

less_than_64bytes:
	// Fold V1 to V4 into a single 128-bit value in V1
	VGFMAG	CONST_R4R3, V1, V2, V1
	VGFMAG	CONST_R4R3, V1, V3, V1
	VGFMAG	CONST_R4R3, V1, V4, V1

	// Check whether to continue with 64-bit folding
	CMP	R4, $16
	BLT	final_fold

fold_16bytes_loop:
	VL	0(R3), V2 // Load next data chunk
	VPERM	V2, V2, CONST_PERM_LE2BE, V2

	VGFMAG	CONST_R4R3, V1, V2, V1 // Fold next data chunk

	// Adjust buffer pointer and size for folding next data chunk
	ADD	$16, R3
	ADD	$-16, R4

	// Process remaining data chunks
	CMP	R4, $16
	BGE	fold_16bytes_loop

final_fold:
	// From here on V9 (CONST_PERM_LE2BE) is reused as the shift-count
	// register for VSRLB; no VPERM is executed after this point.
	// Byte 7 = 0x40 requests a shift by a doubleword, so V0 receives
	// CONST_R4R3 with R4 moved into the rightmost doubleword. The leftmost
	// doubleword of V0 is then set to 1.
	VLEIB	$7, $0x40, V9
	VSRLB	V9, CONST_R4R3, V0
	VLEIG	$0, $1, V0

	// GF(2) multiply the doublewords of V1 with those of V0: the rightmost
	// doubleword is multiplied by R4, the leftmost by 1, and the two
	// products are XORed into V1.
	VGFMG	V0, V1, V1

	VLEIB	$7, $0x20, V9           // Shift by words
	VSRLB	V9, V1, V2              // Store remaining bits in V2
	VUPLLF	V1, V1                  // Split rightmost doubleword
	VGFMAG	CONST_R5, V1, V2, V1    // V1 = (V1 * R5) XOR V2

	// The input values to the Barrett reduction are the degree-63 polynomial
	// in V1 (R(x)), degree-32 generator polynomial, and the reduction
	// constant u. The Barrett reduction result is the CRC value of R(x) mod
	// P(x).
	//
	// The Barrett reduction algorithm is defined as:
	//
	//    1. T1(x) = floor( R(x) / x^32 ) GF2MUL u
	//    2. T2(x) = floor( T1(x) / x^32 ) GF2MUL P(x)
	//    3. C(x)  = R(x) XOR T2(x) mod x^32
	//
	// Note: To compensate the division by x^32, use the vector unpack
	// instruction to move the leftmost word into the leftmost doubleword
	// of the vector register. The rightmost doubleword is multiplied
	// with zero to not contribute to the intermediate results.

	// T1(x) = floor( R(x) / x^32 ) GF2MUL u
	VUPLLF	V1, V2
	VGFMG	CONST_RU_POLY, V2, V2

	// Compute the GF(2) product of the CRC polynomial in CONST_CRC_POLY
	// with T1(x) in V2 and XOR the intermediate result, T2(x), with the
	// value in V1. The final result is read from word element 2 of V2 below.
	VUPLLF	V2, V2
	VGFMAG	CONST_CRC_POLY, V2, V1, V2

done:
	VLGVF	$2, V2, R2
	XOR	$0xffffffff, R2 // NOTW R2
	MOVWZ	R2, ret+32(FP)
	RET

crash:
	// Deliberate store through R0 to fault the program.
	// NOTE(review): this relies on R0 holding 0 here - confirm the platform
	// convention.
	MOVD	$0, (R0) // input size is less than 64-bytes