mirror of https://github.com/gogits/gogs.git
You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
213 lines
3.5 KiB
213 lines
3.5 KiB
//+build !noasm |
|
//+build !appengine |
|
|
|
// Copyright 2015, Klaus Post, see LICENSE for details. |
|
|
|
// func crc32sse(a []byte) uint32 |
|
TEXT ·crc32sse(SB), 4, $0 |
|
MOVQ a+0(FP), R10 |
|
XORQ BX, BX |
|
|
|
// CRC32 dword (R10), EBX |
|
BYTE $0xF2; BYTE $0x41; BYTE $0x0f |
|
BYTE $0x38; BYTE $0xf1; BYTE $0x1a |
|
|
|
MOVL BX, ret+24(FP) |
|
RET |
|
|
|
// func crc32sseAll(a []byte, dst []uint32) |
|
TEXT ·crc32sseAll(SB), 4, $0 |
|
MOVQ a+0(FP), R8 // R8: src |
|
MOVQ a_len+8(FP), R10 // input length |
|
MOVQ dst+24(FP), R9 // R9: dst |
|
SUBQ $4, R10 |
|
JS end |
|
JZ one_crc |
|
MOVQ R10, R13 |
|
SHRQ $2, R10 // len/4 |
|
ANDQ $3, R13 // len&3 |
|
XORQ BX, BX |
|
ADDQ $1, R13 |
|
TESTQ R10, R10 |
|
JZ rem_loop |
|
|
|
crc_loop: |
|
MOVQ (R8), R11 |
|
XORQ BX, BX |
|
XORQ DX, DX |
|
XORQ DI, DI |
|
MOVQ R11, R12 |
|
SHRQ $8, R11 |
|
MOVQ R12, AX |
|
MOVQ R11, CX |
|
SHRQ $16, R12 |
|
SHRQ $16, R11 |
|
MOVQ R12, SI |
|
|
|
// CRC32 EAX, EBX |
|
BYTE $0xF2; BYTE $0x0f |
|
BYTE $0x38; BYTE $0xf1; BYTE $0xd8 |
|
|
|
// CRC32 ECX, EDX |
|
BYTE $0xF2; BYTE $0x0f |
|
BYTE $0x38; BYTE $0xf1; BYTE $0xd1 |
|
|
|
// CRC32 ESI, EDI |
|
BYTE $0xF2; BYTE $0x0f |
|
BYTE $0x38; BYTE $0xf1; BYTE $0xfe |
|
MOVL BX, (R9) |
|
MOVL DX, 4(R9) |
|
MOVL DI, 8(R9) |
|
|
|
XORQ BX, BX |
|
MOVL R11, AX |
|
|
|
// CRC32 EAX, EBX |
|
BYTE $0xF2; BYTE $0x0f |
|
BYTE $0x38; BYTE $0xf1; BYTE $0xd8 |
|
MOVL BX, 12(R9) |
|
|
|
ADDQ $16, R9 |
|
ADDQ $4, R8 |
|
XORQ BX, BX |
|
SUBQ $1, R10 |
|
JNZ crc_loop |
|
|
|
rem_loop: |
|
MOVL (R8), AX |
|
|
|
// CRC32 EAX, EBX |
|
BYTE $0xF2; BYTE $0x0f |
|
BYTE $0x38; BYTE $0xf1; BYTE $0xd8 |
|
|
|
MOVL BX, (R9) |
|
ADDQ $4, R9 |
|
ADDQ $1, R8 |
|
XORQ BX, BX |
|
SUBQ $1, R13 |
|
JNZ rem_loop |
|
|
|
end: |
|
RET |
|
|
|
one_crc: |
|
MOVQ $1, R13 |
|
XORQ BX, BX |
|
JMP rem_loop |
|
|
|
// func matchLenSSE4(a, b []byte, max int) int |
|
TEXT ·matchLenSSE4(SB), 4, $0 |
|
MOVQ a_base+0(FP), SI |
|
MOVQ b_base+24(FP), DI |
|
MOVQ DI, DX |
|
MOVQ max+48(FP), CX |
|
|
|
cmp8: |
|
// As long as we are 8 or more bytes before the end of max, we can load and |
|
// compare 8 bytes at a time. If those 8 bytes are equal, repeat. |
|
CMPQ CX, $8 |
|
JLT cmp1 |
|
MOVQ (SI), AX |
|
MOVQ (DI), BX |
|
CMPQ AX, BX |
|
JNE bsf |
|
ADDQ $8, SI |
|
ADDQ $8, DI |
|
SUBQ $8, CX |
|
JMP cmp8 |
|
|
|
bsf: |
|
// If those 8 bytes were not equal, XOR the two 8 byte values, and return |
|
// the index of the first byte that differs. The BSF instruction finds the |
|
// least significant 1 bit, the amd64 architecture is little-endian, and |
|
// the shift by 3 converts a bit index to a byte index. |
|
XORQ AX, BX |
|
BSFQ BX, BX |
|
SHRQ $3, BX |
|
ADDQ BX, DI |
|
|
|
// Subtract off &b[0] to convert from &b[ret] to ret, and return. |
|
SUBQ DX, DI |
|
MOVQ DI, ret+56(FP) |
|
RET |
|
|
|
cmp1: |
|
// In the slices' tail, compare 1 byte at a time. |
|
CMPQ CX, $0 |
|
JEQ matchLenEnd |
|
MOVB (SI), AX |
|
MOVB (DI), BX |
|
CMPB AX, BX |
|
JNE matchLenEnd |
|
ADDQ $1, SI |
|
ADDQ $1, DI |
|
SUBQ $1, CX |
|
JMP cmp1 |
|
|
|
matchLenEnd: |
|
// Subtract off &b[0] to convert from &b[ret] to ret, and return. |
|
SUBQ DX, DI |
|
MOVQ DI, ret+56(FP) |
|
RET |
|
|
|
// func histogram(b []byte, h []int32) |
|
TEXT ·histogram(SB), 4, $0 |
|
MOVQ b+0(FP), SI // SI: &b |
|
MOVQ b_len+8(FP), R9 // R9: len(b) |
|
MOVQ h+24(FP), DI // DI: Histogram |
|
MOVQ R9, R8 |
|
SHRQ $3, R8 |
|
JZ hist1 |
|
XORQ R11, R11 |
|
|
|
loop_hist8: |
|
MOVQ (SI), R10 |
|
|
|
MOVB R10, R11 |
|
INCL (DI)(R11*4) |
|
SHRQ $8, R10 |
|
|
|
MOVB R10, R11 |
|
INCL (DI)(R11*4) |
|
SHRQ $8, R10 |
|
|
|
MOVB R10, R11 |
|
INCL (DI)(R11*4) |
|
SHRQ $8, R10 |
|
|
|
MOVB R10, R11 |
|
INCL (DI)(R11*4) |
|
SHRQ $8, R10 |
|
|
|
MOVB R10, R11 |
|
INCL (DI)(R11*4) |
|
SHRQ $8, R10 |
|
|
|
MOVB R10, R11 |
|
INCL (DI)(R11*4) |
|
SHRQ $8, R10 |
|
|
|
MOVB R10, R11 |
|
INCL (DI)(R11*4) |
|
SHRQ $8, R10 |
|
|
|
INCL (DI)(R10*4) |
|
|
|
ADDQ $8, SI |
|
DECQ R8 |
|
JNZ loop_hist8 |
|
|
|
hist1: |
|
ANDQ $7, R9 |
|
JZ end_hist |
|
XORQ R10, R10 |
|
|
|
loop_hist1: |
|
MOVB (SI), R10 |
|
INCL (DI)(R10*4) |
|
INCQ SI |
|
DECQ R9 |
|
JNZ loop_hist1 |
|
|
|
end_hist: |
|
RET
|
|
|