253 lines
7.5 KiB
C++
253 lines
7.5 KiB
C++
|
/*
|
||
|
* Copyright 2013 The LibYuv Project Authors. All rights reserved.
|
||
|
*
|
||
|
* Use of this source code is governed by a BSD-style license
|
||
|
* that can be found in the LICENSE file in the root of the source
|
||
|
* tree. An additional intellectual property rights grant can be found
|
||
|
* in the file PATENTS. All contributing project authors may
|
||
|
* be found in the AUTHORS file in the root of the source tree.
|
||
|
*/
|
||
|
|
||
|
#include "libyuv/rotate_row.h"
|
||
|
#include "libyuv/row.h"
|
||
|
|
||
|
#ifdef __cplusplus
|
||
|
namespace libyuv {
|
||
|
extern "C" {
|
||
|
#endif
|
||
|
|
||
|
// This module is for 32-bit x86 builds with Visual C++ and clang-cl.
|
||
|
#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER)
|
||
|
|
||
|
// Transposes a strip of 8 source rows, one 8x8 byte tile per loop pass.
// Each pass reads 8 bytes from each of 8 consecutive source rows and writes
// them as 8 destination rows of 8 bytes, so source column N of the tile
// becomes destination row N.
//
//   src        - first byte of the top source row.
//   src_stride - byte distance between consecutive source rows.
//   dst        - first byte of the first destination row.
//   dst_stride - byte distance between consecutive destination rows.
//   width      - number of source columns to process; 8 are consumed per pass.
//
// The counter is tested at the bottom of the loop, so one full tile is always
// processed and the column count is effectively rounded up to a multiple of 8.
//
// Register roles inside the loop:
//   eax = source cursor         edi = src_stride
//   edx = destination cursor    esi = dst_stride
//   ecx = columns remaining     ebp = source address of the next tile
__declspec(naked) void TransposeWx8_SSSE3(const uint8* src,
                                          int src_stride,
                                          uint8* dst,
                                          int dst_stride,
                                          int width) {
  __asm {
    // Naked function: there is no compiler prologue, so edi/esi/ebp are
    // preserved by hand.
    push      edi
    push      esi
    push      ebp
    // Arguments live above the 3 saved registers (12 bytes) and the return
    // address (4 bytes).
    mov       eax, [esp + 12 + 4]   // src
    mov       edi, [esp + 12 + 8]   // src_stride
    mov       edx, [esp + 12 + 12]  // dst
    mov       esi, [esp + 12 + 16]  // dst_stride
    mov       ecx, [esp + 12 + 20]  // width

    // Round 1: load 8 rows of 8 bytes and interleave neighbouring rows
    // byte-wise. The row loads are interleaved with the shuffles.
    align      4
  tile8x8:
    movq      xmm0, qword ptr [eax]        // row 0
    lea       ebp, [eax + 8]               // where the next tile starts
    movq      xmm1, qword ptr [eax + edi]  // row 1
    lea       eax, [eax + 2 * edi]         // down two rows
    punpcklbw xmm0, xmm1                   // xmm0 = rows 0|1 interleaved
    movq      xmm2, qword ptr [eax]        // row 2
    movdqa    xmm1, xmm0
    palignr   xmm1, xmm1, 8                // xmm1 = xmm0, 64-bit halves swapped
    movq      xmm3, qword ptr [eax + edi]  // row 3
    lea       eax, [eax + 2 * edi]         // down two rows
    punpcklbw xmm2, xmm3                   // xmm2 = rows 2|3 interleaved
    movdqa    xmm3, xmm2
    movq      xmm4, qword ptr [eax]        // row 4
    palignr   xmm3, xmm3, 8                // xmm3 = xmm2, halves swapped
    movq      xmm5, qword ptr [eax + edi]  // row 5
    punpcklbw xmm4, xmm5                   // xmm4 = rows 4|5 interleaved
    lea       eax, [eax + 2 * edi]         // down two rows
    movdqa    xmm5, xmm4
    movq      xmm6, qword ptr [eax]        // row 6
    palignr   xmm5, xmm5, 8                // xmm5 = xmm4, halves swapped
    movq      xmm7, qword ptr [eax + edi]  // row 7
    punpcklbw xmm6, xmm7                   // xmm6 = rows 6|7 interleaved
    mov       eax, ebp                     // source cursor -> next tile
    movdqa    xmm7, xmm6
    palignr   xmm7, xmm7, 8                // xmm7 = xmm6, halves swapped

    // Round 2: interleave 16-bit units, producing 4-byte column fragments.
    punpcklwd xmm0, xmm2                   // columns 0-3 of rows 0-3
    punpcklwd xmm1, xmm3                   // columns 4-7 of rows 0-3
    movdqa    xmm2, xmm0
    movdqa    xmm3, xmm1
    palignr   xmm2, xmm2, 8                // columns 2-3 moved to the low half
    palignr   xmm3, xmm3, 8                // columns 6-7 moved to the low half
    punpcklwd xmm4, xmm6                   // columns 0-3 of rows 4-7
    punpcklwd xmm5, xmm7                   // columns 4-7 of rows 4-7
    movdqa    xmm6, xmm4
    movdqa    xmm7, xmm5
    palignr   xmm6, xmm6, 8                // columns 2-3 moved to the low half
    palignr   xmm7, xmm7, 8                // columns 6-7 moved to the low half

    // Round 3: interleave 32-bit units so each 64-bit half is one complete
    // source column, and store each half as one destination row.
    punpckldq xmm0, xmm4                   // low = column 0, high = column 1
    movq      qword ptr [edx], xmm0        // dst row 0
    movdqa    xmm4, xmm0
    palignr   xmm4, xmm4, 8                // column 1 -> low half
    movq      qword ptr [edx + esi], xmm4  // dst row 1
    lea       edx, [edx + 2 * esi]
    punpckldq xmm2, xmm6                   // low = column 2, high = column 3
    movdqa    xmm6, xmm2
    palignr   xmm6, xmm6, 8                // column 3 -> low half
    movq      qword ptr [edx], xmm2        // dst row 2
    punpckldq xmm1, xmm5                   // low = column 4, high = column 5
    movq      qword ptr [edx + esi], xmm6  // dst row 3
    lea       edx, [edx + 2 * esi]
    movdqa    xmm5, xmm1
    movq      qword ptr [edx], xmm1        // dst row 4
    palignr   xmm5, xmm5, 8                // column 5 -> low half
    punpckldq xmm3, xmm7                   // low = column 6, high = column 7
    movq      qword ptr [edx + esi], xmm5  // dst row 5
    lea       edx, [edx + 2 * esi]
    movq      qword ptr [edx], xmm3        // dst row 6
    movdqa    xmm7, xmm3
    palignr   xmm7, xmm7, 8                // column 7 -> low half
    sub       ecx, 8                       // 8 columns done; sets flags for jg
    // movq and lea do not modify EFLAGS, so jg still sees the sub result.
    movq      qword ptr [edx + esi], xmm7  // dst row 7
    lea       edx, [edx + 2 * esi]
    jg        tile8x8                      // loop while columns remain

    // Restore in reverse push order. Plain ret: arguments are not popped here.
    pop       ebp
    pop       esi
    pop       edi
    ret
  }
}
|
||
|
|
||
|
// Transposes a strip of 8 source rows of interleaved byte pairs and splits
// the result into two planes (the name suggests the pairs are U/V samples).
// Each loop pass reads 16 bytes (8 pairs) from each of 8 consecutive source
// rows. For pair N of the tile, the 8 first bytes taken down the rows become
// row N of dst_a and the 8 second bytes become row N of dst_b.
//
//   src          - first byte of the top source row.
//   src_stride   - byte distance between consecutive source rows.
//   dst_a        - first destination row for the first byte of each pair.
//   dst_stride_a - byte distance between consecutive dst_a rows.
//   dst_b        - first destination row for the second byte of each pair.
//   dst_stride_b - byte distance between consecutive dst_b rows.
//   w            - number of source pairs to process; 8 are consumed per pass.
//
// The counter is tested at the bottom of the loop, so one full tile is always
// processed and the pair count is effectively rounded up to a multiple of 8.
//
// Register roles inside the loop:
//   eax = source cursor        edi = src_stride
//   edx = dst_a cursor         esi = dst_stride_a
//   ebx = dst_b cursor         ebp = dst_stride_b
//   ecx = pairs remaining      [esp] = 16-byte spill slot for one xmm register
__declspec(naked) void TransposeUVWx8_SSE2(const uint8* src,
                                           int src_stride,
                                           uint8* dst_a,
                                           int dst_stride_a,
                                           uint8* dst_b,
                                           int dst_stride_b,
                                           int w) {
  __asm {
    // Naked function: there is no compiler prologue, so ebx/esi/edi/ebp are
    // preserved by hand.
    push      ebx
    push      esi
    push      edi
    push      ebp
    // Arguments live above the 4 saved registers (16 bytes) and the return
    // address (4 bytes).
    mov       eax, [esp + 16 + 4]   // src
    mov       edi, [esp + 16 + 8]   // src_stride
    mov       edx, [esp + 16 + 12]  // dst_a
    mov       esi, [esp + 16 + 16]  // dst_stride_a
    mov       ebx, [esp + 16 + 20]  // dst_b
    mov       ebp, [esp + 16 + 24]  // dst_stride_b
    // Carve out a 16-byte-aligned spill slot at [esp] and keep the
    // pre-alignment esp just above it at [esp + 16] for the epilogue.
    mov       ecx, esp
    sub       esp, 4 + 16
    and       esp, ~15
    mov       [esp + 16], ecx
    mov       ecx, [ecx + 16 + 28]  // w, addressed through the old esp

    align      4
    // Round 1: load 8 rows of 16 bytes and interleave neighbouring rows
    // byte-wise; the low and high halves of each row pair end up in two
    // registers.
  uvtile:
    movdqu    xmm0, [eax]            // row 0
    movdqu    xmm1, [eax + edi]      // row 1
    lea       eax, [eax + 2 * edi]   // down two rows
    movdqa    xmm7, xmm0             // xmm7 is the temporary
    punpcklbw xmm0, xmm1             // xmm0 = rows 0|1, bytes 0-7
    punpckhbw xmm7, xmm1
    movdqa    xmm1, xmm7             // xmm1 = rows 0|1, bytes 8-15
    movdqu    xmm2, [eax]            // row 2
    movdqu    xmm3, [eax + edi]      // row 3
    lea       eax, [eax + 2 * edi]   // down two rows
    movdqa    xmm7, xmm2
    punpcklbw xmm2, xmm3             // xmm2 = rows 2|3, bytes 0-7
    punpckhbw xmm7, xmm3
    movdqa    xmm3, xmm7             // xmm3 = rows 2|3, bytes 8-15
    movdqu    xmm4, [eax]            // row 4
    movdqu    xmm5, [eax + edi]      // row 5
    lea       eax, [eax + 2 * edi]   // down two rows
    movdqa    xmm7, xmm4
    punpcklbw xmm4, xmm5             // xmm4 = rows 4|5, bytes 0-7
    punpckhbw xmm7, xmm5
    movdqa    xmm5, xmm7             // xmm5 = rows 4|5, bytes 8-15
    movdqu    xmm6, [eax]            // row 6
    movdqu    xmm7, [eax + edi]      // row 7
    lea       eax, [eax + 2 * edi]   // eax = tile start + 8 * src_stride
    movdqu    [esp], xmm5            // spill xmm5: all 8 registers are live
    neg       edi                    // edi = -src_stride for the rewind below
    movdqa    xmm5, xmm6             // xmm5 is the temporary
    punpcklbw xmm6, xmm7             // xmm6 = rows 6|7, bytes 0-7
    punpckhbw xmm5, xmm7
    movdqa    xmm7, xmm5             // xmm7 = rows 6|7, bytes 8-15
    lea       eax, [eax + 8 * edi + 16]  // back up 8 rows, right 16 bytes
    neg       edi                    // edi = src_stride again

    // Round 2: interleave 16-bit units, producing 4-byte column fragments.
    movdqa    xmm5, xmm0
    punpcklwd xmm0, xmm2             // byte columns 0-3 of rows 0-3
    punpckhwd xmm5, xmm2
    movdqa    xmm2, xmm5             // byte columns 4-7 of rows 0-3
    movdqa    xmm5, xmm1
    punpcklwd xmm1, xmm3             // byte columns 8-11 of rows 0-3
    punpckhwd xmm5, xmm3
    movdqa    xmm3, xmm5             // byte columns 12-15 of rows 0-3
    movdqa    xmm5, xmm4
    punpcklwd xmm4, xmm6             // byte columns 0-3 of rows 4-7
    punpckhwd xmm5, xmm6
    movdqa    xmm6, xmm5             // byte columns 4-7 of rows 4-7
    movdqu    xmm5, [esp]            // reload the spilled xmm5
    movdqu    [esp], xmm6            // spill xmm6 in its place
    movdqa    xmm6, xmm5             // xmm6 is the temporary
    punpcklwd xmm5, xmm7             // byte columns 8-11 of rows 4-7
    punpckhwd xmm6, xmm7
    movdqa    xmm7, xmm6             // byte columns 12-15 of rows 4-7

    // Round 3: interleave 32-bit units so each register holds one pair
    // column: low 8 bytes = first bytes (to dst_a), high 8 bytes = second
    // bytes (to dst_b). Store each half as one destination row.
    movdqa    xmm6, xmm0
    punpckldq xmm0, xmm4             // pair 0
    punpckhdq xmm6, xmm4
    movdqa    xmm4, xmm6             // pair 1
    movdqu    xmm6, [esp]            // reload the spilled xmm6
    movlpd    qword ptr [edx], xmm0        // dst_a row 0
    movhpd    qword ptr [ebx], xmm0        // dst_b row 0
    movlpd    qword ptr [edx + esi], xmm4  // dst_a row 1
    lea       edx, [edx + 2 * esi]
    movhpd    qword ptr [ebx + ebp], xmm4  // dst_b row 1
    lea       ebx, [ebx + 2 * ebp]
    movdqa    xmm0, xmm2             // xmm0 is the temporary from here on
    punpckldq xmm2, xmm6             // pair 2
    movlpd    qword ptr [edx], xmm2        // dst_a row 2
    movhpd    qword ptr [ebx], xmm2        // dst_b row 2
    punpckhdq xmm0, xmm6             // pair 3
    movlpd    qword ptr [edx + esi], xmm0  // dst_a row 3
    lea       edx, [edx + 2 * esi]
    movhpd    qword ptr [ebx + ebp], xmm0  // dst_b row 3
    lea       ebx, [ebx + 2 * ebp]
    movdqa    xmm0, xmm1
    punpckldq xmm1, xmm5             // pair 4
    movlpd    qword ptr [edx], xmm1        // dst_a row 4
    movhpd    qword ptr [ebx], xmm1        // dst_b row 4
    punpckhdq xmm0, xmm5             // pair 5
    movlpd    qword ptr [edx + esi], xmm0  // dst_a row 5
    lea       edx, [edx + 2 * esi]
    movhpd    qword ptr [ebx + ebp], xmm0  // dst_b row 5
    lea       ebx, [ebx + 2 * ebp]
    movdqa    xmm0, xmm3
    punpckldq xmm3, xmm7             // pair 6
    movlpd    qword ptr [edx], xmm3        // dst_a row 6
    movhpd    qword ptr [ebx], xmm3        // dst_b row 6
    punpckhdq xmm0, xmm7             // pair 7
    sub       ecx, 8                 // 8 pairs done; sets flags for jg
    // movlpd, movhpd and lea do not modify EFLAGS, so jg still sees the sub
    // result.
    movlpd    qword ptr [edx + esi], xmm0  // dst_a row 7
    lea       edx, [edx + 2 * esi]
    movhpd    qword ptr [ebx + ebp], xmm0  // dst_b row 7
    lea       ebx, [ebx + 2 * ebp]
    jg        uvtile                 // loop while pairs remain

    // Drop the aligned scratch area, then restore in reverse push order.
    // Plain ret: arguments are not popped here.
    mov       esp, [esp + 16]
    pop       ebp
    pop       edi
    pop       esi
    pop       ebx
    ret
  }
}
|
||
|
|
||
|
#endif // !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER)
|
||
|
|
||
|
#ifdef __cplusplus
|
||
|
} // extern "C"
|
||
|
} // namespace libyuv
|
||
|
#endif
|