RT,今天尝试手写 uint128_t,在测试除移位以外的运算符都与系统自带的汇编代码一模一样,但在写左移和右移时发现无论什么我可以想到的写法都不能避免分支预测,无法使用 cmovne 指令。
struct _U {
unsigned long long r, l;
_U operator=(unsigned long long v) {
r = v, l = 0;
return *this;
}
_U operator<<(unsigned int amount) {
const auto lo = r, hi = l;
return
amount < 64 ? _U{lo << amount, (hi << amount) | (lo >> (64 - amount))}
: _U{0, lo << amount - 64};
}
};
_U x1, x2;
unsigned long long v;
int main() {
__builtin_scanf("%llu", &v); x1 = v;
__builtin_scanf("%llu", &v);
x2 = x1 << v;
}
#define _U __int128 unsigned
_U x1, x2;
unsigned long long v;
int main() {
__builtin_scanf("%llu", &v); x1 = v;
__builtin_scanf("%llu", &v);
x2 = x1 << v;
}
这两份代码在计算左移时的输出是
# 自定义 uint128_t
mov rdx, QWORD PTR v[rip]
mov rax, QWORD PTR x1[rip]
mov r8, QWORD PTR x1[rip+8]
cmp edx, 63
jbe .L5
lea ecx, -64[rdx]
xor r9d, r9d
sal rax, cl
.L3:
mov QWORD PTR x2[rip+8], rax
xor eax, eax
mov QWORD PTR x2[rip], r9
add rsp, 40
ret
.L5:
mov r9, rax
mov ecx, edx
sal r9, cl
mov ecx, 64
sub ecx, edx
shr rax, cl
mov ecx, edx
sal r8, cl
or rax, r8
jmp .L3
# 自带的 uint128_t
movzx ecx, BYTE PTR v[rip]
mov rax, QWORD PTR x1[rip]
xor r8d, r8d
mov rdx, QWORD PTR x1[rip+8]
shld rdx, rax, cl
sal rax, cl
test cl, 64
cmovne rdx, rax
cmovne rax, r8
mov QWORD PTR x2[rip], rax
xor eax, eax
mov QWORD PTR x2[rip+8], rdx
add rsp, 40
ret
求问怎样的写法才可以与系统自带的 uint128_t 输出大致相同?(开 -O2)