有如下代码:
#pragma GCC diagnostic push
#pragma G++ diagnostic push
#pragma GCC diagnostic ignored "-Wpragmas"
#pragma GCC diagnostic ignored "-Wattributes"
#pragma G++ diagnostic ignored "-Wpragmas"
#pragma G++ diagnostic ignored "-Wattributes"
#pragma GCC optimize(1)//o1???
#pragma GCC optimize(2)//o2???
#pragma GCC optimize(3)//o3???
#pragma GCC optimize("Ofast")//ofast???(??????????????????)??
#pragma GCC optimize("inline")//inline????
#pragma GCC optimize("-fgcse")//fgcse???
#pragma GCC optimize("-fgcse-lm")//-fgcse-lm
#pragma GCC optimize("-fipa-sra")//????
#pragma GCC optimize("-ftree-pre")//????tree
#pragma GCC optimize("-ftree-vrp")//???tree
#pragma GCC optimize("-ffast-math")//???????
#pragma GCC optimize("-fsched-spec")//???????
#pragma GCC optimize("-fdevirtualize")//fugechar???
#pragma GCC optimize("-fcaller-saves")//????????
#pragma GCC optimize("-fschedule-insns")//fschedule-insns???
#pragma GCC optimize("inline-functions")//inline-functions???
#pragma GCC optimize("-ftree-tail-merge")//-ftree-tail-merge???
#pragma GCC optimize("-fschedule-insns2")//-fschedule-insns2???
#pragma GCC optimize("-fstrict-aliasing")//-fstrict-aliasing???
#pragma GCC optimize("-fstrict-overflow")//?????
#pragma GCC optimize("-fcse-skip-blocks")
#pragma GCC optimize("-fcse-follow-jumps")
#pragma GCC optimize("-fsched-interblock")
#pragma GCC optimize("-fpartial-inlining")
#pragma GCC optimize("no-stack-protector")
#pragma GCC optimize("-frerun-cse-after-loop")
#pragma GCC optimize("inline-small-functions")
#pragma GCC optimize("-finline-small-functions")
#pragma GCC optimize("-ftree-switch-conversion")
#pragma GCC optimize("-foptimize-sibling-calls")
#pragma GCC optimize("-findirect-inlining")
#pragma GCC optimize("-fexpensive-optimizations")
#pragma GCC optimize("-faggressive-loop-optimizations")
#pragma GCC optimize("inline-functions-called-once")
#pragma GCC optimize("-fdelete-null-pointer-checks")
#pragma GCC optimize("-fomit-frame-pointer")
#pragma GCC optimize("-fno-semantic-interposition")
#pragma GCC optimize("-freciprocal-math")
#pragma GCC optimize("tree-vectorize") // 启用自动向量化
#pragma GCC optimize("move-loop-invariants") // 循环不变量外提
#pragma GCC optimize("branch-target-load-optimize") // 分支目标加载优化
#pragma GCC optimize("btr-bb-exclusive") // 基本块独占优化
#pragma GCC optimize("predictive-commoning") // 预测性公用子表达式消除
#pragma GCC optimize("gcse-sm") // 存储移动的全局公共子表达式消除
#pragma GCC optimize("gcse-las") // 加载存储的全局公共子表达式消除
#pragma GCC optimize("ipa-pta") // 过程间指针分析
#pragma GCC optimize("ipa-ra") // 过程间寄存器分配
#pragma GCC optimize("ipa-cp") // 过程间常量传播
#pragma GCC optimize("ipa-bit-cp") // 过程间位域常量传播
#pragma GCC optimize("ipa-vrp") // 过程间值范围传播
#pragma GCC optimize("ipa-sra") // 过程间标量替换聚合
#pragma GCC optimize("prefetch-loop-arrays") // 数组预取
#pragma GCC optimize("-fmodulo-sched") // 模调度
#pragma GCC optimize("-freschedule-modulo-scheduled-loops")
#pragma GCC optimize("-fselective-scheduling")
#pragma GCC optimize("-fsel-sched-pipelining")
#pragma GCC optimize("-fsel-sched-pipelining-outer-loops")
#pragma G++ optimize(1)
#pragma G++ optimize(2)
#pragma G++ optimize(3)
#pragma G++ optimize("Ofast")
#pragma G++ optimize("inline")
#pragma G++ optimize("-fgcse")
#pragma G++ optimize("-fgcse-lm")
#pragma G++ optimize("-fipa-sra")
#pragma G++ optimize("-ftree-pre")
#pragma G++ optimize("-ftree-vrp")
#pragma G++ optimize("-ffast-math")
#pragma G++ optimize("-fsched-spec")
#pragma G++ optimize("-fdevirtualize")
#pragma G++ optimize("-fcaller-saves")
#pragma G++ optimize("-fschedule-insns")//fschedule-insns???
#pragma G++ optimize("inline-functions")//inline-functions???
#pragma G++ optimize("-ftree-tail-merge")//-ftree-tail-merge???
#pragma G++ optimize("-fschedule-insns2")//-fschedule-insns2???
#pragma G++ optimize("-fstrict-aliasing")//-fstrict-aliasing???
#pragma G++ optimize("-fstrict-overflow")//?????
#pragma G++ optimize("-fcse-skip-blocks")
#pragma G++ optimize("-fcse-follow-jumps")
#pragma G++ optimize("-fsched-interblock")
#pragma G++ optimize("-fpartial-inlining")
#pragma G++ optimize("no-stack-protector")
#pragma G++ optimize("-frerun-cse-after-loop")
#pragma G++ optimize("inline-small-functions")
#pragma G++ optimize("-finline-small-functions")
#pragma G++ optimize("-ftree-switch-conversion")
#pragma G++ optimize("-foptimize-sibling-calls")
#pragma G++ optimize("-findirect-inlining")
#pragma G++ optimize("-fexpensive-optimizations")
#pragma G++ optimize("-faggressive-loop-optimizations")
#pragma G++ optimize("inline-functions-called-once")
#pragma G++ optimize("-fdelete-null-pointer-checks")
#pragma G++ optimize("-fomit-frame-pointer")
#pragma G++ optimize("-fno-semantic-interposition")
#pragma G++ optimize("-freciprocal-math")
#pragma G++ optimize("tree-vectorize") // 启用自动向量化
#pragma G++ optimize("move-loop-invariants") // 循环不变量外提
#pragma G++ optimize("branch-target-load-optimize") // 分支目标加载优化
#pragma G++ optimize("btr-bb-exclusive") // 基本块独占优化
#pragma G++ optimize("predictive-commoning") // 预测性公用子表达式消除
#pragma G++ optimize("gcse-sm") // 存储移动的全局公共子表达式消除
#pragma G++ optimize("gcse-las") // 加载存储的全局公共子表达式消除
#pragma G++ optimize("ipa-pta") // 过程间指针分析
#pragma G++ optimize("ipa-ra") // 过程间寄存器分配
#pragma G++ optimize("ipa-cp") // 过程间常量传播
#pragma G++ optimize("ipa-bit-cp") // 过程间位域常量传播
#pragma G++ optimize("ipa-vrp") // 过程间值范围传播
#pragma G++ optimize("ipa-sra") // 过程间标量替换聚合
#pragma G++ optimize("prefetch-loop-arrays") // 数组预取
#pragma G++ optimize("-fmodulo-sched") // 模调度
#pragma G++ optimize("-freschedule-modulo-scheduled-loops")
#pragma G++ optimize("-fselective-scheduling")
#pragma G++ optimize("-fsel-sched-pipelining")
#pragma G++ optimize("-fsel-sched-pipelining-outer-loops")
#include <immintrin.h>
#include <cstdint>
#include <iostream>
#pragma GCC target("sse3","sse2","sse","sse4","sse4.1","sse4.2","ssse3","f16c","fma","avx2","xop","fma4","mmx","popcnt","abm")
#pragma G++ target("sse3","sse2","sse","sse4","sse4.1","sse4.2","ssse3","f16c","fma","avx2","xop","fma4","mmx","popcnt","abm")
// Barrett约减器类模板(编译期初始化)
template <uint32_t Mod>
class BarrettReducer {
private:
// 编译期计算位移量k(满足2^(k-1) <= Mod < 2^k)
static constexpr int calc_k() {
uint32_t tmp = Mod;
int k = 0;
while (tmp) {
k++;
tmp >>= 1;
}
return k;
}
static constexpr int k = calc_k();
static constexpr uint64_t mu = (1ULL << (2 * k)) / Mod;
public:
// 单个数取模(使用Barrett约减)
uint32_t reduce(uint64_t x) const {
uint64_t q = (x * mu) >> (2 * k);
uint32_t r = x - q * Mod;
return r >= Mod ? r - Mod : r;
}
#ifdef USE_AVX512
// 批量取模(AVX-512优化版本,一次处理8个整数)
void reduce_many(const uint32_t* input, uint32_t* output, size_t count) {
const __m512i vmod = _mm512_set1_epi64(Mod);
const __m512i vmu = _mm512_set1_epi64(mu);
size_t i = 0;
// 使用AVX-512处理8的倍数部分
for (; i + 8 <= count; i += 8) {
// 加载8个32位整数
__m256i in = _mm256_loadu_si256((__m256i*)(input + i));
// 将32位整数扩展为64位整数
__m512i x = _mm512_cvtepu32_epi64(in);
// 计算q = (x * mu) >> (2*k)
__m512i product = _mm512_mul_epi32(x, vmu);
__m512i q = _mm512_srli_epi64(product, 2 * k);
// 计算r = x - q * mod
__m512i q_mod = _mm512_mul_epi32(q, vmod);
__m512i r = _mm512_sub_epi64(x, q_mod);
// 处理溢出:如果r >= Mod,则r -= Mod
__mmask8 mask = _mm512_cmpge_epu64_mask(r, vmod);
r = _mm512_mask_sub_epi64(r, mask, r, vmod);
// 将结果转换回32位整数
__m256i result = _mm512_cvtepi64_epi32(r);
_mm256_storeu_si256((__m256i*)(output + i), result);
}
// 处理剩余元素(不足8个)
for (; i < count; i++) {
output[i] = reduce(input[i]);
}
}
#else
// 批量取模(AVX-256优化版本,一次处理4个整数)
void reduce_many(const uint32_t* input, uint32_t* output, size_t count) {
const __m256i vmod = _mm256_set1_epi64x(Mod);
const __m256i vmu = _mm256_set1_epi64x(mu);
const __m256i idx = _mm256_set_epi32(0, 0, 0, 0, 6, 4, 2, 0);
size_t i = 0;
// 使用AVX-256处理4的倍数部分
for (; i + 4 <= count; i += 4) {
// 加载4个32位整数
__m128i in = _mm_loadu_si128((__m128i*)(input + i));
// 将32位整数扩展为64位整数
__m256i x = _mm256_cvtepu32_epi64(in);
// 计算q = (x * mu) >> (2*k)
__m256i product = _mm256_mul_epu32(x, vmu);
__m256i q = _mm256_srli_epi64(product, 2 * k);
// 计算r = x - q * mod
__m256i q_mod = _mm256_mul_epu32(q, vmod);
__m256i r = _mm256_sub_epi64(x, q_mod);
// 处理溢出:如果r >= Mod,则r -= Mod
__m256i cmp = _mm256_cmpgt_epi64(vmod, r); // 比较 mod > r
__m256i underflow = _mm256_andnot_si256(cmp, vmod); // 如果r>=mod,则underflow=mod,否则0
r = _mm256_sub_epi64(r, underflow);
// 将结果转换回32位整数(高效AVX2方法)
// 使用permutevar8x32指令提取每个64位整数的低32位
__m256i permuted = _mm256_permutevar8x32_epi32(r, idx);
__m128i result = _mm256_castsi256_si128(permuted);
_mm_storeu_si128((__m128i*)(output + i), result);
}
// 处理剩余元素(不足4个)
for (; i < count; i++) {
output[i] = reduce(input[i]);
}
}
#endif
};
#include <iostream>
#include <vector>
#include <random>
// 测试编译期初始化版本
void testCompileTime() {
constexpr uint32_t Mod = 998244353;
BarrettReducer<Mod> reducer;
std::cout << "=== Testing Compile-time BarrettReducer (Mod = " << Mod << ") ===" << std::endl;
// 测试边界值
std::vector<uint64_t> testValues = {
0, 1, Mod-1, Mod, Mod+1,
2*Mod-1, 2*Mod, 2*Mod+1,
1234567890, 9876543210
};
for (auto x : testValues) {
uint32_t result = reducer.reduce(x);
uint32_t expected = x % Mod;
std::cout << x << " mod " << Mod << " = " << result;
if (result != expected) {
std::cout << " (Error! Expected: " << expected << ")";
}
std::cout << std::endl;
}
// 测试批量处理
std::vector<uint32_t> inputs(64);
std::vector<uint32_t> outputs(64);
// 生成随机测试数据(确保在uint32_t范围内)
std::mt19937 rng(std::random_device{}());
std::uniform_int_distribution<uint32_t> dist(0, 1ll<<31);
for (auto& val : inputs) {
val = dist(rng);
}
std::cout << "Starting batch processing..." << std::endl;
reducer.reduce_many(inputs.data(), outputs.data(), inputs.size());
std::cout << "Batch processing completed" << std::endl;
bool batchError = false;
for (size_t i = 0; i < inputs.size(); i++) {
uint32_t expected = inputs[i] % Mod;
if (outputs[i] != expected) {
std::cerr << "Error: " << inputs[i] << " mod " << Mod
<< " = " << outputs[i] << " (Expected: " << expected << ")" << std::endl;
batchError = true;
}
}
if (!batchError) {
std::cout << "Batch processing: all " << inputs.size() << " values correct" << std::endl;
}
}
int main() {
testCompileTime();
return 0;
}
在开启默认优化等级的情况下,程序无异常。
开启 O2 优化的情况下,程序无异常。
开启 O3 或 Ofast 或者手动打开自动向量化编译选项的情况下,程序 RE。
请问这是什么情况?是 GCC 的 bug 还是我的程序有 UB?
编译环境:TDM-GCC 10.3.0
运行环境:R9 5900HS