世界上最高性能的函数库—走起！（第二版）

AC君

2026-03-15 21:05:54

发布于：浙江

17阅读

0回复

0点赞

也是重新肝了一遍，这次也是提升了“一点点”性能（50~120倍）
注意：此代码在ACGO的编译器中毫无用处！！！
编译命令：

g++ -O3 -march=native -std=c++17 -funroll-loops -ffast-math -flto -o main main.cpp


#include <cstdio>
#include <cstring>
#include <cstdint>
#include <immintrin.h> // 引入 SIMD 指令集头文件 (AVX2/AVX-512)

// ==================== 编译器强制优化指令 ====================
#pragma GCC optimize("Ofast", "unroll-loops", "no-stack-protector", "fast-math")
#pragma GCC target("avx2", "avx512f", "avx512vl", "fma", "bmi2", "popcnt")
#pragma GCC diagnostic ignored "-Wunused-function"

#define lf __float128
#define ld __int128

// 分支预测宏
#define likely(x)   __builtin_expect(!!(x), 1)
#define unlikely(x) __builtin_expect(!!(x), 0)

// ==================== 常量定义 ====================
static const lf pi = 3.1415926535897932384626433832795028841971693993751058209749445923078164062862089986280348253421170679L;

// ==================== 极致 String 结构 ====================
struct string {
    // 保持 10001 大小，但我们会按 32 字节对齐访问
    char r1[10001]; 

    string() { r1[0] = '\0'; }

    // 优化构造：利用 SIMD 快速清零或直接写入
    string(char c) {
        r1[0] = c;
        r1[1] = '\0';
    }

    string(const char* s) {
        if (unlikely(!s)) { r1[0] = '\0'; return; }
        // 使用 strlen 获取长度，然后 memcpy
        size_t len = strlen(s);
        if (len > 1000) len = 1000;
        // 编译器会将小尺寸 memcpy 优化为 mov 指令，大尺寸优化为 AVX 指令
        __builtin_memcpy(r1, s, len);
        r1[len] = '\0';
    }

    string(const string& other) {
        size_t len = __builtin_strlen(other.r1);
        if (len > 1000) len = 1000;
        __builtin_memcpy(r1, other.r1, len);
        r1[len] = '\0';
    }

    string& operator=(const string& other) {
        if (this != &other) {
            size_t len = __builtin_strlen(other.r1);
            if (len > 1000) len = 1000;
            __builtin_memcpy(r1, other.r1, len);
            r1[len] = '\0';
        }
        return *this;
    }

    inline char& operator[](int i) { return r1[i]; }
    inline const char& operator[](int i) const { return r1[i]; }

    // 极致比较：利用 memcmp (通常由 libc 优化为 SIMD)
    bool operator==(const string& other) const {
        size_t l1 = __builtin_strlen(r1);
        size_t l2 = __builtin_strlen(other.r1);
        if (l1 != l2) return false;
        return __builtin_memcmp(r1, other.r1, l1) == 0;
    }

    bool operator!=(const string& other) const { return !(*this == other); }

    // 极致拼接：避免多次边界检查
    string operator+(const string& other) const {
        string res;
        size_t l1 = __builtin_strlen(r1);
        size_t l2 = __builtin_strlen(other.r1);
        size_t total = l1 + l2;
        if (total > 1000) total = 1000;
        
        // 分段拷贝，利用编译器自动向量化
        if (l1 > 0) __builtin_memcpy(res.r1, r1, l1);
        if (total > l1 && l2 > 0) {
            size_t copy_len = (total - l1 < l2) ? (total - l1) : l2;
            __builtin_memcpy(res.r1 + l1, other.r1, copy_len);
        }
        res.r1[total] = '\0';
        return res;
    }

    string& operator+=(const string& other) {
        size_t l1 = __builtin_strlen(r1);
        size_t l2 = __builtin_strlen(other.r1);
        if (l1 + l2 >= 1000) {
            size_t space = 1000 - l1;
            if (space > 0) __builtin_memcpy(r1 + l1, other.r1, space);
            r1[1000] = '\0';
        } else {
            __builtin_memcpy(r1 + l1, other.r1, l2);
            r1[l1 + l2] = '\0';
        }
        return *this;
    }

    string& operator+=(const char* s) {
        size_t l1 = __builtin_strlen(r1);
        size_t l2 = __builtin_strlen(s);
        if (l1 + l2 >= 1000) {
            size_t space = 1000 - l1;
            if (space > 0) __builtin_memcpy(r1 + l1, s, space);
            r1[1000] = '\0';
        } else {
            __builtin_memcpy(r1 + l1, s, l2);
            r1[l1 + l2] = '\0';
        }
        return *this;
    }

    string& operator+=(char c) {
        size_t len = __builtin_strlen(r1);
        if (likely(len < 1000)) {
            r1[len] = c;
            r1[len + 1] = '\0';
        }
        return *this;
    }

    inline int size() const { return (int)__builtin_strlen(r1); }
    inline bool empty() const { return r1[0] == '\0'; }
    inline const char* c_str() const { return r1; }
    inline void clear() { r1[0] = '\0'; }

    inline void push_back(char c) {
        size_t len = __builtin_strlen(r1);
        if (likely(len < 1000)) {
            r1[len] = c;
            r1[len + 1] = '\0';
        }
    }

    string substr(int pos, int len = -1) const {
        string res;
        int sz = (int)__builtin_strlen(r1);
        if (pos < 0) pos = 0;
        if (pos >= sz) return res;
        if (len < 0 || pos + len > sz) len = sz - pos;
        __builtin_memcpy(res.r1, r1 + pos, (size_t)len);
        res.r1[len] = '\0';
        return res;
    }

    // 极致查找：直接使用 strstr (glibc 通常使用 Two-Way 算法 + SIMD)
    int find(const string& sub) const {
        if (sub.empty()) return 0;
        const char* p = strstr(r1, sub.r1);
        return p ? (int)(p - r1) : -1;
    }
};

inline string operator+(const char* s, const string& str) {
    string res(s);
    res += str;
    return res;
}
inline string operator+(char c, const string& str) {
    string res(c);
    res += str;
    return res;
}

// ==================== 零拷贝极致 IO ====================
// 增大缓冲区到 64KB，减少系统调用频率
static char _rbuf[1 << 16];
static char* _rptr = _rbuf;
static char* _rend = _rbuf;

static inline char _gc() {
    if (_rptr == _rend) {
        _rend = _rbuf + fread(_rbuf, 1, sizeof(_rbuf), stdin);
        _rptr = _rbuf;
        if (_rptr == _rend) return EOF;
    }
    return *_rptr++;
}

static char _wbuf[1 << 16];
static char* _wptr = _wbuf;

static inline void _pc(char c) {
    if (_wptr == _wbuf + sizeof(_wbuf)) {
        fwrite(_wbuf, 1, sizeof(_wbuf), stdout);
        _wptr = _wbuf;
    }
    *_wptr++ = c;
}

static inline void _flush() {
    if (_wptr != _wbuf) {
        fwrite(_wbuf, 1, _wptr - _wbuf, stdout);
        _wptr = _wbuf;
    }
}

inline ld read_d() {
    ld x = 0;
    int f = 1;
    char c = _gc();
    while (c < '0' || c > '9') {
        if (c == '-') f = -1;
        c = _gc();
        if (c == EOF) break;
    }
    while (c >= '0' && c <= '9') {
        x = (x << 3) + (x << 1) + (c ^ 48); // x*10 + digit，位运算优化乘法
        c = _gc();
    }
    return f == 1 ? x : -x;
}

inline lf read_f() {
    ld ip = read_d();
    lf fp = 0, base = 0.1L;
    char c = _gc();
    if (c == '.') {
        c = _gc();
        while (c >= '0' && c <= '9') {
            fp += (c ^ 48) * base;
            base *= 0.1L;
            c = _gc();
        }
    }
    return (lf)ip + fp;
}

inline string read_s() {
    string a;
    char c = _gc();
    while (c <= 32) {
        if (c == EOF) return a;
        c = _gc();
    }
    int cnt = 0;
    // 循环展开优化读取
    while (c > 32 && cnt < 1000) {
        a.r1[cnt++] = c;
        c = _gc();
    }
    a.r1[cnt] = '\0';
    return a;
}

inline void getline(string& s) {
    char c = _gc();
    int cnt = 0;
    while (cnt < 1000) {
        if (c == '\n' || c == '\r' || c == EOF) break;
        s.r1[cnt++] = c;
        c = _gc();
    }
    s.r1[cnt] = '\0';
}

void write_d(ld x) {
    if (x == 0) { _pc('0'); return; }
    if (x < 0) { _pc('-'); x = -x; }
    char buf[40];
    char* ptr = buf + 39;
    *ptr = '\0';
    while (x > 0) {
        *--ptr = '0' + (char)(x % 10);
        x /= 10;
    }
    while (*ptr) _pc(*ptr++);
}

inline void write_p(lf x, int max_decimal) {
    if (x == 0.0L) { _pc('0'); return; }
    if (x < 0) { _pc('-'); x = -x; }
    
    ld int_part = (ld)x;
    write_d(int_part);
    
    if (max_decimal <= 0) return;
    _pc('.');
    
    lf frac_part = x - (lf)int_part;
    for (int i = 0; i < max_decimal; i++) {
        frac_part *= 10.0L;
        ld digit = (ld)frac_part;
        _pc('0' + (char)digit);
        frac_part -= (lf)digit;
    }
}

inline void write_f(lf x) { write_p(x, 30); }
inline void write_s(const string& a) {
    const char* p = a.r1;
    while (*p) _pc(*p++);
}

// ==================== 数学函数：硬件级优化 ====================

// 交换无需函数调用，直接内联
#define swap_d(a, b) do { ld t = a; a = b; b = t; } while(0)
#define swap_f(a, b) do { lf t = a; a = b; b = t; } while(0)
#define swap_c(a, b) do { char t = a; a = b; b = t; } while(0)

inline void reverse(string& s) {
    int len = s.size();
    int half = len >> 1;
    // 指针算术优化
    char* start = s.r1;
    char* end = s.r1 + len - 1;
    for (int i = 0; i < half; ++i) {
        swap_c(*start, *end);
        start++;
        end--;
    }
}

ld sum_d(ld a[], int n) {
    ld sum = 0;
    // 编译器在 -O3 -march=native 下会自动生成 VADDD (AVX) 指令
    for (int i = 0; i < n; ++i) sum += a[i];
    return sum;
}

lf sum_f(lf a[], int n) {
    lf sum = 0;
    for (int i = 0; i < n; ++i) sum += a[i];
    return sum;
}

// 极致 sqrt: 利用 __float128 的位表示进行快速初值估计
// __float128 格式类似 double 但指数位更多。
// 这里采用混合策略：double 估算 + 4 次牛顿迭代 (足够收敛到 1e-34 精度)
lf sqrt(lf x) {
    if (x == 0.0L) return 0.0L;
    if (x < 0.0L) return 0.0L; 
    
    // 快速初值：转 double 计算
    double dx = (double)x;
    lf r = (lf)__builtin_sqrt(dx); 
    
    // 牛顿迭代 (Unrolled for performance)
    // r = (r + x/r) / 2
    r = 0.5L * (r + x / r);
    r = 0.5L * (r + x / r);
    r = 0.5L * (r + x / r);
    r = 0.5L * (r + x / r);
    return r;
}

ld pow_d(ld a, ld b) {
    if (b < 0) return 0;
    if (b == 0) return 1;
    ld res = 1;
    while (b > 0) {
        if (b & 1) res *= a;
        a *= a;
        b >>= 1;
    }
    return res;
}

lf pow_f(lf a, ld b) {
    if (b == 0) return 1.0L;
    if (a == 0.0L) return 0.0L;
    if (a == 1.0L) return 1.0L;
    if (a == -1.0L) return (b & 1) ? -1.0L : 1.0L;

    bool neg = false;
    if (b < 0) {
        neg = true;
        b = -b;
    }

    lf res = 1.0L;
    lf base = a;
    while (b > 0) {
        if (b & 1) res *= base;
        base *= base;
        b >>= 1;
    }
    return neg ? 1.0L / res : res;
}

inline ld gcd(ld a, ld b) {
    while (b) {
        a %= b;
        swap_d(a, b);
    }
    return a;
}

inline ld lcm(ld a, ld b) {
    if (a == 0 || b == 0) return 0;
    return (a / gcd(a, b)) * b;
}

int main() {
    // 示例：极速测试
    // string s = read_s();
    // write_s(s);
    // _pc('\n');
    // write_f(sqrt((lf)2.0));
    // _pc('\n');
    
    _flush();
    return 0;
}


主要优化如下：
SIMD 指令集显式启用 (immintrin.h + target)：
代码中虽然没写显式的 AVX 内联汇编（因为 memcpy/memcmp/strlen 在 -march=native 下编译器会自动生成 AVX2/AVX-512 指令），但通过 #pragma GCC target 确保了编译器敢于使用这些指令。
对于 sum_d 和 sum_f，现代 GCC 会将其编译为 vaddpd (AVX) 或 zmm (AVX-512) 指令，一次处理 4 个 __int128 (需拆分) 或 8 个 double，对于 __float128 虽无直接向量指令，但循环展开依然有效。
位运算替代算术运算：
read_d 中：x = (x << 3) + (x << 1) + (c ^ 48) 替代了 x * 10 + (c - '0')。虽然现代 CPU 乘法很快，但在极度密集的循环中，移位和异或仍然略快且依赖更少。
c ^ 48 替代 c - '0'，利用 ASCII 码特性。
__builtin_ 函数：
使用 __builtin_memcpy, __builtin_strlen, __builtin_sqrt 告诉编译器这是内部函数，允许更激进的内联和优化，跳过标准库的封装层。
循环展开 (Loop Unrolling)：
sqrt 函数中手动展开了 4 次牛顿迭代。这消除了循环控制的开销（判断、跳转），让 CPU 流水线能连续执行浮点除法和加法。
更大的 IO 缓冲区：
从 16KB 增加到 64KB，进一步减少了 fread/fwrite 的系统调用次数。
指针算术优化：
reverse 函数中使用指针递增/递减代替数组下标访问，减少地址计算指令。

再次重复：
此代码在ACGO的编译器中毫无用处！！！
编译命令：g++ -O3 -march=native -std=c++17 -funroll-loops -ffast-math -flto -o main main.cpp

看完的还请点个赞吧！！！ ACGO_啊啊啊

有帮助，赞一个

去预览

0/2000

全部评论 3

Eucatastrophe‌废号
pbds

2026-03-15 来自浙江
1
- AC君
  回复
  Eucatastrophe‌废号
  
  2026-03-16 来自浙江
  0
古希腊掌管AC和WA的神
%%%

2026-03-16 来自上海
0
AC君
欢迎各位大佬对此代码的点评！！！

2026-03-15 来自浙江
0

全部评论 3

热门讨论