#创作计划#用python判断AI代码

we go!

2025-12-08 19:05:29

发布于：湖北

18阅读

0回复

0点赞

作为一个刚到ACGO的新人，我也没有什么好说的，来给大家写一篇文章玩玩～
不过，等—作为一个资深两年半学习py，HTML和C++的帅哥，我终于，写完了——（爽）
相信各位新团长、管理员肯定还在为竞赛、题单、作业中有人用AI写代码而发愁？
嗯，那么，接下来的“秘籍”请你接好！
（本文参考KNighter框架的思路编写，后期还会结合码风和做题时的切屏记录继续完善）

在正式食用前
不过，等等，在你康下面的内容之前，建议你先去康康“(*￣3￣)热爱￥ 303LYP”写的《如何识别AI作业》这篇文章
（其实你们知道吗，我一开始以为ACGO在编辑文章时可以写HTML的代码，嘤嘤嘤~）

源代码（python的知道吧，不会用的私我或者自己去网上找教程）

import hashlib
import json
import math
import os
import re
import subprocess
import tempfile
from collections import Counter
from pathlib import Path

class CppAIDetector:
    def __init__(self, clang_path="clang", python_path="python3"):
        """
        初始化检测器
        """
        self.clang_path = clang_path
        self.python_path = python_path
        self.rule_patterns = self._load_rule_patterns()
    
    def _load_rule_patterns(self):
        """加载已知的AI生成C++代码特征规则"""
        return {
            # 控制流模式
            "excessive_template_usage": {
                "pattern": r"template\s*<[^>]*>\s*(class|typename)\s+\w+",
                "threshold": 5,
                "weight": 0.15
            },
            # 内存管理模式
            "smart_pointer_patterns": {
                "pattern": r"std::(unique_ptr|shared_ptr|weak_ptr)<[^>]*>",
                "threshold": 3,
                "weight": 0.12
            },
            # 错误处理模式
            "exception_handling_patterns": {
                "pattern": r"try\s*\{[^}]*\}\s*catch\s*\([^)]*\)",
                "threshold": 2,
                "weight": 0.10
            },
            # 循环结构模式
            "range_based_loops": {
                "pattern": r"for\s*\(\s*auto\s*&?\s*\w+\s*:\s*\w+\s*\)",
                "threshold": 3,
                "weight": 0.08
            },
            # 类型推断模式
            "auto_type_usage": {
                "pattern": r"auto\s+\w+\s*=",
                "threshold": 5,
                "weight": 0.10
            }
        }
    
    def analyze_with_clang_tidy(self, code_content, checks=None):
        """
        使用clang-tidy进行静态分析[2](@ref)
        """
        if checks is None:
            checks = "modernize-*,readability-*"
        
        with tempfile.NamedTemporaryFile(mode='w', suffix='.cpp', delete=False) as f:
            f.write(code_content)
            temp_file = f.name
        
        try:
            cmd = [
                self.clang_path + "-tidy",
                temp_file,
                f"-checks={checks}",
                "--"
            ]
            
            result = subprocess.run(cmd, capture_output=True, text=True, timeout=30)
            
            # 分析输出
            warnings = len(re.findall(r'warning:', result.stdout))
            errors = len(re.findall(r'error:', result.stdout))
            
            return {
                "success": True,
                "warnings": warnings,
                "errors": errors,
                "raw_output": result.stdout
            }
        except subprocess.TimeoutExpired:
            return {"success": False, "error": "分析超时"}
        except Exception as e:
            return {"success": False, "error": str(e)}
        finally:
            os.unlink(temp_file)
    
    def extract_structural_features(self, code_content):
        """
        提取代码结构特征[6](@ref)
        """
        features = {}
        
        # 基本统计
        features['total_lines'] = len(code_content.split('\n'))
        features['total_chars'] = len(code_content)
        
        # 注释分析
        single_line_comments = len(re.findall(r'//[^\n]*', code_content))
        multi_line_comments = len(re.findall(r'/\*.*?\*/', code_content, re.DOTALL))
        features['comment_density'] = (single_line_comments + multi_line_comments) / max(1, features['total_lines'])
        
        # 代码熵分析
        cleaned_code = re.sub(r'\s+', '', code_content)
        if len(cleaned_code) > 0:
            char_freq = {}
            for char in cleaned_code:
                char_freq[char] = char_freq.get(char, 0) + 1
            
            entropy = 0.0
            for count in char_freq.values():
                p = count / len(cleaned_code)
                entropy -= p * math.log2(p)
            features['entropy'] = entropy
        else:
            features['entropy'] = 0
        
        # 模式匹配
        for pattern_name, pattern_config in self.rule_patterns.items():
            matches = len(re.findall(pattern_config['pattern'], code_content, re.MULTILINE | re.DOTALL))
            features[pattern_name] = matches
        
        return features
    
    def calculate_ai_probability(self, features):
        """
        基于特征计算AI生成概率[1,6](@ref)
        """
        score = 0.0
        max_score = 0.0
        
        # 1. 代码风格一致性评分（AI代码通常更一致）
        consistency_indicators = 0
        total_indicators = 0
        
        for pattern_name, pattern_config in self.rule_patterns.items():
            matches = features.get(pattern_name, 0)
            threshold = pattern_config['threshold']
            weight = pattern_config['weight']
            
            if matches > threshold:
                consistency_indicators += 1
                score += weight * min(matches / threshold, 2.0)  #  capped at 2x
            total_indicators += 1
            max_score += weight * 2.0  # 最大可能得分
        
        # 风格一致性占比
        if total_indicators > 0:
            style_consistency = consistency_indicators / total_indicators
            score += style_consistency * 0.3
            max_score += 0.3
        
        # 2. 注释密度（AI代码可能注释较少或模式化）
        comment_density = features.get('comment_density', 0)
        if comment_density < 0.1:  # 低注释密度
            score += 0.15
        elif comment_density > 0.3:  # 高注释密度（可能是教学代码）
            score -= 0.1
        max_score += 0.15
        
        # 3. 熵值分析（AI代码可能有特定熵值范围）
        entropy = features.get('entropy', 0)
        if 3.5 < entropy < 4.5:  # 经验性的AI代码熵值范围
            score += 0.2
        max_score += 0.2
        
        # 4. 代码规模调整
        total_lines = features.get('total_lines', 0)
        if total_lines < 20:  # 过短代码可信度降低
            score *= 0.7
        elif total_lines > 200:  # 长代码更可能是人工编写
            score *= 0.9
        
        # 转换为概率
        probability = (score / max_score) * 100 if max_score > 0 else 0
        return min(probability, 100)
    
    def advanced_ast_analysis(self, code_content):
        """
        高级AST分析（需要clang Python绑定）
        """
        try:
            import clang.cindex
        except ImportError:
            return {"success": False, "error": "请安装clang Python绑定: pip install clang"}
        
        try:
            index = clang.cindex.Index.create()
            with tempfile.NamedTemporaryFile(mode='w', suffix='.cpp', delete=False) as f:
                f.write(code_content)
                temp_file = f.name
            
            tu = index.parse(temp_file, args=['-std=c++17'])
            
            ast_metrics = {
                'function_count': 0,
                'average_function_length': 0,
                'max_nesting_depth': 0,
                'template_instantiation_count': 0
            }
            
            # 简单的AST遍历统计
            def analyze_ast(node, depth=0):
                ast_metrics['max_nesting_depth'] = max(ast_metrics['max_nesting_depth'], depth)
                
                if node.kind == clang.cindex.CursorKind.FUNCTION_DECL:
                    ast_metrics['function_count'] += 1
                
                if (node.kind == clang.cindex.CursorKind.CLASS_TEMPLATE or 
                    node.kind == clang.cindex.CursorKind.FUNCTION_TEMPLATE):
                    ast_metrics['template_instantiation_count'] += 1
                
                for child in node.get_children():
                    analyze_ast(child, depth + 1)
            
            if tu.cursor:
                analyze_ast(tu.cursor)
            
            os.unlink(temp_file)
            ast_metrics['success'] = True
            return ast_metrics
            
        except Exception as e:
            return {"success": False, "error": f"AST分析失败: {str(e)}"}
    
    def comprehensive_detect(self, code_content, use_ast_analysis=True):
        """
        综合检测入口函数
        """
        if len(code_content.strip()) < 100:
            return {
                "ai_probability": 0,
                "verdict": "代码过短，无法进行可靠分析",
                "confidence": "low",
                "details": {"error": "代码长度不足100字符"}
            }
        
        try:
            # 1. 基础特征提取
            features = self.extract_structural_features(code_content)
            
            # 2. 静态分析
            static_analysis = self.analyze_with_clang_tidy(code_content)
            
            # 3. 高级AST分析（可选）
            ast_analysis = None
            if use_ast_analysis:
                ast_analysis = self.advanced_ast_analysis(code_content)
            
            # 4. 计算综合概率
            ai_probability = self.calculate_ai_probability(features)
            
            # 5. 生成判定结果
            if ai_probability >= 70:
                verdict = "⚠️ 高概率为AI生成"
                confidence = "high"
            elif ai_probability >= 40:
                verdict = "🤔 可能包含AI生成代码，建议人工复核"
                confidence = "medium"
            else:
                verdict = "✅ 低概率为AI生成"
                confidence = "medium"
            
            result = {
                "ai_probability": round(ai_probability, 2),
                "verdict": verdict,
                "confidence": confidence,
                "details": {
                    "features": features,
                    "static_analysis": static_analysis,
                    "ast_analysis": ast_analysis
                }
            }
            
            return result
            
        except Exception as e:
            return {
                "ai_probability": 0,
                "verdict": "分析过程中发生错误",
                "confidence": "low",
                "details": {"error": str(e)}
            }

# 使用示例和测试代码
def main():
    """Run the detector over two built-in C++ samples and print a short report."""
    checker = CppAIDetector()

    # Sample 1: sorts a vector and prints it with a range-based for loop.
    vector_sort_sample = """
        #include <iostream>
        #include <vector>
        #include <algorithm>
        
        int main() {
            std::vector<int> numbers = {1, 5, 3, 4, 2};
            std::sort(numbers.begin(), numbers.end());
            
            for (const auto& num : numbers) {
                std::cout << num << " ";
            }
            
            return 0;
        }
        """

    # Sample 2: a class template with an explicit specialisation.
    template_sample = """
        // 一个复杂的模板元编程示例
        template<typename T>
        class AdvancedCalculator {
        public:
            T add(T a, T b) { return a + b; }
            T multiply(T a, T b) { return a * b; }
        };
        
        template<>
        class AdvancedCalculator<std::string> {
        public:
            std::string add(std::string a, std::string b) { return a + b; }
        };
        """

    divider = "=" * 50
    for number, snippet in enumerate((vector_sort_sample, template_sample), start=1):
        print(f"分析测试代码 #{number}:")
        print(divider)

        report = checker.comprehensive_detect(snippet)

        print(f"AI生成概率: {report['ai_probability']}%")
        print(f"判定结果: {report['verdict']}")
        print(f"置信度: {report['confidence']}")

        details = report['details']
        if 'features' in details:
            print("主要特征:")
            # Only the first five features are shown.
            for position, (name, value) in enumerate(details['features'].items()):
                if position >= 5:
                    break
                print(f"  {name}: {value}")

        print()


if __name__ == "__main__":
    main()

此版本为V.0.1.0公测版本，不代表正式版本全部，欢迎大家使用提出建议，作者知道了会更新修改哒~

版本日志：
V.0.1.0 公测版 2025年12月7日更新

不知道大家康懂木有，还是给大家解释一下
技术原理
核心思路借鉴了UIUC团队的KNighter框架：不直接让大模型判断，而是利用其能力生成特定的静态分析规则，再通过成熟的代码分析工具执行这些规则，从而将AI的“洞察力”转化为可重复、可解释的检测流程。
1.规则生成与模式匹配
通过分析大量AI生成代码的共性特征，生成特定的检测规则，避免了让大模型直接分析全部代码所带来的高计算成本和"幻觉"风险。
2.多维度特征融合
结合了传统静态分析（clang-tidy）、代码结构特征（AST分析）和统计特征（代码熵、注释密度等），形成一个综合判断体系。
3.可解释的评分机制
每个特征都有明确的权重和贡献度，用户可以清楚理解判定依据，避免了"黑箱"判断。
注意事项
这只是一个工具，也会有误判的时候
代码长度要求：至少需要100字符以上的代码才能进行有效分析
语言特性覆盖：对某些高级C++特性（如模板元编程）的检测仍在优化中
对抗性规避：经过精心修改的AI代码仍可能规避检测
误报可能性：风格极为规范的人工代码可能被误判为AI生成

如何食用
大家可以自己去网上找一找如何编写python代码的教程，我简单讲一讲
第一步：安装Python
首先，你需要在电脑上安装Python。
访问官网：打开浏览器，访问 Python官方网站（python.org ）。
下载安装包：进入"Downloads"菜单，网站通常会自动推荐适合你操作系统的安装包（比如Windows系统会推荐.exe文件，macOS会推荐.pkg文件）。直接点击下载即可。选择Python 3.7或更高的版本。
运行安装：双击下载好的安装包。在安装过程中，请务必勾选 "Add Python to PATH" 这个选项，这能让你在电脑的任何地方都能轻松运行Python。之后按照安装向导的提示完成安装即可。
验证安装：同时按下键盘上的 Win + R 键（如果你是Windows用户），输入 cmd 然后按回车，打开命令提示符窗口。输入以下命令并回车：python --version
如果安装成功，它会显示Python的版本号（例如 Python 3.10.14 ）。
第二步：准备“检测工具”和“待检测文件”
现在你需要准备两样东西：一是我们要用的Python检测程序，二是你想要检测的C++代码文件。
创建项目文件夹：在你的电脑上找一个合适的位置，新建一个文件夹，可以命名为 ai_code_detector 。这个文件夹将作为我们的工作目录。

放置检测脚本：将我前面提供的完整Python代码（ CppAIDetector 类等）保存为一个名为 detector.py 的文件，并放入这个文件夹。

放置（c++)代码：将你想要检测的C++代码文件（例如 main.cpp ）也复制到这个文件夹里。
第三步：安装所需的Python库
我们的检测脚本本身只用到了Python自带的标准库，基础检测不需要额外安装任何库。
如果想启用可选的AST分析，需要安装clang的Python绑定。打开命令行（命令提示符或终端），运行以下命令（只需要安装一次）：
pip install clang
另外，静态分析这一步会调用 clang-tidy 命令；电脑上没有安装的话，这一步会显示失败，但不影响AI生成概率的计算。
第四步：运行检测程序
一切就绪，现在可以运行检测程序了。

在 detector.py 文件的末尾，把原来的 if __name__ == "__main__": main() 这两行替换成下面的代码块（不要两段都保留，否则自带的测试样例也会跟着跑一遍）。这段代码的作用是创建一个检测器实例，并读取你的C++文件进行分析：

if __name__ == "__main__":
    # Build the detector with its default tool names.
    checker = CppAIDetector()

    # Load the C++ file to inspect; change 'main.cpp' to your own file name.
    with open('main.cpp', 'r', encoding='utf-8') as source_file:
        source_text = source_file.read()

    # Run the full detection pipeline.
    report = checker.comprehensive_detect(source_text)

    # Print the report header and the headline numbers.
    banner = "=" * 50
    print("\n" + banner)
    print("AI代码生成检测报告")
    print(banner)
    print(f"AI生成概率: {report['ai_probability']}%")
    print(f"综合判断: {report['verdict']}")
    print(f"置信度: {report['confidence']}")

    # The feature section is present only when analysis succeeded.
    details = report['details']
    if 'features' in details:
        print("\n关键特征指标:")
        metrics = details['features']
        print(f"  - 代码长度: {metrics.get('total_lines', 'N/A')} 行")
        print(f"  - 注释密度: {metrics.get('comment_density', 0):.3f}")
        print(f"  - 文本熵值: {metrics.get('entropy', 0):.3f}")

在命令行中，运行以下命令来启动检测：
python detector.py
第五步：理解检测结果
程序运行后，你会看到一个报告。你需要关注的核心指标是 AI生成概率。

高概率（例如 > 70%）：这段C++代码在风格特征上有很多与AI生成代码相似的地方，需要你重点关注。

低概率（例如 < 40%）：这段代码更可能由人工编写。

中间概率（例如 40%～70%）：这是一个“灰色地带”，结果不明确，需要你结合自己对代码和作者的了解进行进一步判断。

好啦好啦，那么这篇文章展开到这就结束了。
爆肝了好久，希望大家支持一下~
版本号和更新日志在前面代码部分
大家记得看看，会有持续更新的~

有帮助，赞一个

去预览

0/2000

全部评论 1

we go!
有啥建议写在这里，作者会康到的

1周前来自湖北
0

全部评论 1

热门讨论