{
  "count": 10,
  "benchmarks": [
    {
      "slug": "swe-bench-pro",
      "name": "SWE-Bench Pro",
      "category": "code",
      "description": "Scale AI contamination-resistant 1,865-task code benchmark (Python/Go/TS/JS, 41 repos).",
      "human_baseline": 50,
      "human_baseline_source_url": "https://scale.com/leaderboard/swe_bench_pro",
      "human_baseline_note": "Scale AI Pro release (Sep 2025) targeted human-feasibility ~50% under a 1h work-cap.",
      "leaderboard_url": "https://labs.scale.com/leaderboard/swe_bench_pro_public",
      "data_source_url": "https://labs.scale.com/leaderboard/swe_bench_pro_public",
      "fetch_frequency": "every 6h",
      "sota_score": null,
      "sota_model": null,
      "sota_provider": null,
      "sota_date": null,
      "last_fetched_at": null,
      "gap": null,
      "status": "pending",
      "crossed": false,
      "crossover_date": null,
      "crossover_model": null,
      "days_since_crossover": null,
      "eta_to_crossover": null
    },
    {
      "slug": "swe-bench-verified",
      "name": "SWE-Bench Verified",
      "category": "code",
      "description": "Human-verified 500-task subset of SWE-Bench (real GitHub issues + tests). Live scores parsed from swe-bench/experiments.",
      "human_baseline": 50,
      "human_baseline_source_url": "https://openai.com/index/introducing-swe-bench-verified/",
      "human_baseline_note": "OpenAI Verified-subset selection criterion: tasks a competent engineer can resolve in <1h; ~50% pass rate target.",
      "leaderboard_url": "https://www.swebench.com/",
      "data_source_url": "https://api.github.com/repos/swe-bench/experiments/contents/evaluation/verified",
      "fetch_frequency": "every 6h",
      "sota_score": null,
      "sota_model": null,
      "sota_provider": null,
      "sota_date": null,
      "last_fetched_at": null,
      "gap": null,
      "status": "pending",
      "crossed": false,
      "crossover_date": null,
      "crossover_model": null,
      "days_since_crossover": null,
      "eta_to_crossover": null
    },
    {
      "slug": "osworld",
      "name": "OSWorld",
      "category": "computer-use",
      "description": "Real-OS computer-use benchmark (369 tasks across Ubuntu/Windows apps). The benchmark Berkeley's contamination study highlighted as gaming-resistant.",
      "human_baseline": 72.36,
      "human_baseline_source_url": "https://arxiv.org/abs/2404.07972",
      "human_baseline_note": "Average individual human Success Rate on OSWorld, Xie et al. NeurIPS 2024 §4.2.",
      "leaderboard_url": "https://os-world.github.io/",
      "data_source_url": "https://raw.githubusercontent.com/xlang-ai/OSWorld/main/README.md",
      "fetch_frequency": "every 6h",
      "sota_score": null,
      "sota_model": null,
      "sota_provider": null,
      "sota_date": null,
      "last_fetched_at": null,
      "gap": null,
      "status": "pending",
      "crossed": false,
      "crossover_date": null,
      "crossover_model": null,
      "days_since_crossover": null,
      "eta_to_crossover": null
    },
    {
      "slug": "gpqa-diamond",
      "name": "GPQA Diamond",
      "category": "knowledge",
      "description": "Graduate-level Google-Proof Q&A, Diamond subset.",
      "human_baseline": 65,
      "human_baseline_source_url": "https://arxiv.org/abs/2311.12022",
      "human_baseline_note": "GPQA Rein et al. 2023: PhD-domain experts achieve ~65% on Diamond subset.",
      "leaderboard_url": "https://raw.githubusercontent.com/idavidrein/gpqa/main/README.md",
      "data_source_url": "https://raw.githubusercontent.com/idavidrein/gpqa/main/README.md",
      "fetch_frequency": "every 6h",
      "sota_score": null,
      "sota_model": null,
      "sota_provider": null,
      "sota_date": null,
      "last_fetched_at": null,
      "gap": null,
      "status": "pending",
      "crossed": false,
      "crossover_date": null,
      "crossover_model": null,
      "days_since_crossover": null,
      "eta_to_crossover": null
    },
    {
      "slug": "hle",
      "name": "Humanity's Last Exam",
      "category": "knowledge",
      "description": "Scale AI + CAIS exam (2,500 questions) targeting frontier-of-knowledge reasoning. PhD-baseline.",
      "human_baseline": 65,
      "human_baseline_source_url": "https://agi.safe.ai/",
      "human_baseline_note": "Humanity's Last Exam paper (Phan et al., Jan 2025): PhD-domain expert baseline.",
      "leaderboard_url": "https://artificialanalysis.ai/evaluations/humanitys-last-exam",
      "data_source_url": "https://artificialanalysis.ai/evaluations/humanitys-last-exam",
      "fetch_frequency": "every 6h",
      "sota_score": null,
      "sota_model": null,
      "sota_provider": null,
      "sota_date": null,
      "last_fetched_at": null,
      "gap": null,
      "status": "pending",
      "crossed": false,
      "crossover_date": null,
      "crossover_model": null,
      "days_since_crossover": null,
      "eta_to_crossover": null
    },
    {
      "slug": "mmlu",
      "name": "MMLU",
      "category": "knowledge",
      "description": "57-task knowledge benchmark from Hendrycks et al. 2021. Saturating.",
      "human_baseline": 89.8,
      "human_baseline_source_url": "https://arxiv.org/abs/2009.03300",
      "human_baseline_note": "Hendrycks et al. 2021 MMLU paper: human expert-domain baseline.",
      "leaderboard_url": "https://epoch.ai/data/benchmarks",
      "data_source_url": "https://epoch.ai/data/benchmarks",
      "fetch_frequency": "every 6h",
      "sota_score": null,
      "sota_model": null,
      "sota_provider": null,
      "sota_date": null,
      "last_fetched_at": null,
      "gap": null,
      "status": "pending",
      "crossed": false,
      "crossover_date": null,
      "crossover_model": null,
      "days_since_crossover": null,
      "eta_to_crossover": null
    },
    {
      "slug": "aime-2025",
      "name": "AIME 2025",
      "category": "math",
      "description": "American Invitational Mathematics Examination, 2025 edition; live tracked by matharena.ai.",
      "human_baseline": 33.33,
      "human_baseline_source_url": "https://artofproblemsolving.com/wiki/index.php/American_Invitational_Mathematics_Examination",
      "human_baseline_note": "AIME 2025: ~5/15 questions correct is the AIME→USAMO qualifying cutoff (top human-competitor distribution converted to %, ~33%).",
      "leaderboard_url": "https://matharena.ai/",
      "data_source_url": "https://matharena.ai/",
      "fetch_frequency": "every 6h",
      "sota_score": null,
      "sota_model": null,
      "sota_provider": null,
      "sota_date": null,
      "last_fetched_at": null,
      "gap": null,
      "status": "pending",
      "crossed": false,
      "crossover_date": null,
      "crossover_model": null,
      "days_since_crossover": null,
      "eta_to_crossover": null
    },
    {
      "slug": "frontier-math",
      "name": "FrontierMath",
      "category": "math",
      "description": "Epoch AI olympiad-/research-level math benchmark (Nov 2024). The hardest open frontier.",
      "human_baseline": 75,
      "human_baseline_source_url": "https://epoch.ai/frontiermath",
      "human_baseline_note": "Epoch AI FrontierMath (Nov 2024): domain-expert mathematician baseline on Tier 1-3 problems (estimated).",
      "leaderboard_url": "https://epoch.ai/benchmarks/frontiermath",
      "data_source_url": "https://epoch.ai/benchmarks/frontiermath",
      "fetch_frequency": "every 6h",
      "sota_score": 38,
      "sota_model": "gemini-3-pro-preview",
      "sota_provider": "Epoch AI",
      "sota_date": null,
      "last_fetched_at": "2026-06-02T18:00:34.702Z",
      "gap": -37,
      "status": "not-crossed",
      "crossed": false,
      "crossover_date": null,
      "crossover_model": null,
      "days_since_crossover": null,
      "eta_to_crossover": null
    },
    {
      "slug": "arc-agi-1",
      "name": "ARC-AGI-1",
      "category": "reasoning",
      "description": "François Chollet's abstraction-and-reasoning corpus, v1.",
      "human_baseline": 80,
      "human_baseline_source_url": "https://arcprize.org/arc",
      "human_baseline_note": "ARC-AGI-1 average individual human performance reported on arcprize.org (Mechanical Turk panel, 2024).",
      "leaderboard_url": "https://arcprize.org/leaderboard",
      "data_source_url": "https://arcprize.org/leaderboard",
      "fetch_frequency": "every 6h",
      "sota_score": null,
      "sota_model": null,
      "sota_provider": null,
      "sota_date": null,
      "last_fetched_at": null,
      "gap": null,
      "status": "pending",
      "crossed": false,
      "crossover_date": null,
      "crossover_model": null,
      "days_since_crossover": null,
      "eta_to_crossover": null
    },
    {
      "slug": "arc-agi-2",
      "name": "ARC-AGI-2",
      "category": "reasoning",
      "description": "Harder revision of ARC-AGI launched May 2025; SOTA passed human baseline May 2026.",
      "human_baseline": 66,
      "human_baseline_source_url": "https://arxiv.org/abs/2505.11831",
      "human_baseline_note": "ARC-AGI-2 paper, average individual human accuracy (calibration panel May 2025).",
      "leaderboard_url": "https://arcprize.org/leaderboard",
      "data_source_url": "https://arcprize.org/leaderboard",
      "fetch_frequency": "every 6h",
      "sota_score": null,
      "sota_model": null,
      "sota_provider": null,
      "sota_date": null,
      "last_fetched_at": null,
      "gap": null,
      "status": "pending",
      "crossed": false,
      "crossover_date": null,
      "crossover_model": null,
      "days_since_crossover": null,
      "eta_to_crossover": null
    }
  ]
}