{
  "token_budgets": {
    "gpqa": {
      "input": 1500,
      "output": 2500,
      "source": "https://github.com/idavidrein/gpqa",
      "note": "Diamond items are short; we budget for a chain-of-thought solution that fits in 2,500 output tokens."
    },
    "mmlu-pro": {
      "input": 900,
      "output": 1500,
      "source": "https://github.com/TIGER-AI-Lab/MMLU-Pro",
      "note": "MCQ stems are short; output covers a brief justification and choice."
    },
    "aime": {
      "input": 600,
      "output": 12000,
      "source": "https://artofproblemsolving.com/wiki/index.php/AIME_Problems_and_Solutions",
      "note": "AIME is heavy chain-of-thought territory; output dwarfs input for top models."
    },
    "math-500": {
      "input": 500,
      "output": 8000,
      "source": "https://github.com/openai/prm800k",
      "note": "Competition-math prompts are short; solutions run long."
    },
    "hle": {
      "input": 2500,
      "output": 6000,
      "source": "https://lastexam.ai",
      "note": "HLE prompts are longer (multi-paragraph), and most strong models reason extensively."
    }
  }
}