{"benchmark_id":"usamo25","name":"USAMO25","parent_benchmark":null,"categories":["math","reasoning"],"modality":"text","multilingual":false,"max_score":1.0,"language":"en","description":"The 2025 United States of America Mathematical Olympiad (USAMO) benchmark consists of six challenging mathematical problems requiring rigorous proof-based reasoning. USAMO is the most prestigious high school mathematics competition in the United States, serving as the final round of the American Mathematics Competitions series. This benchmark evaluates models on mathematical problem-solving capabilities beyond simple numerical computation, focusing on formal mathematical reasoning and proof generation.","paper_link":"https://arxiv.org/abs/2503.21934","implementation_link":null,"verified":false,"created_at":"2026-05-07T16:53:26.637810+00:00","updated_at":"2026-07-05T18:28:04.180835+00:00","statistics":{"total_models":3,"average_score":0.6566666666666666,"min_score":0.375,"max_score":0.976,"score_stddev":0.30226533597707383,"verified_count":0,"self_reported_count":3},"child_benchmarks":[],"linked_dataset":null,"models":[{"rank":1,"model_id":"claude-mythos-preview","model_name":"Claude Mythos Preview","organization_id":"anthropic","organization_name":"Anthropic","organization_country":"US","score":0.976,"normalized_score":0.976,"verified":false,"self_reported":true,"self_reported_source":"https://www.anthropic.com/claude-mythos-preview-system-card","analysis_method":"USAMO 2026 math proofs. \nOpus 4.6: 42.3%, GPT-5.4: 95.2%, Gemini 3.1 Pro: 74.4%.","verification_date":null,"provider_id":null,"input_cost_per_million":null,"output_cost_per_million":null,"context_window":null,"announcement_date":"2026-04-07","param_count":null,"is_open_source":false,"is_new":false,"best_latency":null,"latency_provider":null,"best_throughput":null,"throughput_provider":null,"context_provider":null},{"rank":2,"model_id":"grok-4-heavy","model_name":"Grok-4 Heavy","organization_id":"xai","organization_name":"xAI","organization_country":"US","score":0.619,"normalized_score":0.619,"verified":false,"self_reported":true,"self_reported_source":"https://x.com/xai/status/1943158495588815072","analysis_method":"accuracy","verification_date":null,"provider_id":null,"input_cost_per_million":null,"output_cost_per_million":null,"context_window":null,"announcement_date":"2025-07-09","param_count":null,"is_open_source":false,"is_new":false,"best_latency":null,"latency_provider":null,"best_throughput":null,"throughput_provider":null,"context_provider":null},{"rank":3,"model_id":"grok-4","model_name":"Grok-4","organization_id":"xai","organization_name":"xAI","organization_country":"US","score":0.375,"normalized_score":0.375,"verified":false,"self_reported":true,"self_reported_source":"https://x.com/xai/status/1943158495588815072","analysis_method":"accuracy","verification_date":null,"provider_id":null,"input_cost_per_million":null,"output_cost_per_million":null,"context_window":null,"announcement_date":"2025-07-09","param_count":null,"is_open_source":false,"is_new":false,"best_latency":null,"latency_provider":null,"best_throughput":null,"throughput_provider":null,"context_provider":null}]}