{"benchmark_id":"math-500","name":"MATH-500","parent_benchmark":null,"categories":["math","reasoning"],"modality":"text","multilingual":false,"max_score":1.0,"language":"en","description":"MATH-500 is a subset of the MATH dataset containing 500 challenging competition mathematics problems from AMC 10, AMC 12, AIME, and other mathematics competitions. Each problem includes full step-by-step solutions and spans multiple difficulty levels across seven mathematical subjects including Prealgebra, Algebra, Number Theory, Counting and Probability, Geometry, Intermediate Algebra, and Precalculus.","paper_link":"https://arxiv.org/abs/2103.03874","implementation_link":null,"verified":false,"created_at":"2026-05-07T16:53:24.267735+00:00","updated_at":"2026-07-05T18:27:52.020832+00:00","statistics":{"total_models":32,"average_score":0.93235625,"min_score":0.6902,"max_score":0.992,"score_stddev":0.07176786475359512,"verified_count":0,"self_reported_count":32},"child_benchmarks":[],"linked_dataset":{"dataset_id":"4dcaad23-271f-40d1-a31b-71befa220fe5","slug":"math-500","organization_id":"1fa61e5b-539e-4be7-a0e2-1aacf0310fc8","display_name":"MATH-500","leaderboards":[{"id":"d4adca47-dfa8-4080-bfd6-c9d0a01e21c5","name":"Official MATH-500","metric_name":"accuracy","metric_source":"column_metric","subset_name":"test","sort_direction":"desc"}]},"models":[{"rank":1,"model_id":"longcat-flash-thinking","model_name":"LongCat-Flash-Thinking","organization_id":"meituan","organization_name":"Meituan","organization_country":"CN","score":0.992,"normalized_score":0.992,"verified":false,"self_reported":true,"self_reported_source":"https://github.com/meituan-longcat/LongCat-Flash-Thinking","analysis_method":"Mean@1","verification_date":null,"provider_id":null,"input_cost_per_million":null,"output_cost_per_million":null,"context_window":null,"announcement_date":"2025-09-22","param_count":560000000000,"is_open_source":true,"is_new":false,"best_latency":null,"latency_provider":null,"best_throughput":null,"throughput_provider":null,"context_provider":null},{"rank":2,"model_id":"sarvam-105b","model_name":"Sarvam-105B","organization_id":"sarvamai","organization_name":"Sarvam AI","organization_country":"IN","score":0.986,"normalized_score":0.986,"verified":false,"self_reported":true,"self_reported_source":"https://www.sarvam.ai/blogs/sarvam-30b-105b","analysis_method":null,"verification_date":null,"provider_id":null,"input_cost_per_million":null,"output_cost_per_million":null,"context_window":null,"announcement_date":"2026-03-06","param_count":105000000000,"is_open_source":true,"is_new":false,"best_latency":null,"latency_provider":null,"best_throughput":null,"throughput_provider":null,"context_provider":null},{"rank":3,"model_id":"glm-4.5","model_name":"GLM-4.5","organization_id":"zai-org","organization_name":"Zhipu AI","organization_country":"CN","score":0.982,"normalized_score":0.982,"verified":false,"self_reported":true,"self_reported_source":"https://z.ai/blog/glm-4.5","analysis_method":"standard","verification_date":null,"provider_id":null,"input_cost_per_million":null,"output_cost_per_million":null,"context_window":null,"announcement_date":"2025-07-28","param_count":355000000000,"is_open_source":true,"is_new":false,"best_latency":null,"latency_provider":null,"best_throughput":null,"throughput_provider":null,"context_provider":null},{"rank":4,"model_id":"glm-4.5-air","model_name":"GLM-4.5-Air","organization_id":"zai-org","organization_name":"Zhipu AI","organization_country":"CN","score":0.981,"normalized_score":0.981,"verified":false,"self_reported":true,"self_reported_source":"https://z.ai/blog/glm-4.5","analysis_method":"standard","verification_date":null,"provider_id":null,"input_cost_per_million":null,"output_cost_per_million":null,"context_window":null,"announcement_date":"2025-07-28","param_count":106000000000,"is_open_source":true,"is_new":false,"best_latency":null,"latency_provider":null,"best_throughput":null,"throughput_provider":null,"context_provider":null},{"rank":5,"model_id":"nvidia-nemotron-nano-9b-v2","model_name":"Nemotron Nano 9B v2","organization_id":"nvidia","organization_name":"NVIDIA","organization_country":"US","score":0.978,"normalized_score":0.978,"verified":false,"self_reported":true,"self_reported_source":"https://build.nvidia.com/nvidia/nvidia-nemotron-nano-9b-v2/modelcard","analysis_method":"Score, Reasoning On","verification_date":null,"provider_id":null,"input_cost_per_million":null,"output_cost_per_million":null,"context_window":null,"announcement_date":"2025-08-18","param_count":8900000000,"is_open_source":true,"is_new":false,"best_latency":null,"latency_provider":null,"best_throughput":null,"throughput_provider":null,"context_provider":null},{"rank":6,"model_id":"kimi-k2-instruct","model_name":"Kimi K2 Instruct","organization_id":"moonshotai","organization_name":"Moonshot AI","organization_country":"CN","score":0.974,"normalized_score":0.974,"verified":false,"self_reported":true,"self_reported_source":"https://moonshotai.github.io/Kimi-K2/","analysis_method":"Accuracy","verification_date":null,"provider_id":null,"input_cost_per_million":null,"output_cost_per_million":null,"context_window":null,"announcement_date":"2025-07-11","param_count":1000000000000,"is_open_source":true,"is_new":false,"best_latency":null,"latency_provider":null,"best_throughput":null,"throughput_provider":null,"context_provider":null},{"rank":6,"model_id":"kimi-k2-instruct-0905","model_name":"Kimi K2-Instruct-0905","organization_id":"moonshotai","organization_name":"Moonshot AI","organization_country":"CN","score":0.974,"normalized_score":0.974,"verified":false,"self_reported":true,"self_reported_source":"https://moonshotai.github.io/Kimi-K2/","analysis_method":"Accuracy","verification_date":null,"provider_id":null,"input_cost_per_million":null,"output_cost_per_million":null,"context_window":null,"announcement_date":"2025-09-05","param_count":1000000000000,"is_open_source":true,"is_new":false,"best_latency":null,"latency_provider":null,"best_throughput":null,"throughput_provider":null,"context_provider":null},{"rank":8,"model_id":"sarvam-30b","model_name":"Sarvam-30B","organization_id":"sarvamai","organization_name":"Sarvam AI","organization_country":"IN","score":0.97,"normalized_score":0.97,"verified":false,"self_reported":true,"self_reported_source":"https://www.sarvam.ai/blogs/sarvam-30b-105b","analysis_method":null,"verification_date":null,"provider_id":null,"input_cost_per_million":null,"output_cost_per_million":null,"context_window":null,"announcement_date":"2026-03-06","param_count":30000000000,"is_open_source":true,"is_new":false,"best_latency":null,"latency_provider":null,"best_throughput":null,"throughput_provider":null,"context_provider":null},{"rank":8,"model_id":"llama-3.1-nemotron-ultra-253b-v1","model_name":"Llama 3.1 Nemotron Ultra 253B v1","organization_id":"nvidia","organization_name":"NVIDIA","organization_country":"US","score":0.97,"normalized_score":0.97,"verified":false,"self_reported":true,"self_reported_source":"https://build.nvidia.com/nvidia/llama-3_1-nemotron-ultra-253b-v1/modelcard","analysis_method":"Pass@1, Reasoning","verification_date":null,"provider_id":null,"input_cost_per_million":null,"output_cost_per_million":null,"context_window":null,"announcement_date":"2025-04-07","param_count":253000000000,"is_open_source":true,"is_new":false,"best_latency":null,"latency_provider":null,"best_throughput":null,"throughput_provider":null,"context_provider":null},{"rank":10,"model_id":"longcat-flash-lite","model_name":"LongCat-Flash-Lite","organization_id":"meituan","organization_name":"Meituan","organization_country":"CN","score":0.968,"normalized_score":0.968,"verified":false,"self_reported":true,"self_reported_source":"https://huggingface.co/meituan-longcat/LongCat-Flash-Lite","analysis_method":null,"verification_date":null,"provider_id":"meituan","input_cost_per_million":0.1,"output_cost_per_million":0.4,"context_window":256000,"announcement_date":"2026-02-05","param_count":68500000000,"is_open_source":true,"is_new":false,"best_latency":1.5,"latency_provider":"Meituan","best_throughput":500.0,"throughput_provider":"Meituan","context_provider":"Meituan"},{"rank":10,"model_id":"minimax-m1-80k","model_name":"MiniMax M1 80K","organization_id":"minimax","organization_name":"MiniMax","organization_country":"CN","score":0.968,"normalized_score":0.968,"verified":false,"self_reported":true,"self_reported_source":null,"analysis_method":null,"verification_date":null,"provider_id":null,"input_cost_per_million":null,"output_cost_per_million":null,"context_window":null,"announcement_date":"2025-06-16","param_count":456000000000,"is_open_source":true,"is_new":false,"best_latency":null,"latency_provider":null,"best_throughput":null,"throughput_provider":null,"context_provider":null},{"rank":12,"model_id":"llama-3.3-nemotron-super-49b-v1","model_name":"Llama-3.3 Nemotron Super 49B v1","organization_id":"nvidia","organization_name":"NVIDIA","organization_country":"US","score":0.966,"normalized_score":0.966,"verified":false,"self_reported":true,"self_reported_source":"https://build.nvidia.com/nvidia/llama-3_3-nemotron-super-49b-v1/modelcard","analysis_method":"Pass@1, Reasoning On","verification_date":null,"provider_id":null,"input_cost_per_million":null,"output_cost_per_million":null,"context_window":null,"announcement_date":"2025-03-18","param_count":49900000000,"is_open_source":true,"is_new":false,"best_latency":null,"latency_provider":null,"best_throughput":null,"throughput_provider":null,"context_provider":null},{"rank":13,"model_id":"longcat-flash-chat","model_name":"LongCat-Flash-Chat","organization_id":"meituan","organization_name":"Meituan","organization_country":"CN","score":0.964,"normalized_score":0.964,"verified":false,"self_reported":true,"self_reported_source":"https://huggingface.co/meituan-longcat/LongCat-Flash-Chat","analysis_method":null,"verification_date":null,"provider_id":null,"input_cost_per_million":null,"output_cost_per_million":null,"context_window":null,"announcement_date":"2025-08-29","param_count":560000000000,"is_open_source":true,"is_new":false,"best_latency":null,"latency_provider":null,"best_throughput":null,"throughput_provider":null,"context_provider":null},{"rank":14,"model_id":"kimi-k1.5","model_name":"Kimi-k1.5","organization_id":"moonshotai","organization_name":"Moonshot AI","organization_country":"CN","score":0.962,"normalized_score":0.962,"verified":false,"self_reported":true,"self_reported_source":"https://github.com/MoonshotAI/Kimi-k1.5","analysis_method":"Exact Match","verification_date":null,"provider_id":null,"input_cost_per_million":null,"output_cost_per_million":null,"context_window":null,"announcement_date":"2025-01-20","param_count":null,"is_open_source":false,"is_new":false,"best_latency":null,"latency_provider":null,"best_throughput":null,"throughput_provider":null,"context_provider":null},{"rank":14,"model_id":"claude-3-7-sonnet-20250219","model_name":"Claude 3.7 Sonnet","organization_id":"anthropic","organization_name":"Anthropic","organization_country":"US","score":0.962,"normalized_score":0.962,"verified":false,"self_reported":true,"self_reported_source":"https://www.anthropic.com/news/claude-3-7-sonnet","analysis_method":"","verification_date":null,"provider_id":null,"input_cost_per_million":null,"output_cost_per_million":null,"context_window":null,"announcement_date":"2025-02-24","param_count":null,"is_open_source":false,"is_new":false,"best_latency":null,"latency_provider":null,"best_throughput":null,"throughput_provider":null,"context_provider":null},{"rank":16,"model_id":"minimax-m1-40k","model_name":"MiniMax M1 40K","organization_id":"minimax","organization_name":"MiniMax","organization_country":"CN","score":0.96,"normalized_score":0.96,"verified":false,"self_reported":true,"self_reported_source":null,"analysis_method":null,"verification_date":null,"provider_id":null,"input_cost_per_million":null,"output_cost_per_million":null,"context_window":null,"announcement_date":"2025-06-16","param_count":456000000000,"is_open_source":true,"is_new":false,"best_latency":null,"latency_provider":null,"best_throughput":null,"throughput_provider":null,"context_provider":null},{"rank":17,"model_id":"deepseek-r1-zero","model_name":"DeepSeek R1 Zero","organization_id":"deepseek","organization_name":"DeepSeek","organization_country":"CN","score":0.959,"normalized_score":0.959,"verified":false,"self_reported":true,"self_reported_source":"https://arxiv.org/abs/2501.12948","analysis_method":"Pass@1","verification_date":null,"provider_id":null,"input_cost_per_million":null,"output_cost_per_million":null,"context_window":null,"announcement_date":"2025-01-20","param_count":671000000000,"is_open_source":true,"is_new":false,"best_latency":null,"latency_provider":null,"best_throughput":null,"throughput_provider":null,"context_provider":null},{"rank":18,"model_id":"llama-3.1-nemotron-nano-8b-v1","model_name":"Llama 3.1 Nemotron Nano 8B V1","organization_id":"nvidia","organization_name":"NVIDIA","organization_country":"US","score":0.954,"normalized_score":0.954,"verified":false,"self_reported":true,"self_reported_source":"https://build.nvidia.com/nvidia/llama-3_1-nemotron-nano-8b-v1/modelcard","analysis_method":"Pass@1, Reasoning","verification_date":null,"provider_id":null,"input_cost_per_million":null,"output_cost_per_million":null,"context_window":null,"announcement_date":"2025-03-18","param_count":8000000000,"is_open_source":true,"is_new":false,"best_latency":null,"latency_provider":null,"best_throughput":null,"throughput_provider":null,"context_provider":null},{"rank":19,"model_id":"phi-4-mini-reasoning","model_name":"Phi 4 Mini Reasoning","organization_id":"microsoft","organization_name":"Microsoft","organization_country":"US","score":0.946,"normalized_score":0.946,"verified":false,"self_reported":true,"self_reported_source":"https://huggingface.co/microsoft/Phi-4-mini-reasoning","analysis_method":"Standard evaluation","verification_date":null,"provider_id":null,"input_cost_per_million":null,"output_cost_per_million":null,"context_window":null,"announcement_date":"2025-04-30","param_count":3800000000,"is_open_source":true,"is_new":false,"best_latency":null,"latency_provider":null,"best_throughput":null,"throughput_provider":null,"context_provider":null},{"rank":20,"model_id":"deepseek-r1-distill-llama-70b","model_name":"DeepSeek R1 Distill Llama 70B","organization_id":"deepseek","organization_name":"DeepSeek","organization_country":"CN","score":0.945,"normalized_score":0.945,"verified":false,"self_reported":true,"self_reported_source":"https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Llama-70B","analysis_method":"Pass@1","verification_date":null,"provider_id":null,"input_cost_per_million":null,"output_cost_per_million":null,"context_window":null,"announcement_date":"2025-01-20","param_count":70600000000,"is_open_source":true,"is_new":false,"best_latency":null,"latency_provider":null,"best_throughput":null,"throughput_provider":null,"context_provider":null},{"rank":21,"model_id":"deepseek-r1-distill-qwen-32b","model_name":"DeepSeek R1 Distill Qwen 32B","organization_id":"deepseek","organization_name":"DeepSeek","organization_country":"CN","score":0.943,"normalized_score":0.943,"verified":false,"self_reported":true,"self_reported_source":"https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-32B","analysis_method":"Pass@1","verification_date":null,"provider_id":null,"input_cost_per_million":null,"output_cost_per_million":null,"context_window":null,"announcement_date":"2025-01-20","param_count":32800000000,"is_open_source":true,"is_new":false,"best_latency":null,"latency_provider":null,"best_throughput":null,"throughput_provider":null,"context_provider":null},{"rank":22,"model_id":"deepseek-v3-0324","model_name":"DeepSeek-V3 0324","organization_id":"deepseek","organization_name":"DeepSeek","organization_country":"CN","score":0.94,"normalized_score":0.94,"verified":false,"self_reported":true,"self_reported_source":"https://api-docs.deepseek.com/news/news250325","analysis_method":"Pass@1","verification_date":null,"provider_id":"novita","input_cost_per_million":0.28,"output_cost_per_million":1.14,"context_window":163840,"announcement_date":"2025-03-25","param_count":671000000000,"is_open_source":true,"is_new":false,"best_latency":null,"latency_provider":"Novita","best_throughput":null,"throughput_provider":"Novita","context_provider":"Novita"},{"rank":23,"model_id":"deepseek-r1-distill-qwen-14b","model_name":"DeepSeek R1 Distill Qwen 14B","organization_id":"deepseek","organization_name":"DeepSeek","organization_country":"CN","score":0.939,"normalized_score":0.939,"verified":false,"self_reported":true,"self_reported_source":"https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-14B","analysis_method":"Pass@1","verification_date":null,"provider_id":null,"input_cost_per_million":null,"output_cost_per_million":null,"context_window":null,"announcement_date":"2025-01-20","param_count":14800000000,"is_open_source":true,"is_new":false,"best_latency":null,"latency_provider":null,"best_throughput":null,"throughput_provider":null,"context_provider":null},{"rank":24,"model_id":"deepseek-r1-distill-qwen-7b","model_name":"DeepSeek R1 Distill Qwen 7B","organization_id":"deepseek","organization_name":"DeepSeek","organization_country":"CN","score":0.928,"normalized_score":0.928,"verified":false,"self_reported":true,"self_reported_source":"https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-7B","analysis_method":"Pass@1","verification_date":null,"provider_id":null,"input_cost_per_million":null,"output_cost_per_million":null,"context_window":null,"announcement_date":"2025-01-20","param_count":7620000000,"is_open_source":true,"is_new":false,"best_latency":null,"latency_provider":null,"best_throughput":null,"throughput_provider":null,"context_provider":null},{"rank":25,"model_id":"qwq-32b-preview","model_name":"QwQ-32B-Preview","organization_id":"qwen","organization_name":"Alibaba Cloud / Qwen Team","organization_country":"CN","score":0.906,"normalized_score":0.906,"verified":false,"self_reported":true,"self_reported_source":"https://qwenlm.github.io/blog/qwq-32b-preview/","analysis_method":"accuracy","verification_date":null,"provider_id":null,"input_cost_per_million":null,"output_cost_per_million":null,"context_window":null,"announcement_date":"2024-11-28","param_count":32500000000,"is_open_source":true,"is_new":false,"best_latency":null,"latency_provider":null,"best_throughput":null,"throughput_provider":null,"context_provider":null},{"rank":25,"model_id":"qwq-32b","model_name":"QwQ-32B","organization_id":"qwen","organization_name":"Alibaba Cloud / Qwen Team","organization_country":"CN","score":0.906,"normalized_score":0.906,"verified":false,"self_reported":true,"self_reported_source":"https://qwen-ai.com/qwq-32b/","analysis_method":"accuracy","verification_date":null,"provider_id":null,"input_cost_per_million":null,"output_cost_per_million":null,"context_window":null,"announcement_date":"2025-03-05","param_count":32500000000,"is_open_source":true,"is_new":false,"best_latency":null,"latency_provider":null,"best_throughput":null,"throughput_provider":null,"context_provider":null},{"rank":27,"model_id":"deepseek-v3","model_name":"DeepSeek-V3","organization_id":"deepseek","organization_name":"DeepSeek","organization_country":"CN","score":0.902,"normalized_score":0.902,"verified":false,"self_reported":true,"self_reported_source":"https://github.com/deepseek-ai/DeepSeek-V3","analysis_method":"Exact Match","verification_date":null,"provider_id":null,"input_cost_per_million":null,"output_cost_per_million":null,"context_window":null,"announcement_date":"2024-12-25","param_count":671000000000,"is_open_source":true,"is_new":false,"best_latency":null,"latency_provider":null,"best_throughput":null,"throughput_provider":null,"context_provider":null},{"rank":28,"model_id":"o1-mini","model_name":"o1-mini","organization_id":"openai","organization_name":"OpenAI","organization_country":"US","score":0.9,"normalized_score":0.9,"verified":false,"self_reported":true,"self_reported_source":"https://openai.com/index/openai-o1-mini-advancing-cost-efficient-reasoning/","analysis_method":"0-shot Chain of Thought","verification_date":null,"provider_id":null,"input_cost_per_million":null,"output_cost_per_million":null,"context_window":null,"announcement_date":"2024-09-12","param_count":null,"is_open_source":false,"is_new":false,"best_latency":null,"latency_provider":null,"best_throughput":null,"throughput_provider":null,"context_provider":null},{"rank":29,"model_id":"deepseek-r1-distill-llama-8b","model_name":"DeepSeek R1 Distill Llama 8B","organization_id":"deepseek","organization_name":"DeepSeek","organization_country":"CN","score":0.891,"normalized_score":0.891,"verified":false,"self_reported":true,"self_reported_source":"https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Llama-8B","analysis_method":"Pass@1","verification_date":null,"provider_id":null,"input_cost_per_million":null,"output_cost_per_million":null,"context_window":null,"announcement_date":"2025-01-20","param_count":8030000000,"is_open_source":true,"is_new":false,"best_latency":null,"latency_provider":null,"best_throughput":null,"throughput_provider":null,"context_provider":null},{"rank":30,"model_id":"deepseek-r1-distill-qwen-1.5b","model_name":"DeepSeek R1 Distill Qwen 1.5B","organization_id":"deepseek","organization_name":"DeepSeek","organization_country":"CN","score":0.839,"normalized_score":0.839,"verified":false,"self_reported":true,"self_reported_source":"https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B","analysis_method":"Pass@1","verification_date":null,"provider_id":null,"input_cost_per_million":null,"output_cost_per_million":null,"context_window":null,"announcement_date":"2025-01-20","param_count":1780000000,"is_open_source":true,"is_new":false,"best_latency":null,"latency_provider":null,"best_throughput":null,"throughput_provider":null,"context_provider":null},{"rank":31,"model_id":"granite-3.3-8b-base","model_name":"Granite 3.3 8B Base","organization_id":"ibm","organization_name":"IBM","organization_country":"US","score":0.6902,"normalized_score":0.6902,"verified":false,"self_reported":true,"self_reported_source":"https://huggingface.co/ibm-granite/granite-3.3-8b-base","analysis_method":"Not specified","verification_date":null,"provider_id":null,"input_cost_per_million":null,"output_cost_per_million":null,"context_window":null,"announcement_date":"2025-04-16","param_count":8170000000,"is_open_source":true,"is_new":false,"best_latency":null,"latency_provider":null,"best_throughput":null,"throughput_provider":null,"context_provider":null},{"rank":31,"model_id":"granite-3.3-8b-instruct","model_name":"Granite 3.3 8B Instruct","organization_id":"ibm","organization_name":"IBM","organization_country":"US","score":0.6902,"normalized_score":0.6902,"verified":false,"self_reported":true,"self_reported_source":"https://huggingface.co/ibm-granite/granite-3.3-8b-instruct","analysis_method":"Not specified","verification_date":null,"provider_id":null,"input_cost_per_million":null,"output_cost_per_million":null,"context_window":null,"announcement_date":"2025-04-16","param_count":8000000000,"is_open_source":true,"is_new":false,"best_latency":null,"latency_provider":null,"best_throughput":null,"throughput_provider":null,"context_provider":null}]}