{"benchmark_id":"mmlu-(cot)","name":"MMLU (CoT)","parent_benchmark":null,"categories":["math","reasoning","language","legal","finance","general","healthcare"],"modality":"text","multilingual":false,"max_score":1.0,"language":"en","description":"Chain-of-Thought variant of the Massive Multitask Language Understanding benchmark, evaluating language models across 57 tasks including elementary mathematics, US history, computer science, law, and other professional and academic subjects. This version uses chain-of-thought prompting to elicit step-by-step reasoning.","paper_link":"https://arxiv.org/abs/2009.03300","implementation_link":null,"verified":false,"created_at":"2026-05-07T16:53:24.717367+00:00","updated_at":"2026-07-05T18:27:54.364054+00:00","statistics":{"total_models":3,"average_score":0.8253333333333334,"min_score":0.73,"max_score":0.886,"score_stddev":0.08357830659527228,"verified_count":0,"self_reported_count":3},"child_benchmarks":[],"linked_dataset":null,"models":[{"rank":1,"model_id":"llama-3.1-405b-instruct","model_name":"Llama 3.1 405B Instruct","organization_id":"meta","organization_name":"Meta","organization_country":"US","score":0.886,"normalized_score":0.886,"verified":false,"self_reported":true,"self_reported_source":"https://huggingface.co/meta-llama/Meta-Llama-3.1-405B-Instruct","analysis_method":"0-shot, macro_avg/acc","verification_date":null,"provider_id":null,"input_cost_per_million":null,"output_cost_per_million":null,"context_window":null,"announcement_date":"2024-07-23","param_count":405000000000,"is_open_source":true,"is_new":false,"best_latency":null,"latency_provider":null,"best_throughput":null,"throughput_provider":null,"context_provider":null},{"rank":2,"model_id":"llama-3.1-70b-instruct","model_name":"Llama 3.1 70B Instruct","organization_id":"meta","organization_name":"Meta","organization_country":"US","score":0.86,"normalized_score":0.86,"verified":false,"self_reported":true,"self_reported_source":"https://huggingface.co/meta-llama/Meta-Llama-3.1-70B-Instruct","analysis_method":"0-shot Chain-of-Thought","verification_date":null,"provider_id":null,"input_cost_per_million":null,"output_cost_per_million":null,"context_window":null,"announcement_date":"2024-07-23","param_count":70000000000,"is_open_source":true,"is_new":false,"best_latency":null,"latency_provider":null,"best_throughput":null,"throughput_provider":null,"context_provider":null},{"rank":3,"model_id":"llama-3.1-8b-instruct","model_name":"Llama 3.1 8B Instruct","organization_id":"meta","organization_name":"Meta","organization_country":"US","score":0.73,"normalized_score":0.73,"verified":false,"self_reported":true,"self_reported_source":"https://huggingface.co/meta-llama/Meta-Llama-3.1-8B-Instruct","analysis_method":"0-shot","verification_date":null,"provider_id":null,"input_cost_per_million":null,"output_cost_per_million":null,"context_window":null,"announcement_date":"2024-07-23","param_count":8000000000,"is_open_source":true,"is_new":false,"best_latency":null,"latency_provider":null,"best_throughput":null,"throughput_provider":null,"context_provider":null}]}