{"model_id":"minimax-m2","name":"MiniMax M2","organization":{"id":"minimax","name":"MiniMax","website":"https://www.minimaxi.com/"},"description":"MiniMax M2 is an open-source large language model by MiniMax, built for agents and coding tasks. It delivers state-of-the-art tool use, reasoning, and search performance while maintaining exceptional cost-efficiency and speed, priced at just 8% of Claude 3.5 Sonnet’s cost and running at nearly double its inference speed (≈100 TPS). Designed for end-to-end agentic workflows, it excels at long-chain tool calling across Shell, Browser, Python, and other MCP tools. While slightly behind top overseas models in programming, it ranks among the best domestic models and top five globally on the Artificial Analysis benchmark. M2 powers the MiniMax Agent platform, available in Lightning Mode for fast tasks and Pro Mode for complex multi-step reasoning, and its weights, API, and deployment guides are freely available on Hugging Face, vLLM, and SGLang.","release_date":"2025-10-27","announcement_date":"2025-10-27","multimodal":false,"knowledge_cutoff":null,"param_count":230000000000,"training_tokens":null,"available_in_zeroeval":false,"reviews_count":0,"reviews_avg_rating":0,"license":{"name":"MIT","allow_commercial":true},"model_family":null,"fine_tuned_from":null,"tags":{"moe":"true","thinking":"true"},"sources":{"api_ref":"https://platform.minimax.io/docs/guides/text-generation","playground":null,"paper":null,"scorecard_blog":"https://www.minimax.io/news/minimax-m2","repo":"https://github.com/MiniMax-AI/MiniMax-M2","weights":"https://huggingface.co/MiniMaxAI/MiniMax-M2"},"benchmarks":[{"benchmark_id":"aa-index","name":"AA-Index","description":"No official academic documentation found for this benchmark. Extensive research through ArXiv, IEEE/ACL/NeurIPS papers, and university research sites yielded no peer-reviewed sources for an 'aa-index' benchmark. This entry requires verification from official academic sources.","categories":["general"],"modality":"text","max_score":1.0,"score":0.61,"normalized_score":0.61,"verified":false,"self_reported":true,"self_reported_source":null,"analysis_method":null,"verification_date":null,"verification_notes":null},{"benchmark_id":"aime-2025","name":"AIME 2025","description":"All 30 problems from the 2025 American Invitational Mathematics Examination (AIME I and AIME II), testing olympiad-level mathematical reasoning with integer answers from 000-999. Used as an AI benchmark to evaluate large language models' ability to solve complex mathematical problems requiring multi-step logical deductions and structured symbolic reasoning.","categories":["math","reasoning"],"modality":"text","max_score":1.0,"score":0.78,"normalized_score":0.78,"verified":false,"self_reported":true,"self_reported_source":null,"analysis_method":null,"verification_date":null,"verification_notes":null},{"benchmark_id":"browsecomp","name":"BrowseComp","description":"BrowseComp is a benchmark comprising 1,266 questions that challenge AI agents to persistently navigate the internet in search of hard-to-find, entangled information. The benchmark measures agents' ability to exercise persistence in information gathering, demonstrate creativity in web navigation, and find concise, verifiable answers. Despite the difficulty of the questions, BrowseComp is simple and easy-to-use, as predicted answers are short and easily verifiable against reference answers.","categories":["agents","reasoning","search"],"modality":"text","max_score":1.0,"score":0.44,"normalized_score":0.44,"verified":false,"self_reported":true,"self_reported_source":null,"analysis_method":null,"verification_date":null,"verification_notes":null},{"benchmark_id":"browsecomp-zh","name":"BrowseComp-zh","description":"A high-difficulty benchmark purpose-built to comprehensively evaluate LLM agents on the Chinese web, consisting of 289 multi-hop questions spanning 11 diverse domains including Film & TV, Technology, Medicine, and History. Questions are reverse-engineered from short, objective, and easily verifiable answers, requiring sophisticated reasoning and information reconciliation beyond basic retrieval. The benchmark addresses linguistic, infrastructural, and censorship-related complexities in Chinese web environments.","categories":["reasoning","search"],"modality":"text","max_score":1.0,"score":0.485,"normalized_score":0.485,"verified":false,"self_reported":true,"self_reported_source":null,"analysis_method":null,"verification_date":null,"verification_notes":null},{"benchmark_id":"gpqa","name":"GPQA","description":"A challenging dataset of 448 multiple-choice questions written by domain experts in biology, physics, and chemistry. Questions are Google-proof and extremely difficult, with PhD experts reaching 65% accuracy.","categories":["biology","chemistry","general","physics","reasoning"],"modality":"text","max_score":1.0,"score":0.78,"normalized_score":0.78,"verified":false,"self_reported":true,"self_reported_source":null,"analysis_method":"Diamond subset","verification_date":null,"verification_notes":null},{"benchmark_id":"humanity's-last-exam","name":"Humanity's Last Exam","description":"Humanity's Last Exam (HLE) is a multi-modal academic benchmark with 2,500 questions across mathematics, humanities, and natural sciences, designed to test LLM capabilities at the frontier of human knowledge with unambiguous, verifiable solutions","categories":["math","reasoning"],"modality":"multimodal","max_score":1.0,"score":0.125,"normalized_score":0.125,"verified":false,"self_reported":true,"self_reported_source":null,"analysis_method":"without tools","verification_date":null,"verification_notes":null},{"benchmark_id":"if","name":"IF","description":"Instruction-Following Evaluation (IFEval) benchmark for large language models, focusing on verifiable instructions with 25 types of instructions and around 500 prompts containing one or more verifiable constraints","categories":["general","structured_output"],"modality":"text","max_score":1.0,"score":0.72,"normalized_score":0.72,"verified":false,"self_reported":true,"self_reported_source":null,"analysis_method":null,"verification_date":null,"verification_notes":null},{"benchmark_id":"livecodebench","name":"LiveCodeBench","description":"LiveCodeBench is a holistic and contamination-free evaluation benchmark for large language models for code. It continuously collects new problems from programming contests (LeetCode, AtCoder, CodeForces) and evaluates four different scenarios: code generation, self-repair, code execution, and test output prediction. Problems are annotated with release dates to enable evaluation on unseen problems released after a model's training cutoff.","categories":["code","general","reasoning"],"modality":"text","max_score":1.0,"score":0.83,"normalized_score":0.83,"verified":false,"self_reported":true,"self_reported_source":null,"analysis_method":null,"verification_date":null,"verification_notes":null},{"benchmark_id":"mmlu-pro","name":"MMLU-Pro","description":"A more robust and challenging multi-task language understanding benchmark that extends MMLU by expanding multiple-choice options from 4 to 10, eliminating trivial questions, and focusing on reasoning-intensive tasks. Features over 12,000 curated questions across 14 domains and causes a 16-33% accuracy drop compared to original MMLU.","categories":["general","language","math","reasoning"],"modality":"text","max_score":1.0,"score":0.82,"normalized_score":0.82,"verified":false,"self_reported":true,"self_reported_source":null,"analysis_method":null,"verification_date":null,"verification_notes":null},{"benchmark_id":"multi-swe-bench","name":"Multi-SWE-Bench","description":"A multilingual benchmark for issue resolving that evaluates Large Language Models' ability to resolve software issues across diverse programming ecosystems. Covers 7 programming languages (Java, TypeScript, JavaScript, Go, Rust, C, and C++) with 1,632 high-quality instances carefully annotated by 68 expert annotators. Addresses limitations of existing benchmarks that focus almost exclusively on Python.","categories":["code","reasoning"],"modality":"text","max_score":1.0,"score":0.362,"normalized_score":0.362,"verified":false,"self_reported":true,"self_reported_source":null,"analysis_method":null,"verification_date":null,"verification_notes":null},{"benchmark_id":"scicode","name":"SciCode","description":"SciCode is a research coding benchmark curated by scientists that challenges language models to code solutions for scientific problems. It contains 338 subproblems decomposed from 80 challenging main problems across 16 natural science sub-fields including mathematics, physics, chemistry, biology, and materials science. Problems require knowledge recall, reasoning, and code synthesis skills.","categories":["biology","chemistry","code","math","physics","reasoning"],"modality":"text","max_score":1.0,"score":0.36,"normalized_score":0.36,"verified":false,"self_reported":true,"self_reported_source":null,"analysis_method":null,"verification_date":null,"verification_notes":null},{"benchmark_id":"swe-bench-multilingual","name":"SWE-bench Multilingual","description":"A multilingual benchmark for issue resolving in software engineering that covers Java, TypeScript, JavaScript, Go, Rust, C, and C++. Contains 1,632 high-quality instances carefully annotated from 2,456 candidates by 68 expert annotators, designed to evaluate Large Language Models across diverse software ecosystems beyond Python.","categories":["code","reasoning"],"modality":"text","max_score":1.0,"score":0.565,"normalized_score":0.565,"verified":false,"self_reported":true,"self_reported_source":null,"analysis_method":null,"verification_date":null,"verification_notes":null},{"benchmark_id":"swe-bench-verified","name":"SWE-Bench Verified","description":"A verified subset of 500 software engineering problems from real GitHub issues, validated by human annotators for evaluating language models' ability to resolve real-world coding issues by generating patches for Python codebases.","categories":["code","frontend_development","reasoning"],"modality":"text","max_score":1.0,"score":0.694,"normalized_score":0.694,"verified":false,"self_reported":true,"self_reported_source":null,"analysis_method":null,"verification_date":null,"verification_notes":null},{"benchmark_id":"tau2-telecom","name":"Tau2 Telecom","description":"τ²-Bench telecom domain evaluates conversational agents in a dual-control environment modeled as a Dec-POMDP, where both agent and user use tools in shared telecommunications troubleshooting scenarios that test coordination and communication capabilities.","categories":["communication","reasoning"],"modality":"text","max_score":1.0,"score":0.87,"normalized_score":0.87,"verified":false,"self_reported":true,"self_reported_source":null,"analysis_method":null,"verification_date":null,"verification_notes":null},{"benchmark_id":"tau-bench","name":"Tau-bench","description":"τ-bench: A benchmark for tool-agent-user interaction in real-world domains. Tests language agents' ability to interact with users and follow domain-specific rules through dynamic conversations using API tools and policy guidelines across retail and airline domains. Evaluates consistency and reliability of agent behavior over multiple trials.","categories":["agents","general","reasoning","tool_calling"],"modality":"text","max_score":1.0,"score":0.772,"normalized_score":0.772,"verified":false,"self_reported":true,"self_reported_source":null,"analysis_method":null,"verification_date":null,"verification_notes":null},{"benchmark_id":"terminal-bench","name":"Terminal-Bench","description":"Terminal-Bench is a benchmark for testing AI agents in real terminal environments. It evaluates how well agents can handle real-world, end-to-end tasks autonomously, including compiling code, training models, setting up servers, system administration, security tasks, data science workflows, and cybersecurity vulnerabilities. The benchmark consists of a dataset of ~100 hand-crafted, human-verified tasks and an execution harness that connects language models to a terminal sandbox.","categories":["agents","code","reasoning"],"modality":"text","max_score":1.0,"score":0.463,"normalized_score":0.463,"verified":false,"self_reported":true,"self_reported_source":null,"analysis_method":null,"verification_date":null,"verification_notes":null}],"providers":[{"provider_id":"minimax","name":"MiniMax","website":"https://platform.minimax.io","deprecated":false,"deprecated_at":null,"pricing":{"input_per_million":0.3,"output_per_million":1.2},"quantization":null,"limits":{"max_input_tokens":1000000,"max_output_tokens":1000000},"performance":{"throughput":70.0,"latency":4.0},"features":{"web_search":null,"function_calling":null,"structured_output":null,"code_execution":null,"batch_inference":null,"finetuning":null},"modalities":{"input":{"text":true,"image":false,"audio":false,"video":false},"output":{"text":true,"image":false,"audio":false,"video":false}}},{"provider_id":"novita","name":"Novita","website":"https://novita.ai/","deprecated":false,"deprecated_at":null,"pricing":{"input_per_million":0.3,"output_per_million":1.2},"quantization":"bf16","limits":{"max_input_tokens":204800,"max_output_tokens":131072},"performance":{"throughput":null,"latency":null},"features":{"web_search":null,"function_calling":null,"structured_output":null,"code_execution":null,"batch_inference":null,"finetuning":null},"modalities":{"input":{"text":true,"image":false,"audio":false,"video":false},"output":{"text":true,"image":false,"audio":false,"video":false}}}],"benchmark_rankings":[{"benchmark_id":"aime-2025","benchmark_name":"AIME 2025","models":[{"model_id":"kimi-k2-thinking-0905","model_name":"Kimi K2-Thinking-0905","score":1.0,"rank":1,"is_current_model":false},{"model_id":"gpt-5.2-pro-2025-12-11","model_name":"GPT-5.2 Pro","score":1.0,"rank":1,"is_current_model":false},{"model_id":"grok-4-heavy","model_name":"Grok-4 Heavy","score":1.0,"rank":1,"is_current_model":false},{"model_id":"gpt-5.2-2025-12-11","model_name":"GPT-5.2","score":1.0,"rank":1,"is_current_model":false},{"model_id":"gemini-3-pro-preview","model_name":"Gemini 3 Pro","score":1.0,"rank":1,"is_current_model":false},{"model_id":"claude-opus-4-6","model_name":"Claude Opus 4.6","score":0.9979,"rank":6,"is_current_model":false},{"model_id":"gemini-3-flash-preview","model_name":"Gemini 3 Flash","score":0.997,"rank":7,"is_current_model":false},{"model_id":"longcat-flash-thinking-2601","model_name":"LongCat-Flash-Thinking-2601","score":0.996,"rank":8,"is_current_model":false},{"model_id":"gpt-5.1-high-2025-11-12","model_name":"GPT-5.1 High","score":0.996,"rank":8,"is_current_model":false},{"model_id":"nemotron-3-nano-30b-a3b","model_name":"Nemotron 3 Nano (30B A3B)","score":0.992,"rank":10,"is_current_model":false},{"model_id":"minimax-m2","model_name":"MiniMax M2","score":0.78,"rank":65,"is_current_model":true}]},{"benchmark_id":"browsecomp","benchmark_name":"BrowseComp","models":[{"model_id":"gemini-3.1-pro-preview","model_name":"Gemini 3.1 Pro","score":0.859,"rank":1,"is_current_model":false},{"model_id":"claude-opus-4-6","model_name":"Claude Opus 4.6","score":0.84,"rank":2,"is_current_model":false},{"model_id":"gpt-5.4","model_name":"GPT-5.4","score":0.827,"rank":3,"is_current_model":false},{"model_id":"gpt-5.2-pro-2025-12-11","model_name":"GPT-5.2 Pro","score":0.779,"rank":4,"is_current_model":false},{"model_id":"seed-2.0-pro","model_name":"Seed 2.0 Pro","score":0.773,"rank":5,"is_current_model":false},{"model_id":"minimax-m2.5","model_name":"MiniMax M2.5","score":0.763,"rank":6,"is_current_model":false},{"model_id":"glm-5","model_name":"GLM-5","score":0.759,"rank":7,"is_current_model":false},{"model_id":"kimi-k2.5","model_name":"Kimi K2.5","score":0.749,"rank":8,"is_current_model":false},{"model_id":"claude-sonnet-4-6","model_name":"Claude Sonnet 4.6","score":0.747,"rank":9,"is_current_model":false},{"model_id":"step-3.5-flash","model_name":"Step-3.5-Flash","score":0.69,"rank":10,"is_current_model":false},{"model_id":"qwen3.5-397b-a17b","model_name":"Qwen3.5-397B-A17B","score":0.69,"rank":10,"is_current_model":false},{"model_id":"minimax-m2","model_name":"MiniMax M2","score":0.44,"rank":28,"is_current_model":true}]},{"benchmark_id":"browsecomp-zh","benchmark_name":"BrowseComp-zh","models":[{"model_id":"qwen3.5-397b-a17b","model_name":"Qwen3.5-397B-A17B","score":0.703,"rank":1,"is_current_model":false},{"model_id":"qwen3.5-122b-a10b","model_name":"Qwen3.5-122B-A10B","score":0.699,"rank":2,"is_current_model":false},{"model_id":"qwen3.5-35b-a3b","model_name":"Qwen3.5-35B-A3B","score":0.695,"rank":3,"is_current_model":false},{"model_id":"longcat-flash-thinking-2601","model_name":"LongCat-Flash-Thinking-2601","score":0.69,"rank":4,"is_current_model":false},{"model_id":"glm-4.7","model_name":"GLM-4.7","score":0.666,"rank":5,"is_current_model":false},{"model_id":"deepseek-reasoner","model_name":"DeepSeek-V3.2 (Thinking)","score":0.65,"rank":6,"is_current_model":false},{"model_id":"kimi-k2-thinking-0905","model_name":"Kimi K2-Thinking-0905","score":0.623,"rank":7,"is_current_model":false},{"model_id":"qwen3.5-27b","model_name":"Qwen3.5-27B","score":0.621,"rank":8,"is_current_model":false},{"model_id":"deepseek-v3.1","model_name":"DeepSeek-V3.1","score":0.492,"rank":9,"is_current_model":false},{"model_id":"minimax-m2","model_name":"MiniMax M2","score":0.485,"rank":10,"is_current_model":true}]},{"benchmark_id":"gpqa","benchmark_name":"GPQA","models":[{"model_id":"gemini-3.1-pro-preview","model_name":"Gemini 3.1 Pro","score":0.943,"rank":1,"is_current_model":false},{"model_id":"gpt-5.2-pro-2025-12-11","model_name":"GPT-5.2 Pro","score":0.932,"rank":2,"is_current_model":false},{"model_id":"gpt-5.4","model_name":"GPT-5.4","score":0.928,"rank":3,"is_current_model":false},{"model_id":"gpt-5.2-2025-12-11","model_name":"GPT-5.2","score":0.924,"rank":4,"is_current_model":false},{"model_id":"gemini-3-pro-preview","model_name":"Gemini 3 Pro","score":0.919,"rank":5,"is_current_model":false},{"model_id":"claude-opus-4-6","model_name":"Claude Opus 4.6","score":0.913,"rank":6,"is_current_model":false},{"model_id":"gemini-3-flash-preview","model_name":"Gemini 3 Flash","score":0.904,"rank":7,"is_current_model":false},{"model_id":"qwen3.6-plus","model_name":"Qwen3.6 Plus","score":0.904,"rank":7,"is_current_model":false},{"model_id":"claude-sonnet-4-6","model_name":"Claude Sonnet 4.6","score":0.899,"rank":9,"is_current_model":false},{"model_id":"seed-2.0-pro","model_name":"Seed 2.0 Pro","score":0.889,"rank":10,"is_current_model":false},{"model_id":"minimax-m2","model_name":"MiniMax M2","score":0.78,"rank":65,"is_current_model":true}]},{"benchmark_id":"humanity's-last-exam","benchmark_name":"Humanity's Last Exam","models":[{"model_id":"claude-opus-4-6","model_name":"Claude Opus 4.6","score":0.531,"rank":1,"is_current_model":false},{"model_id":"gemini-3.1-pro-preview","model_name":"Gemini 3.1 Pro","score":0.514,"rank":2,"is_current_model":false},{"model_id":"kimi-k2-thinking-0905","model_name":"Kimi K2-Thinking-0905","score":0.51,"rank":3,"is_current_model":false},{"model_id":"grok-4-heavy","model_name":"Grok-4 Heavy","score":0.507,"rank":4,"is_current_model":false},{"model_id":"kimi-k2.5","model_name":"Kimi K2.5","score":0.502,"rank":5,"is_current_model":false},{"model_id":"claude-sonnet-4-6","model_name":"Claude Sonnet 4.6","score":0.49,"rank":6,"is_current_model":false},{"model_id":"qwen3.5-27b","model_name":"Qwen3.5-27B","score":0.485,"rank":7,"is_current_model":false},{"model_id":"qwen3.5-122b-a10b","model_name":"Qwen3.5-122B-A10B","score":0.475,"rank":8,"is_current_model":false},{"model_id":"qwen3.5-35b-a3b","model_name":"Qwen3.5-35B-A3B","score":0.474,"rank":9,"is_current_model":false},{"model_id":"gemini-3-pro-preview","model_name":"Gemini 3 Pro","score":0.458,"rank":10,"is_current_model":false},{"model_id":"minimax-m2","model_name":"MiniMax M2","score":0.125,"rank":48,"is_current_model":true}]},{"benchmark_id":"livecodebench","benchmark_name":"LiveCodeBench","models":[{"model_id":"deepseek-reasoner","model_name":"DeepSeek-V3.2 (Thinking)","score":0.833,"rank":1,"is_current_model":false},{"model_id":"minimax-m2","model_name":"MiniMax M2","score":0.83,"rank":2,"is_current_model":true},{"model_id":"longcat-flash-thinking-2601","model_name":"LongCat-Flash-Thinking-2601","score":0.828,"rank":3,"is_current_model":false},{"model_id":"nemotron-3-super-120b-a12b","model_name":"Nemotron 3 Super (120B A12B)","score":0.8119,"rank":4,"is_current_model":false},{"model_id":"grok-3-mini","model_name":"Grok-3 Mini","score":0.804,"rank":5,"is_current_model":false},{"model_id":"grok-4-fast","model_name":"Grok 4 Fast","score":0.8,"rank":6,"is_current_model":false},{"model_id":"longcat-flash-thinking","model_name":"LongCat-Flash-Thinking","score":0.794,"rank":7,"is_current_model":false},{"model_id":"grok-3","model_name":"Grok-3","score":0.794,"rank":7,"is_current_model":false},{"model_id":"grok-4-heavy","model_name":"Grok-4 Heavy","score":0.794,"rank":7,"is_current_model":false},{"model_id":"grok-4","model_name":"Grok-4","score":0.79,"rank":10,"is_current_model":false}]},{"benchmark_id":"mmlu-pro","benchmark_name":"MMLU-Pro","models":[{"model_id":"qwen3.6-plus","model_name":"Qwen3.6 Plus","score":0.885,"rank":1,"is_current_model":false},{"model_id":"minimax-m2.1","model_name":"MiniMax M2.1","score":0.88,"rank":2,"is_current_model":false},{"model_id":"qwen3.5-397b-a17b","model_name":"Qwen3.5-397B-A17B","score":0.878,"rank":3,"is_current_model":false},{"model_id":"kimi-k2.5","model_name":"Kimi K2.5","score":0.871,"rank":4,"is_current_model":false},{"model_id":"ernie-5.0","model_name":"ERNIE 5.0","score":0.87,"rank":5,"is_current_model":false},{"model_id":"qwen3.5-122b-a10b","model_name":"Qwen3.5-122B-A10B","score":0.867,"rank":6,"is_current_model":false},{"model_id":"qwen3.5-27b","model_name":"Qwen3.5-27B","score":0.861,"rank":7,"is_current_model":false},{"model_id":"qwen3.5-35b-a3b","model_name":"Qwen3.5-35B-A3B","score":0.853,"rank":8,"is_current_model":false},{"model_id":"gemma-4-31b-it","model_name":"Gemma 4 31B","score":0.852,"rank":9,"is_current_model":false},{"model_id":"deepseek-reasoner","model_name":"DeepSeek-V3.2 (Thinking)","score":0.85,"rank":10,"is_current_model":false},{"model_id":"deepseek-r1-0528","model_name":"DeepSeek-R1-0528","score":0.85,"rank":10,"is_current_model":false},{"model_id":"deepseek-v3.2-exp","model_name":"DeepSeek-V3.2-Exp","score":0.85,"rank":10,"is_current_model":false},{"model_id":"minimax-m2","model_name":"MiniMax M2","score":0.82,"rank":30,"is_current_model":true}]},{"benchmark_id":"swe-bench-multilingual","benchmark_name":"SWE-bench Multilingual","models":[{"model_id":"claude-opus-4-6","model_name":"Claude Opus 4.6","score":0.7783,"rank":1,"is_current_model":false},{"model_id":"minimax-m2.7","model_name":"MiniMax M2.7","score":0.765,"rank":2,"is_current_model":false},{"model_id":"qwen3.6-plus","model_name":"Qwen3.6 Plus","score":0.738,"rank":3,"is_current_model":false},{"model_id":"kimi-k2.5","model_name":"Kimi K2.5","score":0.73,"rank":4,"is_current_model":false},{"model_id":"minimax-m2.1","model_name":"MiniMax M2.1","score":0.725,"rank":5,"is_current_model":false},{"model_id":"mimo-v2-pro","model_name":"MiMo-V2-Pro","score":0.717,"rank":6,"is_current_model":false},{"model_id":"mimo-v2-flash","model_name":"MiMo-V2-Flash","score":0.717,"rank":6,"is_current_model":false},{"model_id":"deepseek-reasoner","model_name":"DeepSeek-V3.2 (Thinking)","score":0.702,"rank":8,"is_current_model":false},{"model_id":"qwen3.5-397b-a17b","model_name":"Qwen3.5-397B-A17B","score":0.693,"rank":9,"is_current_model":false},{"model_id":"glm-4.7","model_name":"GLM-4.7","score":0.667,"rank":10,"is_current_model":false},{"model_id":"minimax-m2","model_name":"MiniMax M2","score":0.565,"rank":13,"is_current_model":true}]},{"benchmark_id":"swe-bench-verified","benchmark_name":"SWE-Bench Verified","models":[{"model_id":"claude-opus-4-5-20251101","model_name":"Claude Opus 4.5","score":0.809,"rank":1,"is_current_model":false},{"model_id":"claude-opus-4-6","model_name":"Claude Opus 4.6","score":0.808,"rank":2,"is_current_model":false},{"model_id":"gemini-3.1-pro-preview","model_name":"Gemini 3.1 Pro","score":0.806,"rank":3,"is_current_model":false},{"model_id":"minimax-m2.5","model_name":"MiniMax M2.5","score":0.802,"rank":4,"is_current_model":false},{"model_id":"gpt-5.2-2025-12-11","model_name":"GPT-5.2","score":0.8,"rank":5,"is_current_model":false},{"model_id":"claude-sonnet-4-6","model_name":"Claude Sonnet 4.6","score":0.796,"rank":6,"is_current_model":false},{"model_id":"qwen3.6-plus","model_name":"Qwen3.6 Plus","score":0.788,"rank":7,"is_current_model":false},{"model_id":"gemini-3-flash-preview","model_name":"Gemini 3 Flash","score":0.78,"rank":8,"is_current_model":false},{"model_id":"mimo-v2-pro","model_name":"MiMo-V2-Pro","score":0.78,"rank":8,"is_current_model":false},{"model_id":"glm-5","model_name":"GLM-5","score":0.778,"rank":10,"is_current_model":false},{"model_id":"minimax-m2","model_name":"MiniMax M2","score":0.694,"rank":40,"is_current_model":true}]},{"benchmark_id":"tau2-telecom","benchmark_name":"Tau2 Telecom","models":[{"model_id":"longcat-flash-thinking-2601","model_name":"LongCat-Flash-Thinking-2601","score":0.993,"rank":1,"is_current_model":false},{"model_id":"claude-opus-4-6","model_name":"Claude Opus 4.6","score":0.993,"rank":1,"is_current_model":false},{"model_id":"gpt-5.4","model_name":"GPT-5.4","score":0.989,"rank":3,"is_current_model":false},{"model_id":"gpt-5.2-2025-12-11","model_name":"GPT-5.2","score":0.987,"rank":4,"is_current_model":false},{"model_id":"claude-opus-4-5-20251101","model_name":"Claude Opus 4.5","score":0.982,"rank":5,"is_current_model":false},{"model_id":"claude-sonnet-4-6","model_name":"Claude Sonnet 4.6","score":0.979,"rank":6,"is_current_model":false},{"model_id":"mimo-v2-pro","model_name":"MiMo-V2-Pro","score":0.968,"rank":7,"is_current_model":false},{"model_id":"gpt-5-2025-08-07","model_name":"GPT-5","score":0.967,"rank":8,"is_current_model":false},{"model_id":"gpt-5.1-thinking-2025-11-12","model_name":"GPT-5.1 Thinking","score":0.956,"rank":9,"is_current_model":false},{"model_id":"gpt-5.1-instant-2025-11-12","model_name":"GPT-5.1 Instant","score":0.956,"rank":9,"is_current_model":false},{"model_id":"gpt-5.1-2025-11-13","model_name":"GPT-5.1","score":0.956,"rank":9,"is_current_model":false},{"model_id":"minimax-m2","model_name":"MiniMax M2","score":0.87,"rank":14,"is_current_model":true}]},{"benchmark_id":"terminal-bench","benchmark_name":"Terminal-Bench","models":[{"model_id":"claude-sonnet-4-5-20250929","model_name":"Claude Sonnet 4.5","score":0.5,"rank":1,"is_current_model":false},{"model_id":"minimax-m2.1","model_name":"MiniMax M2.1","score":0.479,"rank":2,"is_current_model":false},{"model_id":"kimi-k2-thinking-0905","model_name":"Kimi K2-Thinking-0905","score":0.471,"rank":3,"is_current_model":false},{"model_id":"minimax-m2","model_name":"MiniMax M2","score":0.463,"rank":4,"is_current_model":true},{"model_id":"claude-opus-4-1-20250805","model_name":"Claude Opus 4.1","score":0.433,"rank":5,"is_current_model":false},{"model_id":"claude-haiku-4-5-20251001","model_name":"Claude Haiku 4.5","score":0.41,"rank":6,"is_current_model":false},{"model_id":"glm-4.6","model_name":"GLM-4.6","score":0.405,"rank":7,"is_current_model":false},{"model_id":"longcat-flash-chat","model_name":"LongCat-Flash-Chat","score":0.3951,"rank":8,"is_current_model":false},{"model_id":"claude-opus-4-20250514","model_name":"Claude Opus 4","score":0.392,"rank":9,"is_current_model":false},{"model_id":"deepseek-v3.2-exp","model_name":"DeepSeek-V3.2-Exp","score":0.377,"rank":10,"is_current_model":false}]}],"comparison_model":{"model_id":"gemini-3.1-pro-preview","name":"Gemini 3.1 Pro","organization_name":"Google","release_date":"2026-02-19","announcement_date":"2026-02-19","knowledge_cutoff":"2025-01-31","param_count":null,"multimodal":true,"license":{"name":"Proprietary","allow_commercial":false},"benchmarks":{"apex-agents":0.335,"arc-agi-v2":0.771,"browsecomp":0.859,"gdpval-aa":1317.0,"gpqa":0.943,"humanity's-last-exam":0.514,"livecodebench-pro":2887.0,"mcp-atlas":0.692,"mmmlu":0.926,"mmmu-pro":0.805,"mrcr-v2-(8-needle)":0.263,"scicode":0.59,"swe-bench-pro":0.542,"swe-bench-verified":0.806,"t2-bench":0.993,"terminal-bench-2":0.685},"provider":{"name":"Google","input_cost":2.5,"output_cost":15.0,"max_input_tokens":1048576,"max_output_tokens":65536,"modalities":{"input":{"text":false,"image":true,"audio":false,"video":false},"output":{"text":true,"image":false,"audio":false,"video":false}}}}}