{"model_id":"qwen3-vl-4b-instruct","name":"Qwen3 VL 4B Instruct","organization":{"id":"qwen","name":"Alibaba Cloud / Qwen Team","website":"https://qwenlm.github.io"},"description":"Qwen3-VL is a large multimodal model that unifies vision, language, and reasoning to achieve human-level perception and cognition across text, images, and video. Built on a 235B-parameter architecture, it integrates early joint training of visual and textual modalities for strong language grounding. The model supports up to a 1 million-token context window and excels at visual understanding, spatial reasoning, long video comprehension, and tool-based interaction. It can generate code from images, perform precise 2D/3D object grounding, and operate digital interfaces like a visual agent. The “Instruct” version rivals Gemini 2.5 Pro in perception benchmarks, while the “Thinking” version leads in multimodal reasoning and STEM tasks. With multilingual OCR, creative writing, and fine-grained scene interpretation, Qwen3-VL establishes a new open-source frontier for integrated vision-language intelligence.","release_date":"2025-09-22","announcement_date":"2025-09-22","multimodal":true,"knowledge_cutoff":null,"param_count":4000000000,"training_tokens":null,"available_in_zeroeval":true,"reviews_count":0,"reviews_avg_rating":0,"license":{"name":"Apache 2.0","allow_commercial":true},"model_family":null,"fine_tuned_from":null,"tags":{"moe":"false","tuning":"instruct","thinking":"false"},"sources":{"api_ref":"https://help.aliyun.com/zh/model-studio/use-qwen-by-calling-api","playground":"https://chat.qwen.ai/","paper":null,"scorecard_blog":"https://qwen.ai/blog?id=99f0335c4ad9ff6153e517418d48535ab6d8afef&from=research.latest-advancements-list","repo":"https://github.com/QwenLM/Qwen3-VL","weights":"https://huggingface.co/Qwen/Qwen3-VL-4B-Instruct"},"benchmarks":[{"benchmark_id":"ai2d","name":"AI2D","description":"AI2D is a dataset of 4,903 illustrative diagrams from grade school natural sciences (such 
as food webs, human physiology, and life cycles) with over 15,000 multiple choice questions and answers. The benchmark evaluates diagram understanding and visual reasoning capabilities, requiring models to interpret diagrammatic elements, relationships, and structure to answer questions about scientific concepts represented in visual form.","categories":["multimodal","reasoning","vision"],"modality":"multimodal","max_score":1.0,"score":0.841,"normalized_score":0.841,"verified":false,"self_reported":true,"self_reported_source":null,"analysis_method":"TEST","verification_date":null,"verification_notes":null},{"benchmark_id":"aime-2025","name":"AIME 2025","description":"All 30 problems from the 2025 American Invitational Mathematics Examination (AIME I and AIME II), testing olympiad-level mathematical reasoning with integer answers from 000-999. Used as an AI benchmark to evaluate large language models' ability to solve complex mathematical problems requiring multi-step logical deductions and structured symbolic reasoning.","categories":["math","reasoning"],"modality":"text","max_score":1.0,"score":0.466,"normalized_score":0.466,"verified":false,"self_reported":true,"self_reported_source":null,"analysis_method":null,"verification_date":null,"verification_notes":null},{"benchmark_id":"bfcl-v3","name":"BFCL-v3","description":"Berkeley Function Calling Leaderboard v3 (BFCL-v3) is an advanced benchmark that evaluates large language models' function calling capabilities through multi-turn and multi-step interactions. It introduces extended conversational exchanges where models must retain contextual information across turns and execute multiple internal function calls for complex user requests. 
The benchmark includes 1000 test cases across domains like vehicle control, trading bots, travel booking, and file system management, using state-based evaluation to verify both system state changes and execution path correctness.","categories":["agents","finance","general","reasoning","structured_output","tool_calling"],"modality":"text","max_score":1.0,"score":0.633,"normalized_score":0.633,"verified":false,"self_reported":true,"self_reported_source":null,"analysis_method":null,"verification_date":null,"verification_notes":null},{"benchmark_id":"blink","name":"BLINK","description":"BLINK: Multimodal Large Language Models Can See but Not Perceive. A benchmark for multimodal language models focusing on core visual perception abilities. Reformats 14 classic computer vision tasks into 3,807 multiple-choice questions paired with single or multiple images and visual prompting. Tasks include relative depth estimation, visual correspondence, forensics detection, multi-view reasoning, counting, object localization, and spatial reasoning that humans can solve 'within a blink'.","categories":["3d","multimodal","reasoning","spatial_reasoning","vision"],"modality":"multimodal","max_score":1.0,"score":0.658,"normalized_score":0.658,"verified":false,"self_reported":true,"self_reported_source":null,"analysis_method":null,"verification_date":null,"verification_notes":null},{"benchmark_id":"cc-ocr","name":"CC-OCR","description":"A comprehensive OCR benchmark for evaluating Large Multimodal Models (LMMs) in literacy. Comprises four OCR-centric tracks: multi-scene text reading, multilingual text reading, document parsing, and key information extraction. Contains 39 subsets with 7,058 fully annotated images, 41% sourced from real applications. 
Tests capabilities including text grounding, multi-orientation text recognition, and detecting hallucination/repetition across diverse visual challenges.","categories":["multimodal","structured_output","text-to-image","vision"],"modality":"multimodal","max_score":1.0,"score":0.762,"normalized_score":0.762,"verified":false,"self_reported":true,"self_reported_source":null,"analysis_method":"overall","verification_date":null,"verification_notes":null},{"benchmark_id":"charadessta","name":"CharadesSTA","description":"Charades-STA is a benchmark dataset for temporal activity localization via language queries, extending the Charades dataset with sentence temporal annotations. It contains 12,408 training and 3,720 testing segment-sentence pairs from videos with natural language descriptions and precise temporal boundaries for localizing activities based on language queries.","categories":["language","multimodal","video"],"modality":"multimodal","max_score":1.0,"score":0.555,"normalized_score":0.555,"verified":false,"self_reported":true,"self_reported_source":null,"analysis_method":null,"verification_date":null,"verification_notes":null},{"benchmark_id":"charxiv-d","name":"CharXiv-D","description":"CharXiv-D is the descriptive questions subset of the CharXiv benchmark, designed to assess multimodal large language models' ability to extract basic information from scientific charts. 
It contains descriptive questions covering information extraction, enumeration, pattern recognition, and counting across 2,323 diverse charts from arXiv papers, all curated and verified by human experts.","categories":["multimodal","reasoning","structured_output","vision"],"modality":"multimodal","max_score":1.0,"score":0.762,"normalized_score":0.762,"verified":false,"self_reported":true,"self_reported_source":null,"analysis_method":"DQ","verification_date":null,"verification_notes":null},{"benchmark_id":"charxiv-r","name":"CharXiv-R","description":"CharXiv-R is the reasoning component of the CharXiv benchmark, focusing on complex reasoning questions that require synthesizing information across visual chart elements. It evaluates multimodal large language models on their ability to understand and reason about scientific charts from arXiv papers through various reasoning tasks.","categories":["multimodal","reasoning","vision"],"modality":"multimodal","max_score":1.0,"score":0.397,"normalized_score":0.397,"verified":false,"self_reported":true,"self_reported_source":null,"analysis_method":"RQ","verification_date":null,"verification_notes":null},{"benchmark_id":"docvqatest","name":"DocVQAtest","description":"DocVQA is a Visual Question Answering benchmark on document images containing 50,000 questions defined on 12,000+ document images. 
The benchmark focuses on understanding document structure and content to answer questions about various document types including letters, memos, notes, and reports from the UCSF Industry Documents Library.","categories":["multimodal","vision"],"modality":"multimodal","max_score":1.0,"score":0.953,"normalized_score":0.953,"verified":false,"self_reported":true,"self_reported_source":null,"analysis_method":null,"verification_date":null,"verification_notes":null},{"benchmark_id":"erqa","name":"ERQA","description":"Embodied Reasoning Question Answering benchmark consisting of 400 multiple-choice visual questions across spatial reasoning, trajectory reasoning, action reasoning, state estimation, and multi-view reasoning for evaluating AI capabilities in physical world interactions","categories":["reasoning","spatial_reasoning","vision"],"modality":"multimodal","max_score":1.0,"score":0.413,"normalized_score":0.413,"verified":false,"self_reported":true,"self_reported_source":null,"analysis_method":null,"verification_date":null,"verification_notes":null},{"benchmark_id":"hallusion-bench","name":"Hallusion Bench","description":"A comprehensive benchmark designed to evaluate image-context reasoning in large visual-language models (LVLMs) by challenging models with 346 images and 1,129 carefully crafted questions to assess language hallucination and visual illusion","categories":["reasoning","vision"],"modality":"multimodal","max_score":1.0,"score":0.576,"normalized_score":0.576,"verified":false,"self_reported":true,"self_reported_source":null,"analysis_method":null,"verification_date":null,"verification_notes":null},{"benchmark_id":"hmmt25","name":"HMMT25","description":"Harvard-MIT Mathematics Tournament 2025 - A prestigious student-organized mathematics competition for high school students featuring two tournaments (November 2025 at MIT and February 2026 at Harvard) with individual tests, team rounds, and guts 
rounds","categories":["math"],"modality":"text","max_score":1.0,"score":0.307,"normalized_score":0.307,"verified":false,"self_reported":true,"self_reported_source":null,"analysis_method":null,"verification_date":null,"verification_notes":null},{"benchmark_id":"ifeval","name":"IFEval","description":"Instruction-Following Evaluation (IFEval) benchmark for large language models, focusing on verifiable instructions with 25 types of instructions and around 500 prompts containing one or more verifiable constraints","categories":["general","instruction_following","structured_output"],"modality":"text","max_score":1.0,"score":0.823,"normalized_score":0.823,"verified":false,"self_reported":true,"self_reported_source":null,"analysis_method":null,"verification_date":null,"verification_notes":null},{"benchmark_id":"include","name":"Include","description":"Include benchmark - specific documentation not found in official sources","categories":["general"],"modality":"text","max_score":1.0,"score":0.614,"normalized_score":0.614,"verified":false,"self_reported":true,"self_reported_source":null,"analysis_method":null,"verification_date":null,"verification_notes":null},{"benchmark_id":"infovqatest","name":"InfoVQAtest","description":"InfoVQA test set with infographic images requiring joint reasoning over document layout, textual content, graphical elements, and data visualizations with elementary reasoning and arithmetic skills","categories":["multimodal","vision"],"modality":"multimodal","max_score":1.0,"score":0.803,"normalized_score":0.803,"verified":false,"self_reported":true,"self_reported_source":null,"analysis_method":null,"verification_date":null,"verification_notes":null},{"benchmark_id":"livebench-20241125","name":"LiveBench 20241125","description":"LiveBench is a challenging, contamination-limited LLM benchmark that addresses test set contamination by releasing new questions monthly based on recently-released datasets, arXiv papers, news articles, and IMDb movie synopses. 
It comprises tasks across math, coding, reasoning, language, instruction following, and data analysis with verifiable, objective ground-truth answers.","categories":["general","math","reasoning"],"modality":"text","max_score":1.0,"score":0.609,"normalized_score":0.609,"verified":false,"self_reported":true,"self_reported_source":null,"analysis_method":null,"verification_date":null,"verification_notes":null},{"benchmark_id":"livecodebench-v6","name":"LiveCodeBench v6","description":"LiveCodeBench is a holistic and contamination-free evaluation benchmark for large language models for code. It continuously collects new problems from programming contests (LeetCode, AtCoder, CodeForces) and evaluates four different scenarios: code generation, self-repair, code execution, and test output prediction. Problems are annotated with release dates to enable evaluation on unseen problems released after a model's training cutoff.","categories":["general","reasoning"],"modality":"text","max_score":1.0,"score":0.379,"normalized_score":0.379,"verified":false,"self_reported":true,"self_reported_source":null,"analysis_method":"25.02-25.05","verification_date":null,"verification_notes":null},{"benchmark_id":"lvbench","name":"LVBench","description":"LVBench is an extreme long video understanding benchmark designed to evaluate multimodal models on videos up to two hours in duration. It contains 6 major categories and 21 subcategories, with videos averaging five times longer than existing datasets. 
The benchmark addresses applications requiring comprehension of extremely long videos.","categories":["long_context","multimodal","vision"],"modality":"multimodal","max_score":1.0,"score":0.562,"normalized_score":0.562,"verified":false,"self_reported":true,"self_reported_source":null,"analysis_method":null,"verification_date":null,"verification_notes":null},{"benchmark_id":"mathvision","name":"MathVision","description":"MATH-Vision is a dataset designed to measure multimodal mathematical reasoning capabilities. It focuses on evaluating how well models can solve mathematical problems that require both visual understanding and mathematical reasoning, bridging the gap between visual and mathematical domains.","categories":["math","multimodal","vision"],"modality":"multimodal","max_score":1.0,"score":0.516,"normalized_score":0.516,"verified":false,"self_reported":true,"self_reported_source":null,"analysis_method":null,"verification_date":null,"verification_notes":null},{"benchmark_id":"mathvista-mini","name":"MathVista-Mini","description":"MathVista-Mini is a smaller version of the MathVista benchmark that evaluates mathematical reasoning in visual contexts. 
It consists of examples derived from multimodal datasets involving mathematics, combining challenges from diverse mathematical and visual tasks to assess foundation models' ability to solve problems requiring both visual understanding and mathematical reasoning.","categories":["math","multimodal","vision"],"modality":"multimodal","max_score":1.0,"score":0.737,"normalized_score":0.737,"verified":false,"self_reported":true,"self_reported_source":null,"analysis_method":null,"verification_date":null,"verification_notes":null},{"benchmark_id":"mlvu-m","name":"MLVU-M","description":"MLVU-M benchmark","categories":["general"],"modality":"text","max_score":1.0,"score":0.753,"normalized_score":0.753,"verified":false,"self_reported":true,"self_reported_source":null,"analysis_method":"MCQ","verification_date":null,"verification_notes":null},{"benchmark_id":"mmbench-v1.1","name":"MMBench-V1.1","description":"Version 1.1 of MMBench, an improved bilingual benchmark for assessing multi-modal capabilities of vision-language models through multiple-choice questions in both English and Chinese, providing systematic evaluation across diverse vision-language tasks.","categories":["multimodal","reasoning","vision"],"modality":"multimodal","max_score":1.0,"score":0.851,"normalized_score":0.851,"verified":false,"self_reported":true,"self_reported_source":null,"analysis_method":"en, V1.1","verification_date":null,"verification_notes":null},{"benchmark_id":"mmlu","name":"MMLU","description":"Massive Multitask Language Understanding benchmark testing knowledge across 57 diverse subjects including STEM, humanities, social sciences, and professional domains","categories":["general","language","math","reasoning"],"modality":"text","max_score":1.0,"score":0.772,"normalized_score":0.772,"verified":false,"self_reported":true,"self_reported_source":null,"analysis_method":null,"verification_date":null,"verification_notes":null},{"benchmark_id":"mmlu-pro","name":"MMLU-Pro","description":"A more 
robust and challenging multi-task language understanding benchmark that extends MMLU by expanding multiple-choice options from 4 to 10, eliminating trivial questions, and focusing on reasoning-intensive tasks. Features over 12,000 curated questions across 14 domains and causes a 16-33% accuracy drop compared to original MMLU.","categories":["general","language","math","reasoning"],"modality":"text","max_score":1.0,"score":0.671,"normalized_score":0.671,"verified":false,"self_reported":true,"self_reported_source":null,"analysis_method":null,"verification_date":null,"verification_notes":null},{"benchmark_id":"mmlu-prox","name":"MMLU-ProX","description":"Extended version of MMLU-Pro providing additional challenging multiple-choice questions for evaluating language models across diverse academic and professional domains. Built on the foundation of the Massive Multitask Language Understanding benchmark framework.","categories":["general","language","math","reasoning"],"modality":"text","max_score":1.0,"score":0.594,"normalized_score":0.594,"verified":false,"self_reported":true,"self_reported_source":null,"analysis_method":null,"verification_date":null,"verification_notes":null},{"benchmark_id":"mmlu-redux","name":"MMLU-Redux","description":"An improved version of the MMLU benchmark featuring manually re-annotated questions to identify and correct errors in the original dataset. 
Provides more reliable evaluation metrics for language models by addressing dataset quality issues found in the original MMLU.","categories":["general","language","math","reasoning"],"modality":"text","max_score":1.0,"score":0.815,"normalized_score":0.815,"verified":false,"self_reported":true,"self_reported_source":null,"analysis_method":null,"verification_date":null,"verification_notes":null},{"benchmark_id":"mm-mt-bench","name":"MM-MT-Bench","description":"A multi-turn LLM-as-a-judge evaluation benchmark for testing multimodal instruction-tuned models' ability to follow user instructions in multi-turn dialogues and answer open-ended questions in a zero-shot manner.","categories":["communication","multimodal"],"modality":"multimodal","max_score":10.0,"score":7.5,"normalized_score":0.75,"verified":false,"self_reported":true,"self_reported_source":null,"analysis_method":null,"verification_date":null,"verification_notes":null},{"benchmark_id":"mmmu-pro","name":"MMMU-Pro","description":"A more robust multi-discipline multimodal understanding benchmark that enhances MMMU through a three-step process: filtering text-only answerable questions, augmenting candidate options, and introducing vision-only input settings. Achieves significantly lower model performance (16.8-26.9%) compared to original MMMU, providing more rigorous evaluation that closely mimics real-world scenarios.","categories":["general","multimodal","reasoning","vision"],"modality":"multimodal","max_score":1.0,"score":0.532,"normalized_score":0.532,"verified":false,"self_reported":true,"self_reported_source":null,"analysis_method":"full","verification_date":null,"verification_notes":null},{"benchmark_id":"mmmu-(val)","name":"MMMU (val)","description":"Validation set of the Massive Multi-discipline Multimodal Understanding and Reasoning benchmark. 
Features college-level multimodal questions across 6 core disciplines (Art & Design, Business, Science, Health & Medicine, Humanities & Social Science, Tech & Engineering) spanning 30 subjects and 183 subfields with diverse image types including charts, diagrams, maps, and tables.","categories":["general","healthcare","multimodal","reasoning","vision"],"modality":"multimodal","max_score":1.0,"score":0.674,"normalized_score":0.674,"verified":false,"self_reported":true,"self_reported_source":null,"analysis_method":"AI","verification_date":null,"verification_notes":null},{"benchmark_id":"mmstar","name":"MMStar","description":"MMStar is an elite vision-indispensable multimodal benchmark comprising 1,500 challenge samples meticulously selected by humans to evaluate 6 core capabilities and 18 detailed axes. The benchmark addresses issues of visual content unnecessity and unintentional data leakage in existing multimodal evaluations.","categories":["general","multimodal","reasoning","vision"],"modality":"multimodal","max_score":1.0,"score":0.698,"normalized_score":0.698,"verified":false,"self_reported":true,"self_reported_source":null,"analysis_method":null,"verification_date":null,"verification_notes":null},{"benchmark_id":"muirbench","name":"MuirBench","description":"A comprehensive benchmark for robust multi-image understanding capabilities of multimodal LLMs. Consists of 12 diverse multi-image tasks involving 10 categories of multi-image relations (e.g., multiview, temporal relations, narrative, complementary). 
Comprises 11,264 images and 2,600 multiple-choice questions created in a pairwise manner, where each standard instance is paired with an unanswerable variant for reliable assessment.","categories":["multimodal","reasoning","vision"],"modality":"multimodal","max_score":1.0,"score":0.638,"normalized_score":0.638,"verified":false,"self_reported":true,"self_reported_source":null,"analysis_method":null,"verification_date":null,"verification_notes":null},{"benchmark_id":"mvbench","name":"MVBench","description":"A comprehensive multi-modal video understanding benchmark covering 20 challenging video tasks that require temporal understanding beyond single-frame analysis. Tasks span from perception to cognition, including action recognition, temporal reasoning, spatial reasoning, object interaction, scene transition, and counterfactual inference. Uses a novel static-to-dynamic method to systematically generate video tasks from existing annotations.","categories":["multimodal","reasoning","spatial_reasoning","video","vision"],"modality":"multimodal","max_score":1.0,"score":0.689,"normalized_score":0.689,"verified":false,"self_reported":true,"self_reported_source":null,"analysis_method":null,"verification_date":null,"verification_notes":null},{"benchmark_id":"ocrbench","name":"OCRBench","description":"OCRBench: Comprehensive evaluation benchmark for assessing Optical Character Recognition (OCR) capabilities in Large Multimodal Models across text recognition, scene text VQA, and document understanding tasks","categories":["image_to_text","vision"],"modality":"multimodal","max_score":1000.0,"score":881.0,"normalized_score":0.881,"verified":false,"self_reported":true,"self_reported_source":null,"analysis_method":null,"verification_date":null,"verification_notes":null},{"benchmark_id":"ocrbench-v2-(en)","name":"OCRBench-V2 (en)","description":"OCRBench v2 English subset: Enhanced benchmark for evaluating Large Multimodal Models on visual text localization and reasoning with English 
text content","categories":["image_to_text","vision"],"modality":"multimodal","max_score":1.0,"score":0.637,"normalized_score":0.637,"verified":false,"self_reported":true,"self_reported_source":null,"analysis_method":"en/zh - using en value","verification_date":null,"verification_notes":null},{"benchmark_id":"ocrbench-v2-(zh)","name":"OCRBench-V2 (zh)","description":"OCRBench v2 Chinese subset: Enhanced benchmark for evaluating Large Multimodal Models on visual text localization and reasoning with Chinese text content","categories":["image_to_text","vision"],"modality":"multimodal","max_score":1.0,"score":0.576,"normalized_score":0.576,"verified":false,"self_reported":true,"self_reported_source":null,"analysis_method":"zh","verification_date":null,"verification_notes":null},{"benchmark_id":"odinw","name":"ODinW","description":"Object Detection in the Wild (ODinW) benchmark for evaluating object detection models' task-level transfer ability across diverse real-world datasets in terms of prediction accuracy and adaptation efficiency","categories":["vision"],"modality":"image","max_score":1.0,"score":0.482,"normalized_score":0.482,"verified":false,"self_reported":true,"self_reported_source":null,"analysis_method":"13","verification_date":null,"verification_notes":null},{"benchmark_id":"osworld","name":"OSWorld","description":"OSWorld: The first-of-its-kind scalable, real computer environment for multimodal agents, supporting task setup, execution-based evaluation, and interactive learning across Ubuntu, Windows, and macOS with 369 computer tasks involving real web and desktop applications, OS file I/O, and multi-application 
workflows","categories":["agents","general","multimodal","vision"],"modality":"multimodal","max_score":1.0,"score":0.262,"normalized_score":0.262,"verified":false,"self_reported":true,"self_reported_source":null,"analysis_method":null,"verification_date":null,"verification_notes":null},{"benchmark_id":"polymath","name":"PolyMATH","description":"Polymath is a challenging multi-modal mathematical reasoning benchmark designed to evaluate the general cognitive reasoning abilities of Multi-modal Large Language Models (MLLMs). The benchmark comprises 5,000 manually collected high-quality images of cognitive textual and visual challenges across 10 distinct categories, including pattern recognition, spatial reasoning, and relative reasoning.","categories":["math","multimodal","reasoning","spatial_reasoning","vision"],"modality":"multimodal","max_score":1.0,"score":0.288,"normalized_score":0.288,"verified":false,"self_reported":true,"self_reported_source":null,"analysis_method":null,"verification_date":null,"verification_notes":null},{"benchmark_id":"realworldqa","name":"RealWorldQA","description":"RealWorldQA is a benchmark designed to evaluate basic real-world spatial understanding capabilities of multimodal models. The initial release consists of over 700 anonymized images taken from vehicles and other real-world scenarios, each accompanied by a question and easily verifiable answer. 
Released by xAI as part of their Grok-1.5 Vision preview to test models' ability to understand natural scenes and spatial relationships in everyday visual contexts.","categories":["spatial_reasoning","vision"],"modality":"multimodal","max_score":1.0,"score":0.709,"normalized_score":0.709,"verified":false,"self_reported":true,"self_reported_source":null,"analysis_method":null,"verification_date":null,"verification_notes":null},{"benchmark_id":"screenspot","name":"ScreenSpot","description":"ScreenSpot is the first realistic GUI grounding benchmark that encompasses mobile, desktop, and web environments. The dataset comprises over 1,200 instructions from iOS, Android, macOS, Windows and Web environments, along with annotated element types (text and icon/widget), designed to evaluate visual GUI agents' ability to accurately locate screen elements based on natural language instructions.","categories":["grounding","multimodal","spatial_reasoning","vision"],"modality":"multimodal","max_score":1.0,"score":0.94,"normalized_score":0.94,"verified":false,"self_reported":true,"self_reported_source":null,"analysis_method":null,"verification_date":null,"verification_notes":null},{"benchmark_id":"screenspot-pro","name":"ScreenSpot Pro","description":"ScreenSpot-Pro is a novel GUI grounding benchmark designed to rigorously evaluate the grounding capabilities of multimodal large language models (MLLMs) in professional high-resolution computing environments. The benchmark comprises 1,581 instructions across 23 applications spanning 5 industries and 3 operating systems, featuring authentic high-resolution images from professional domains with expert annotations. 
Unlike previous benchmarks that focus on cropped screenshots in consumer applications, ScreenSpot-Pro addresses the complexity and diversity of real-world professional software scenarios, revealing significant performance gaps in current MLLM GUI perception capabilities.","categories":["grounding","multimodal","spatial_reasoning","vision"],"modality":"multimodal","max_score":1.0,"score":0.595,"normalized_score":0.595,"verified":false,"self_reported":true,"self_reported_source":null,"analysis_method":null,"verification_date":null,"verification_notes":null},{"benchmark_id":"simpleqa","name":"SimpleQA","description":"SimpleQA is a factuality benchmark developed by OpenAI that measures the short-form factual accuracy of large language models. The benchmark contains 4,326 short, fact-seeking questions that are adversarially collected and designed to have single, indisputable answers. Questions cover diverse topics from science and technology to entertainment, and the benchmark also measures model calibration by evaluating whether models know what they know.","categories":["factuality","general","reasoning"],"modality":"text","max_score":1.0,"score":0.48,"normalized_score":0.48,"verified":false,"self_reported":true,"self_reported_source":null,"analysis_method":"VQA","verification_date":null,"verification_notes":null},{"benchmark_id":"supergpqa","name":"SuperGPQA","description":"SuperGPQA is a comprehensive benchmark that evaluates large language models across 285 graduate-level academic disciplines. The benchmark contains 25,957 questions covering 13 broad disciplinary areas including Engineering, Medicine, Science, and Law, with specialized fields in light industry, agriculture, and service-oriented domains. 
It employs a Human-LLM collaborative filtering mechanism with over 80 expert annotators to create challenging questions that assess graduate-level knowledge and reasoning capabilities.","categories":["chemistry","economics","finance","general","healthcare","legal","math","physics","reasoning"],"modality":"text","max_score":1.0,"score":0.403,"normalized_score":0.403,"verified":false,"self_reported":true,"self_reported_source":null,"analysis_method":null,"verification_date":null,"verification_notes":null},{"benchmark_id":"videommmu","name":"VideoMMMU","description":"Video-MMMU evaluates Large Multimodal Models' ability to acquire knowledge from expert-level professional videos across six disciplines through three cognitive stages: perception, comprehension, and adaptation. Contains 300 videos and 900 human-annotated questions spanning Art, Business, Science, Medicine, Humanities, and Engineering.","categories":["healthcare","multimodal","reasoning","vision"],"modality":"multimodal","max_score":1.0,"score":0.562,"normalized_score":0.562,"verified":false,"self_reported":true,"self_reported_source":null,"analysis_method":null,"verification_date":null,"verification_notes":null},{"benchmark_id":"writingbench","name":"WritingBench","description":"A comprehensive benchmark for evaluating large language models' generative writing capabilities across 6 core writing domains (Academic & Engineering, Finance & Business, Politics & Law, Literature & Art, Education, Advertising & Marketing) and 100 subdomains. 
Contains 1,239 queries with a query-dependent evaluation framework that dynamically generates 5 instance-specific assessment criteria for each writing task, using a fine-tuned critic model to score responses on style, format, and length dimensions.","categories":["communication","creativity","finance","legal","writing"],"modality":"text","max_score":1.0,"score":0.825,"normalized_score":0.825,"verified":false,"self_reported":true,"self_reported_source":null,"analysis_method":null,"verification_date":null,"verification_notes":null}],"providers":[{"provider_id":"deepinfra","name":"DeepInfra","website":"https://deepinfra.com/","deprecated":false,"deprecated_at":null,"pricing":{"input_per_million":0.1,"output_per_million":0.6},"quantization":"fp8","limits":{"max_input_tokens":262144,"max_output_tokens":262144},"performance":{"throughput":null,"latency":null},"features":{"web_search":null,"function_calling":null,"structured_output":null,"code_execution":null,"batch_inference":null,"finetuning":null},"modalities":{"input":{"text":true,"image":true,"audio":false,"video":false},"output":{"text":true,"image":false,"audio":false,"video":false}}}],"benchmark_rankings":[{"benchmark_id":"ai2d","benchmark_name":"AI2D","models":[{"model_id":"claude-3-5-sonnet-20241022","model_name":"Claude 3.5 Sonnet","score":0.947,"rank":1,"is_current_model":false},{"model_id":"qwen3.6-plus","model_name":"Qwen3.6 Plus","score":0.944,"rank":2,"is_current_model":false},{"model_id":"gpt-4o-2024-08-06","model_name":"GPT-4o","score":0.942,"rank":3,"is_current_model":false},{"model_id":"pixtral-large","model_name":"Pixtral Large","score":0.938,"rank":4,"is_current_model":false},{"model_id":"qwen3.5-122b-a10b","model_name":"Qwen3.5-122B-A10B","score":0.933,"rank":5,"is_current_model":false},{"model_id":"mistral-small-3.2-24b-instruct-2506","model_name":"Mistral Small 3.2 24B 
Instruct","score":0.9291,"rank":6,"is_current_model":false},{"model_id":"qwen3.5-27b","model_name":"Qwen3.5-27B","score":0.929,"rank":7,"is_current_model":false},{"model_id":"qwen3.5-35b-a3b","model_name":"Qwen3.5-35B-A3B","score":0.926,"rank":8,"is_current_model":false},{"model_id":"llama-3.2-90b-instruct","model_name":"Llama 3.2 90B Instruct","score":0.923,"rank":9,"is_current_model":false},{"model_id":"llama-3.2-11b-instruct","model_name":"Llama 3.2 11B Instruct","score":0.911,"rank":10,"is_current_model":false},{"model_id":"qwen3-vl-4b-instruct","model_name":"Qwen3 VL 4B Instruct","score":0.841,"rank":24,"is_current_model":true}]},{"benchmark_id":"aime-2025","benchmark_name":"AIME 2025","models":[{"model_id":"gemini-3-pro-preview","model_name":"Gemini 3 Pro","score":1.0,"rank":1,"is_current_model":false},{"model_id":"grok-4-heavy","model_name":"Grok-4 Heavy","score":1.0,"rank":1,"is_current_model":false},{"model_id":"kimi-k2-thinking-0905","model_name":"Kimi K2-Thinking-0905","score":1.0,"rank":1,"is_current_model":false},{"model_id":"gpt-5.2-pro-2025-12-11","model_name":"GPT-5.2 Pro","score":1.0,"rank":1,"is_current_model":false},{"model_id":"gpt-5.2-2025-12-11","model_name":"GPT-5.2","score":1.0,"rank":1,"is_current_model":false},{"model_id":"claude-opus-4-6","model_name":"Claude Opus 4.6","score":0.9979,"rank":6,"is_current_model":false},{"model_id":"gemini-3-flash-preview","model_name":"Gemini 3 Flash","score":0.997,"rank":7,"is_current_model":false},{"model_id":"gpt-5.1-high-2025-11-12","model_name":"GPT-5.1 High","score":0.996,"rank":8,"is_current_model":false},{"model_id":"longcat-flash-thinking-2601","model_name":"LongCat-Flash-Thinking-2601","score":0.996,"rank":8,"is_current_model":false},{"model_id":"nemotron-3-nano-30b-a3b","model_name":"Nemotron 3 Nano (30B A3B)","score":0.992,"rank":10,"is_current_model":false},{"model_id":"qwen3-vl-4b-instruct","model_name":"Qwen3 VL 4B 
Instruct","score":0.466,"rank":96,"is_current_model":true}]},{"benchmark_id":"bfcl-v3","benchmark_name":"BFCL-v3","models":[{"model_id":"glm-4.5","model_name":"GLM-4.5","score":0.778,"rank":1,"is_current_model":false},{"model_id":"glm-4.5-air","model_name":"GLM-4.5-Air","score":0.764,"rank":2,"is_current_model":false},{"model_id":"longcat-flash-thinking","model_name":"LongCat-Flash-Thinking","score":0.744,"rank":3,"is_current_model":false},{"model_id":"qwen3-next-80b-a3b-thinking","model_name":"Qwen3-Next-80B-A3B-Thinking","score":0.72,"rank":4,"is_current_model":false},{"model_id":"qwen3-vl-235b-a22b-thinking","model_name":"Qwen3 VL 235B A22B Thinking","score":0.719,"rank":5,"is_current_model":false},{"model_id":"qwen3-235b-a22b-thinking-2507","model_name":"Qwen3-235B-A22B-Thinking-2507","score":0.719,"rank":5,"is_current_model":false},{"model_id":"qwen3-vl-32b-thinking","model_name":"Qwen3 VL 32B Thinking","score":0.717,"rank":7,"is_current_model":false},{"model_id":"qwen3-235b-a22b-instruct-2507","model_name":"Qwen3-235B-A22B-Instruct-2507","score":0.709,"rank":8,"is_current_model":false},{"model_id":"qwen3-next-80b-a3b-instruct","model_name":"Qwen3-Next-80B-A3B-Instruct","score":0.703,"rank":9,"is_current_model":false},{"model_id":"qwen3-vl-32b-instruct","model_name":"Qwen3 VL 32B Instruct","score":0.702,"rank":10,"is_current_model":false},{"model_id":"qwen3-vl-4b-instruct","model_name":"Qwen3 VL 4B Instruct","score":0.633,"rank":17,"is_current_model":true}]},{"benchmark_id":"blink","benchmark_name":"BLINK","models":[{"model_id":"qwen3-vl-235b-a22b-instruct","model_name":"Qwen3 VL 235B A22B Instruct","score":0.707,"rank":1,"is_current_model":false},{"model_id":"qwen3-vl-8b-instruct","model_name":"Qwen3 VL 8B Instruct","score":0.691,"rank":2,"is_current_model":false},{"model_id":"qwen3-vl-8b-thinking","model_name":"Qwen3 VL 8B Thinking","score":0.687,"rank":3,"is_current_model":false},{"model_id":"qwen3-vl-32b-thinking","model_name":"Qwen3 VL 32B 
Thinking","score":0.685,"rank":4,"is_current_model":false},{"model_id":"qwen3-vl-30b-a3b-instruct","model_name":"Qwen3 VL 30B A3B Instruct","score":0.677,"rank":5,"is_current_model":false},{"model_id":"qwen3-vl-32b-instruct","model_name":"Qwen3 VL 32B Instruct","score":0.673,"rank":6,"is_current_model":false},{"model_id":"qwen3-vl-235b-a22b-thinking","model_name":"Qwen3 VL 235B A22B Thinking","score":0.671,"rank":7,"is_current_model":false},{"model_id":"qwen3-vl-4b-instruct","model_name":"Qwen3 VL 4B Instruct","score":0.658,"rank":8,"is_current_model":true},{"model_id":"qwen3-vl-30b-a3b-thinking","model_name":"Qwen3 VL 30B A3B Thinking","score":0.654,"rank":9,"is_current_model":false},{"model_id":"qwen3-vl-4b-thinking","model_name":"Qwen3 VL 4B Thinking","score":0.634,"rank":10,"is_current_model":false}]},{"benchmark_id":"cc-ocr","benchmark_name":"CC-OCR","models":[{"model_id":"qwen3.6-plus","model_name":"Qwen3.6 Plus","score":0.834,"rank":1,"is_current_model":false},{"model_id":"qwen3-vl-235b-a22b-instruct","model_name":"Qwen3 VL 235B A22B Instruct","score":0.822,"rank":2,"is_current_model":false},{"model_id":"qwen3.5-122b-a10b","model_name":"Qwen3.5-122B-A10B","score":0.818,"rank":3,"is_current_model":false},{"model_id":"qwen3-vl-235b-a22b-thinking","model_name":"Qwen3 VL 235B A22B Thinking","score":0.815,"rank":4,"is_current_model":false},{"model_id":"qwen3.5-27b","model_name":"Qwen3.5-27B","score":0.81,"rank":5,"is_current_model":false},{"model_id":"qwen3.5-35b-a3b","model_name":"Qwen3.5-35B-A3B","score":0.807,"rank":6,"is_current_model":false},{"model_id":"qwen3-vl-30b-a3b-instruct","model_name":"Qwen3 VL 30B A3B Instruct","score":0.807,"rank":6,"is_current_model":false},{"model_id":"qwen3-vl-32b-instruct","model_name":"Qwen3 VL 32B Instruct","score":0.803,"rank":8,"is_current_model":false},{"model_id":"qwen3-vl-8b-instruct","model_name":"Qwen3 VL 8B Instruct","score":0.799,"rank":9,"is_current_model":false},{"model_id":"qwen2.5-vl-72b","model_name":"Qwen2.5 
VL 72B Instruct","score":0.798,"rank":10,"is_current_model":false},{"model_id":"qwen3-vl-4b-instruct","model_name":"Qwen3 VL 4B Instruct","score":0.762,"rank":15,"is_current_model":true}]},{"benchmark_id":"charadessta","benchmark_name":"CharadesSTA","models":[{"model_id":"qwen3-vl-235b-a22b-instruct","model_name":"Qwen3 VL 235B A22B Instruct","score":0.648,"rank":1,"is_current_model":false},{"model_id":"qwen3-vl-30b-a3b-instruct","model_name":"Qwen3 VL 30B A3B Instruct","score":0.635,"rank":2,"is_current_model":false},{"model_id":"qwen3-vl-235b-a22b-thinking","model_name":"Qwen3 VL 235B A22B Thinking","score":0.635,"rank":2,"is_current_model":false},{"model_id":"qwen3-vl-32b-thinking","model_name":"Qwen3 VL 32B Thinking","score":0.628,"rank":4,"is_current_model":false},{"model_id":"qwen3-vl-30b-a3b-thinking","model_name":"Qwen3 VL 30B A3B Thinking","score":0.627,"rank":5,"is_current_model":false},{"model_id":"qwen3-vl-32b-instruct","model_name":"Qwen3 VL 32B Instruct","score":0.612,"rank":6,"is_current_model":false},{"model_id":"qwen3-vl-8b-thinking","model_name":"Qwen3 VL 8B Thinking","score":0.599,"rank":7,"is_current_model":false},{"model_id":"qwen3-vl-4b-thinking","model_name":"Qwen3 VL 4B Thinking","score":0.59,"rank":8,"is_current_model":false},{"model_id":"qwen3-vl-8b-instruct","model_name":"Qwen3 VL 8B Instruct","score":0.56,"rank":9,"is_current_model":false},{"model_id":"qwen3-vl-4b-instruct","model_name":"Qwen3 VL 4B Instruct","score":0.555,"rank":10,"is_current_model":true}]},{"benchmark_id":"charxiv-d","benchmark_name":"CharXiv-D","models":[{"model_id":"qwen3-vl-32b-instruct","model_name":"Qwen3 VL 32B Instruct","score":0.905,"rank":1,"is_current_model":false},{"model_id":"qwen3-vl-32b-thinking","model_name":"Qwen3 VL 32B Thinking","score":0.902,"rank":2,"is_current_model":false},{"model_id":"gpt-4.5","model_name":"GPT-4.5","score":0.9,"rank":3,"is_current_model":false},{"model_id":"gpt-4.1-mini-2025-04-14","model_name":"GPT-4.1 
mini","score":0.884,"rank":4,"is_current_model":false},{"model_id":"gpt-4.1-2025-04-14","model_name":"GPT-4.1","score":0.879,"rank":5,"is_current_model":false},{"model_id":"qwen3-vl-30b-a3b-thinking","model_name":"Qwen3 VL 30B A3B Thinking","score":0.869,"rank":6,"is_current_model":false},{"model_id":"qwen3-vl-8b-thinking","model_name":"Qwen3 VL 8B Thinking","score":0.859,"rank":7,"is_current_model":false},{"model_id":"qwen3-vl-30b-a3b-instruct","model_name":"Qwen3 VL 30B A3B Instruct","score":0.855,"rank":8,"is_current_model":false},{"model_id":"gpt-4o-2024-08-06","model_name":"GPT-4o","score":0.853,"rank":9,"is_current_model":false},{"model_id":"qwen3-vl-4b-thinking","model_name":"Qwen3 VL 4B Thinking","score":0.839,"rank":10,"is_current_model":false},{"model_id":"qwen3-vl-4b-instruct","model_name":"Qwen3 VL 4B Instruct","score":0.762,"rank":12,"is_current_model":true}]},{"benchmark_id":"charxiv-r","benchmark_name":"CharXiv-R","models":[{"model_id":"gpt-5.2-2025-12-11","model_name":"GPT-5.2","score":0.821,"rank":1,"is_current_model":false},{"model_id":"qwen3.6-plus","model_name":"Qwen3.6 Plus","score":0.815,"rank":2,"is_current_model":false},{"model_id":"gemini-3-pro-preview","model_name":"Gemini 3 Pro","score":0.814,"rank":3,"is_current_model":false},{"model_id":"gpt-5-2025-08-07","model_name":"GPT-5","score":0.811,"rank":4,"is_current_model":false},{"model_id":"gemini-3-flash-preview","model_name":"Gemini 3 Flash","score":0.803,"rank":5,"is_current_model":false},{"model_id":"qwen3.5-27b","model_name":"Qwen3.5-27B","score":0.795,"rank":6,"is_current_model":false},{"model_id":"o3-2025-04-16","model_name":"o3","score":0.786,"rank":7,"is_current_model":false},{"model_id":"qwen3.5-35b-a3b","model_name":"Qwen3.5-35B-A3B","score":0.775,"rank":8,"is_current_model":false},{"model_id":"kimi-k2.5","model_name":"Kimi K2.5","score":0.775,"rank":8,"is_current_model":false},{"model_id":"claude-opus-4-6","model_name":"Claude Opus 
4.6","score":0.774,"rank":10,"is_current_model":false},{"model_id":"qwen3-vl-4b-instruct","model_name":"Qwen3 VL 4B Instruct","score":0.397,"rank":28,"is_current_model":true}]},{"benchmark_id":"docvqatest","benchmark_name":"DocVQAtest","models":[{"model_id":"qwen3-vl-235b-a22b-instruct","model_name":"Qwen3 VL 235B A22B Instruct","score":0.971,"rank":1,"is_current_model":false},{"model_id":"qwen3-vl-32b-instruct","model_name":"Qwen3 VL 32B Instruct","score":0.969,"rank":2,"is_current_model":false},{"model_id":"qwen2-vl-72b","model_name":"Qwen2-VL-72B-Instruct","score":0.965,"rank":3,"is_current_model":false},{"model_id":"qwen3-vl-235b-a22b-thinking","model_name":"Qwen3 VL 235B A22B Thinking","score":0.965,"rank":3,"is_current_model":false},{"model_id":"qwen3-vl-32b-thinking","model_name":"Qwen3 VL 32B Thinking","score":0.961,"rank":5,"is_current_model":false},{"model_id":"qwen3-vl-8b-instruct","model_name":"Qwen3 VL 8B Instruct","score":0.961,"rank":5,"is_current_model":false},{"model_id":"qwen3-vl-4b-instruct","model_name":"Qwen3 VL 4B Instruct","score":0.953,"rank":7,"is_current_model":true},{"model_id":"qwen3-vl-8b-thinking","model_name":"Qwen3 VL 8B Thinking","score":0.953,"rank":7,"is_current_model":false},{"model_id":"qwen3-vl-30b-a3b-instruct","model_name":"Qwen3 VL 30B A3B Instruct","score":0.95,"rank":9,"is_current_model":false},{"model_id":"qwen3-vl-30b-a3b-thinking","model_name":"Qwen3 VL 30B A3B Thinking","score":0.95,"rank":9,"is_current_model":false}]},{"benchmark_id":"erqa","benchmark_name":"ERQA","models":[{"model_id":"gpt-5-2025-08-07","model_name":"GPT-5","score":0.657,"rank":1,"is_current_model":false},{"model_id":"qwen3.6-plus","model_name":"Qwen3.6 
Plus","score":0.657,"rank":1,"is_current_model":false},{"model_id":"qwen3.5-35b-a3b","model_name":"Qwen3.5-35B-A3B","score":0.648,"rank":3,"is_current_model":false},{"model_id":"o3-2025-04-16","model_name":"o3","score":0.64,"rank":4,"is_current_model":false},{"model_id":"qwen3.5-122b-a10b","model_name":"Qwen3.5-122B-A10B","score":0.62,"rank":5,"is_current_model":false},{"model_id":"qwen3.5-27b","model_name":"Qwen3.5-27B","score":0.605,"rank":6,"is_current_model":false},{"model_id":"qwen3-vl-235b-a22b-thinking","model_name":"Qwen3 VL 235B A22B Thinking","score":0.525,"rank":7,"is_current_model":false},{"model_id":"qwen3-vl-32b-thinking","model_name":"Qwen3 VL 32B Thinking","score":0.523,"rank":8,"is_current_model":false},{"model_id":"qwen3-vl-235b-a22b-instruct","model_name":"Qwen3 VL 235B A22B Instruct","score":0.513,"rank":9,"is_current_model":false},{"model_id":"qwen3-vl-32b-instruct","model_name":"Qwen3 VL 32B Instruct","score":0.488,"rank":10,"is_current_model":false},{"model_id":"qwen3-vl-4b-instruct","model_name":"Qwen3 VL 4B Instruct","score":0.413,"rank":16,"is_current_model":true}]},{"benchmark_id":"hallusion-bench","benchmark_name":"Hallusion Bench","models":[{"model_id":"qwen3.5-27b","model_name":"Qwen3.5-27B","score":0.7,"rank":1,"is_current_model":false},{"model_id":"qwen3.5-35b-a3b","model_name":"Qwen3.5-35B-A3B","score":0.679,"rank":2,"is_current_model":false},{"model_id":"qwen3.5-122b-a10b","model_name":"Qwen3.5-122B-A10B","score":0.676,"rank":3,"is_current_model":false},{"model_id":"qwen3-vl-32b-thinking","model_name":"Qwen3 VL 32B Thinking","score":0.674,"rank":4,"is_current_model":false},{"model_id":"qwen3-vl-235b-a22b-thinking","model_name":"Qwen3 VL 235B A22B Thinking","score":0.667,"rank":5,"is_current_model":false},{"model_id":"qwen3-vl-30b-a3b-thinking","model_name":"Qwen3 VL 30B A3B Thinking","score":0.66,"rank":6,"is_current_model":false},{"model_id":"qwen3-vl-8b-thinking","model_name":"Qwen3 VL 8B 
Thinking","score":0.654,"rank":7,"is_current_model":false},{"model_id":"qwen3-vl-4b-thinking","model_name":"Qwen3 VL 4B Thinking","score":0.641,"rank":8,"is_current_model":false},{"model_id":"qwen3-vl-32b-instruct","model_name":"Qwen3 VL 32B Instruct","score":0.638,"rank":9,"is_current_model":false},{"model_id":"qwen3-vl-235b-a22b-instruct","model_name":"Qwen3 VL 235B A22B Instruct","score":0.632,"rank":10,"is_current_model":false},{"model_id":"qwen3-vl-4b-instruct","model_name":"Qwen3 VL 4B Instruct","score":0.576,"rank":13,"is_current_model":true}]},{"benchmark_id":"hmmt25","benchmark_name":"HMMT25","models":[{"model_id":"grok-4-heavy","model_name":"Grok-4 Heavy","score":0.967,"rank":1,"is_current_model":false},{"model_id":"qwen3.6-plus","model_name":"Qwen3.6 Plus","score":0.946,"rank":2,"is_current_model":false},{"model_id":"qwen3.5-397b-a17b","model_name":"Qwen3.5-397B-A17B","score":0.927,"rank":3,"is_current_model":false},{"model_id":"qwen3.5-122b-a10b","model_name":"Qwen3.5-122B-A10B","score":0.903,"rank":4,"is_current_model":false},{"model_id":"grok-4","model_name":"Grok-4","score":0.9,"rank":5,"is_current_model":false},{"model_id":"qwen3.5-27b","model_name":"Qwen3.5-27B","score":0.898,"rank":6,"is_current_model":false},{"model_id":"qwen3.5-35b-a3b","model_name":"Qwen3.5-35B-A3B","score":0.892,"rank":7,"is_current_model":false},{"model_id":"sarvam-105b","model_name":"Sarvam-105B","score":0.858,"rank":8,"is_current_model":false},{"model_id":"qwen3-235b-a22b-thinking-2507","model_name":"Qwen3-235B-A22B-Thinking-2507","score":0.839,"rank":9,"is_current_model":false},{"model_id":"qwen3.5-9b","model_name":"Qwen3.5-9B","score":0.829,"rank":10,"is_current_model":false},{"model_id":"qwen3-vl-4b-instruct","model_name":"Qwen3 VL 4B 
Instruct","score":0.307,"rank":23,"is_current_model":true}]},{"benchmark_id":"ifeval","benchmark_name":"IFEval","models":[{"model_id":"qwen3.5-27b","model_name":"Qwen3.5-27B","score":0.95,"rank":1,"is_current_model":false},{"model_id":"qwen3.6-plus","model_name":"Qwen3.6 Plus","score":0.943,"rank":2,"is_current_model":false},{"model_id":"o3-mini","model_name":"o3-mini","score":0.939,"rank":3,"is_current_model":false},{"model_id":"qwen3.5-122b-a10b","model_name":"Qwen3.5-122B-A10B","score":0.934,"rank":4,"is_current_model":false},{"model_id":"claude-3-7-sonnet-20250219","model_name":"Claude 3.7 Sonnet","score":0.932,"rank":5,"is_current_model":false},{"model_id":"qwen3.5-397b-a17b","model_name":"Qwen3.5-397B-A17B","score":0.926,"rank":6,"is_current_model":false},{"model_id":"nova-pro","model_name":"Nova Pro","score":0.921,"rank":7,"is_current_model":false},{"model_id":"llama-3.3-70b-instruct","model_name":"Llama 3.3 70B Instruct","score":0.921,"rank":7,"is_current_model":false},{"model_id":"qwen3.5-35b-a3b","model_name":"Qwen3.5-35B-A3B","score":0.919,"rank":9,"is_current_model":false},{"model_id":"qwen3.5-9b","model_name":"Qwen3.5-9B","score":0.915,"rank":10,"is_current_model":false},{"model_id":"qwen3-vl-4b-instruct","model_name":"Qwen3 VL 4B Instruct","score":0.823,"rank":47,"is_current_model":true}]},{"benchmark_id":"include","benchmark_name":"Include","models":[{"model_id":"qwen3.5-397b-a17b","model_name":"Qwen3.5-397B-A17B","score":0.856,"rank":1,"is_current_model":false},{"model_id":"qwen3.6-plus","model_name":"Qwen3.6 
Plus","score":0.851,"rank":2,"is_current_model":false},{"model_id":"qwen3.5-122b-a10b","model_name":"Qwen3.5-122B-A10B","score":0.828,"rank":3,"is_current_model":false},{"model_id":"qwen3.5-27b","model_name":"Qwen3.5-27B","score":0.816,"rank":4,"is_current_model":false},{"model_id":"qwen3-235b-a22b-thinking-2507","model_name":"Qwen3-235B-A22B-Thinking-2507","score":0.81,"rank":5,"is_current_model":false},{"model_id":"qwen3-vl-235b-a22b-instruct","model_name":"Qwen3 VL 235B A22B Instruct","score":0.8,"rank":6,"is_current_model":false},{"model_id":"qwen3-vl-235b-a22b-thinking","model_name":"Qwen3 VL 235B A22B Thinking","score":0.8,"rank":6,"is_current_model":false},{"model_id":"qwen3.5-35b-a3b","model_name":"Qwen3.5-35B-A3B","score":0.797,"rank":8,"is_current_model":false},{"model_id":"qwen3-235b-a22b-instruct-2507","model_name":"Qwen3-235B-A22B-Instruct-2507","score":0.795,"rank":9,"is_current_model":false},{"model_id":"qwen3-next-80b-a3b-instruct","model_name":"Qwen3-Next-80B-A3B-Instruct","score":0.789,"rank":10,"is_current_model":false},{"model_id":"qwen3-next-80b-a3b-thinking","model_name":"Qwen3-Next-80B-A3B-Thinking","score":0.789,"rank":10,"is_current_model":false},{"model_id":"qwen3-vl-4b-instruct","model_name":"Qwen3 VL 4B Instruct","score":0.614,"rank":22,"is_current_model":true}]},{"benchmark_id":"infovqatest","benchmark_name":"InfoVQAtest","models":[{"model_id":"kimi-k2.5","model_name":"Kimi K2.5","score":0.926,"rank":1,"is_current_model":false},{"model_id":"qwen3-vl-235b-a22b-thinking","model_name":"Qwen3 VL 235B A22B Thinking","score":0.895,"rank":2,"is_current_model":false},{"model_id":"qwen3-vl-32b-thinking","model_name":"Qwen3 VL 32B Thinking","score":0.892,"rank":3,"is_current_model":false},{"model_id":"qwen3-vl-235b-a22b-instruct","model_name":"Qwen3 VL 235B A22B Instruct","score":0.892,"rank":3,"is_current_model":false},{"model_id":"qwen3-vl-32b-instruct","model_name":"Qwen3 VL 32B 
Instruct","score":0.87,"rank":5,"is_current_model":false},{"model_id":"qwen3-vl-30b-a3b-thinking","model_name":"Qwen3 VL 30B A3B Thinking","score":0.86,"rank":6,"is_current_model":false},{"model_id":"qwen3-vl-8b-thinking","model_name":"Qwen3 VL 8B Thinking","score":0.86,"rank":6,"is_current_model":false},{"model_id":"qwen2-vl-72b","model_name":"Qwen2-VL-72B-Instruct","score":0.845,"rank":8,"is_current_model":false},{"model_id":"qwen3-vl-8b-instruct","model_name":"Qwen3 VL 8B Instruct","score":0.831,"rank":9,"is_current_model":false},{"model_id":"qwen3-vl-4b-thinking","model_name":"Qwen3 VL 4B Thinking","score":0.83,"rank":10,"is_current_model":false},{"model_id":"qwen3-vl-4b-instruct","model_name":"Qwen3 VL 4B Instruct","score":0.803,"rank":12,"is_current_model":true}]},{"benchmark_id":"livebench-20241125","benchmark_name":"LiveBench 20241125","models":[{"model_id":"qwen3-vl-235b-a22b-thinking","model_name":"Qwen3 VL 235B A22B Thinking","score":0.796,"rank":1,"is_current_model":false},{"model_id":"qwen3-235b-a22b-thinking-2507","model_name":"Qwen3-235B-A22B-Thinking-2507","score":0.784,"rank":2,"is_current_model":false},{"model_id":"qwen3-next-80b-a3b-thinking","model_name":"Qwen3-Next-80B-A3B-Thinking","score":0.766,"rank":3,"is_current_model":false},{"model_id":"qwen3-next-80b-a3b-instruct","model_name":"Qwen3-Next-80B-A3B-Instruct","score":0.758,"rank":4,"is_current_model":false},{"model_id":"qwen3-235b-a22b-instruct-2507","model_name":"Qwen3-235B-A22B-Instruct-2507","score":0.754,"rank":5,"is_current_model":false},{"model_id":"qwen3-vl-235b-a22b-instruct","model_name":"Qwen3 VL 235B A22B Instruct","score":0.748,"rank":6,"is_current_model":false},{"model_id":"qwen3-vl-32b-thinking","model_name":"Qwen3 VL 32B Thinking","score":0.747,"rank":7,"is_current_model":false},{"model_id":"qwen3-vl-32b-instruct","model_name":"Qwen3 VL 32B Instruct","score":0.722,"rank":8,"is_current_model":false},{"model_id":"qwen3-vl-30b-a3b-thinking","model_name":"Qwen3 VL 30B A3B 
Thinking","score":0.721,"rank":9,"is_current_model":false},{"model_id":"qwen3-vl-8b-thinking","model_name":"Qwen3 VL 8B Thinking","score":0.698,"rank":10,"is_current_model":false},{"model_id":"qwen3-vl-4b-instruct","model_name":"Qwen3 VL 4B Instruct","score":0.609,"rank":14,"is_current_model":true}]},{"benchmark_id":"livecodebench-v6","benchmark_name":"LiveCodeBench v6","models":[{"model_id":"seed-2.0-pro","model_name":"Seed 2.0 Pro","score":0.878,"rank":1,"is_current_model":false},{"model_id":"qwen3.6-plus","model_name":"Qwen3.6 Plus","score":0.871,"rank":2,"is_current_model":false},{"model_id":"step-3.5-flash","model_name":"Step-3.5-Flash","score":0.864,"rank":3,"is_current_model":false},{"model_id":"kimi-k2.5","model_name":"Kimi K2.5","score":0.85,"rank":4,"is_current_model":false},{"model_id":"glm-4.7","model_name":"GLM-4.7","score":0.849,"rank":5,"is_current_model":false},{"model_id":"qwen3.5-397b-a17b","model_name":"Qwen3.5-397B-A17B","score":0.836,"rank":6,"is_current_model":false},{"model_id":"kimi-k2-thinking-0905","model_name":"Kimi K2-Thinking-0905","score":0.831,"rank":7,"is_current_model":false},{"model_id":"glm-4.6","model_name":"GLM-4.6","score":0.828,"rank":8,"is_current_model":false},{"model_id":"gpt-oss-120b-high","model_name":"GPT OSS 120B High","score":0.819,"rank":9,"is_current_model":false},{"model_id":"seed-2.0-lite","model_name":"Seed 2.0 Lite","score":0.817,"rank":10,"is_current_model":false},{"model_id":"qwen3-vl-4b-instruct","model_name":"Qwen3 VL 4B Instruct","score":0.379,"rank":40,"is_current_model":true}]},{"benchmark_id":"lvbench","benchmark_name":"LVBench","models":[{"model_id":"kimi-k2.5","model_name":"Kimi 
K2.5","score":0.759,"rank":1,"is_current_model":false},{"model_id":"qwen3.5-122b-a10b","model_name":"Qwen3.5-122B-A10B","score":0.744,"rank":2,"is_current_model":false},{"model_id":"qwen3.5-27b","model_name":"Qwen3.5-27B","score":0.736,"rank":3,"is_current_model":false},{"model_id":"qwen3.5-35b-a3b","model_name":"Qwen3.5-35B-A3B","score":0.714,"rank":4,"is_current_model":false},{"model_id":"qwen3-vl-235b-a22b-instruct","model_name":"Qwen3 VL 235B A22B Instruct","score":0.677,"rank":5,"is_current_model":false},{"model_id":"qwen3-vl-32b-instruct","model_name":"Qwen3 VL 32B Instruct","score":0.638,"rank":6,"is_current_model":false},{"model_id":"qwen3-vl-235b-a22b-thinking","model_name":"Qwen3 VL 235B A22B Thinking","score":0.636,"rank":7,"is_current_model":false},{"model_id":"qwen3-vl-32b-thinking","model_name":"Qwen3 VL 32B Thinking","score":0.626,"rank":8,"is_current_model":false},{"model_id":"qwen3-vl-30b-a3b-instruct","model_name":"Qwen3 VL 30B A3B Instruct","score":0.625,"rank":9,"is_current_model":false},{"model_id":"qwen3-vl-30b-a3b-thinking","model_name":"Qwen3 VL 30B A3B Thinking","score":0.592,"rank":10,"is_current_model":false},{"model_id":"qwen3-vl-4b-instruct","model_name":"Qwen3 VL 4B Instruct","score":0.562,"rank":12,"is_current_model":true}]},{"benchmark_id":"mathvision","benchmark_name":"MathVision","models":[{"model_id":"qwen3.6-plus","model_name":"Qwen3.6 Plus","score":0.88,"rank":1,"is_current_model":false},{"model_id":"qwen3.5-122b-a10b","model_name":"Qwen3.5-122B-A10B","score":0.862,"rank":2,"is_current_model":false},{"model_id":"qwen3.5-27b","model_name":"Qwen3.5-27B","score":0.86,"rank":3,"is_current_model":false},{"model_id":"gemma-4-31b-it","model_name":"Gemma 4 31B","score":0.856,"rank":4,"is_current_model":false},{"model_id":"kimi-k2.5","model_name":"Kimi 
K2.5","score":0.842,"rank":5,"is_current_model":false},{"model_id":"qwen3.5-35b-a3b","model_name":"Qwen3.5-35B-A3B","score":0.839,"rank":6,"is_current_model":false},{"model_id":"gemma-4-26b-a4b-it","model_name":"Gemma 4 26B-A4B","score":0.824,"rank":7,"is_current_model":false},{"model_id":"qwen3-vl-235b-a22b-thinking","model_name":"Qwen3 VL 235B A22B Thinking","score":0.746,"rank":8,"is_current_model":false},{"model_id":"step3-vl-10b","model_name":"Step3-VL-10B","score":0.708,"rank":9,"is_current_model":false},{"model_id":"qwen3-vl-32b-thinking","model_name":"Qwen3 VL 32B Thinking","score":0.702,"rank":10,"is_current_model":false},{"model_id":"qwen3-vl-4b-instruct","model_name":"Qwen3 VL 4B Instruct","score":0.516,"rank":20,"is_current_model":true}]},{"benchmark_id":"mathvista-mini","benchmark_name":"MathVista-Mini","models":[{"model_id":"kimi-k2.5","model_name":"Kimi K2.5","score":0.901,"rank":1,"is_current_model":false},{"model_id":"qwen3.5-27b","model_name":"Qwen3.5-27B","score":0.878,"rank":2,"is_current_model":false},{"model_id":"qwen3.5-122b-a10b","model_name":"Qwen3.5-122B-A10B","score":0.874,"rank":3,"is_current_model":false},{"model_id":"qwen3.5-35b-a3b","model_name":"Qwen3.5-35B-A3B","score":0.862,"rank":4,"is_current_model":false},{"model_id":"qwen3-vl-32b-thinking","model_name":"Qwen3 VL 32B Thinking","score":0.859,"rank":5,"is_current_model":false},{"model_id":"qwen3-vl-235b-a22b-thinking","model_name":"Qwen3 VL 235B A22B Thinking","score":0.858,"rank":6,"is_current_model":false},{"model_id":"qwen3-vl-235b-a22b-instruct","model_name":"Qwen3 VL 235B A22B Instruct","score":0.849,"rank":7,"is_current_model":false},{"model_id":"qwen3-vl-32b-instruct","model_name":"Qwen3 VL 32B Instruct","score":0.838,"rank":8,"is_current_model":false},{"model_id":"qwen3-vl-30b-a3b-thinking","model_name":"Qwen3 VL 30B A3B Thinking","score":0.819,"rank":9,"is_current_model":false},{"model_id":"qwen3-vl-8b-thinking","model_name":"Qwen3 VL 8B 
Thinking","score":0.814,"rank":10,"is_current_model":false},{"model_id":"qwen3-vl-4b-instruct","model_name":"Qwen3 VL 4B Instruct","score":0.737,"rank":16,"is_current_model":true}]},{"benchmark_id":"mmbench-v1.1","benchmark_name":"MMBench-V1.1","models":[{"model_id":"qwen3.5-122b-a10b","model_name":"Qwen3.5-122B-A10B","score":0.928,"rank":1,"is_current_model":false},{"model_id":"qwen3.5-27b","model_name":"Qwen3.5-27B","score":0.926,"rank":2,"is_current_model":false},{"model_id":"qwen3.5-35b-a3b","model_name":"Qwen3.5-35B-A3B","score":0.915,"rank":3,"is_current_model":false},{"model_id":"qwen3-vl-32b-thinking","model_name":"Qwen3 VL 32B Thinking","score":0.908,"rank":4,"is_current_model":false},{"model_id":"qwen3-vl-235b-a22b-thinking","model_name":"Qwen3 VL 235B A22B Thinking","score":0.906,"rank":5,"is_current_model":false},{"model_id":"qwen3-vl-235b-a22b-instruct","model_name":"Qwen3 VL 235B A22B Instruct","score":0.899,"rank":6,"is_current_model":false},{"model_id":"qwen3-vl-30b-a3b-thinking","model_name":"Qwen3 VL 30B A3B Thinking","score":0.889,"rank":7,"is_current_model":false},{"model_id":"qwen3-vl-8b-thinking","model_name":"Qwen3 VL 8B Thinking","score":0.875,"rank":8,"is_current_model":false},{"model_id":"qwen3-vl-30b-a3b-instruct","model_name":"Qwen3 VL 30B A3B Instruct","score":0.87,"rank":9,"is_current_model":false},{"model_id":"qwen3-vl-4b-thinking","model_name":"Qwen3 VL 4B Thinking","score":0.867,"rank":10,"is_current_model":false},{"model_id":"qwen3-vl-4b-instruct","model_name":"Qwen3 VL 4B 
Instruct","score":0.851,"rank":11,"is_current_model":true}]},{"benchmark_id":"mmlu","benchmark_name":"MMLU","models":[{"model_id":"gpt-5-2025-08-07","model_name":"GPT-5","score":0.925,"rank":1,"is_current_model":false},{"model_id":"o1-2024-12-17","model_name":"o1","score":0.918,"rank":2,"is_current_model":false},{"model_id":"o1-preview","model_name":"o1-preview","score":0.908,"rank":3,"is_current_model":false},{"model_id":"gpt-4.5","model_name":"GPT-4.5","score":0.908,"rank":3,"is_current_model":false},{"model_id":"qwen3-vl-235b-a22b-thinking","model_name":"Qwen3 VL 235B A22B Thinking","score":0.906,"rank":5,"is_current_model":false},{"model_id":"sarvam-105b","model_name":"Sarvam-105B","score":0.906,"rank":5,"is_current_model":false},{"model_id":"claude-3-5-sonnet-20240620","model_name":"Claude 3.5 Sonnet","score":0.904,"rank":7,"is_current_model":false},{"model_id":"claude-3-5-sonnet-20241022","model_name":"Claude 3.5 Sonnet","score":0.904,"rank":7,"is_current_model":false},{"model_id":"kimi-k2-0905","model_name":"Kimi K2 0905","score":0.902,"rank":9,"is_current_model":false},{"model_id":"gpt-4.1-2025-04-14","model_name":"GPT-4.1","score":0.902,"rank":9,"is_current_model":false},{"model_id":"qwen3-vl-4b-instruct","model_name":"Qwen3 VL 4B Instruct","score":0.772,"rank":71,"is_current_model":true}]},{"benchmark_id":"mmlu-pro","benchmark_name":"MMLU-Pro","models":[{"model_id":"qwen3.6-plus","model_name":"Qwen3.6 Plus","score":0.885,"rank":1,"is_current_model":false},{"model_id":"minimax-m2.1","model_name":"MiniMax M2.1","score":0.88,"rank":2,"is_current_model":false},{"model_id":"qwen3.5-397b-a17b","model_name":"Qwen3.5-397B-A17B","score":0.878,"rank":3,"is_current_model":false},{"model_id":"kimi-k2.5","model_name":"Kimi K2.5","score":0.871,"rank":4,"is_current_model":false},{"model_id":"ernie-5.0","model_name":"ERNIE 
5.0","score":0.87,"rank":5,"is_current_model":false},{"model_id":"qwen3.5-122b-a10b","model_name":"Qwen3.5-122B-A10B","score":0.867,"rank":6,"is_current_model":false},{"model_id":"qwen3.5-27b","model_name":"Qwen3.5-27B","score":0.861,"rank":7,"is_current_model":false},{"model_id":"qwen3.5-35b-a3b","model_name":"Qwen3.5-35B-A3B","score":0.853,"rank":8,"is_current_model":false},{"model_id":"gemma-4-31b-it","model_name":"Gemma 4 31B","score":0.852,"rank":9,"is_current_model":false},{"model_id":"deepseek-reasoner","model_name":"DeepSeek-V3.2 (Thinking)","score":0.85,"rank":10,"is_current_model":false},{"model_id":"deepseek-r1-0528","model_name":"DeepSeek-R1-0528","score":0.85,"rank":10,"is_current_model":false},{"model_id":"deepseek-v3.2-exp","model_name":"DeepSeek-V3.2-Exp","score":0.85,"rank":10,"is_current_model":false},{"model_id":"qwen3-vl-4b-instruct","model_name":"Qwen3 VL 4B Instruct","score":0.671,"rank":79,"is_current_model":true}]},{"benchmark_id":"mmlu-prox","benchmark_name":"MMLU-ProX","models":[{"model_id":"qwen3.5-397b-a17b","model_name":"Qwen3.5-397B-A17B","score":0.847,"rank":1,"is_current_model":false},{"model_id":"qwen3.6-plus","model_name":"Qwen3.6 Plus","score":0.847,"rank":1,"is_current_model":false},{"model_id":"qwen3.5-122b-a10b","model_name":"Qwen3.5-122B-A10B","score":0.822,"rank":3,"is_current_model":false},{"model_id":"qwen3.5-27b","model_name":"Qwen3.5-27B","score":0.822,"rank":3,"is_current_model":false},{"model_id":"qwen3-235b-a22b-thinking-2507","model_name":"Qwen3-235B-A22B-Thinking-2507","score":0.81,"rank":5,"is_current_model":false},{"model_id":"qwen3.5-35b-a3b","model_name":"Qwen3.5-35B-A3B","score":0.81,"rank":5,"is_current_model":false},{"model_id":"qwen3-vl-235b-a22b-thinking","model_name":"Qwen3 VL 235B A22B 
Thinking","score":0.806,"rank":7,"is_current_model":false},{"model_id":"qwen3-235b-a22b-instruct-2507","model_name":"Qwen3-235B-A22B-Instruct-2507","score":0.794,"rank":8,"is_current_model":false},{"model_id":"nemotron-3-super-120b-a12b","model_name":"Nemotron 3 Super (120B A12B)","score":0.7936,"rank":9,"is_current_model":false},{"model_id":"qwen3-next-80b-a3b-thinking","model_name":"Qwen3-Next-80B-A3B-Thinking","score":0.787,"rank":10,"is_current_model":false},{"model_id":"qwen3-vl-4b-instruct","model_name":"Qwen3 VL 4B Instruct","score":0.594,"rank":23,"is_current_model":true}]},{"benchmark_id":"mmlu-redux","benchmark_name":"MMLU-Redux","models":[{"model_id":"qwen3.5-397b-a17b","model_name":"Qwen3.5-397B-A17B","score":0.949,"rank":1,"is_current_model":false},{"model_id":"qwen3.6-plus","model_name":"Qwen3.6 Plus","score":0.945,"rank":2,"is_current_model":false},{"model_id":"kimi-k2-thinking-0905","model_name":"Kimi K2-Thinking-0905","score":0.944,"rank":3,"is_current_model":false},{"model_id":"qwen3.5-122b-a10b","model_name":"Qwen3.5-122B-A10B","score":0.94,"rank":4,"is_current_model":false},{"model_id":"qwen3-235b-a22b-thinking-2507","model_name":"Qwen3-235B-A22B-Thinking-2507","score":0.938,"rank":5,"is_current_model":false},{"model_id":"qwen3-vl-235b-a22b-thinking","model_name":"Qwen3 VL 235B A22B Thinking","score":0.937,"rank":6,"is_current_model":false},{"model_id":"deepseek-r1-0528","model_name":"DeepSeek-R1-0528","score":0.934,"rank":7,"is_current_model":false},{"model_id":"qwen3.5-35b-a3b","model_name":"Qwen3.5-35B-A3B","score":0.933,"rank":8,"is_current_model":false},{"model_id":"qwen3.5-27b","model_name":"Qwen3.5-27B","score":0.932,"rank":9,"is_current_model":false},{"model_id":"qwen3-235b-a22b-instruct-2507","model_name":"Qwen3-235B-A22B-Instruct-2507","score":0.931,"rank":10,"is_current_model":false},{"model_id":"qwen3-vl-4b-instruct","model_name":"Qwen3 VL 4B 
Instruct","score":0.815,"rank":33,"is_current_model":true}]},{"benchmark_id":"mm-mt-bench","benchmark_name":"MM-MT-Bench","models":[{"model_id":"mistral-large-3-2509","model_name":"Mistral Large 3","score":84.9,"rank":1,"is_current_model":false},{"model_id":"qwen3-vl-235b-a22b-instruct","model_name":"Qwen3 VL 235B A22B Instruct","score":8.5,"rank":2,"is_current_model":false},{"model_id":"qwen3-vl-235b-a22b-thinking","model_name":"Qwen3 VL 235B A22B Thinking","score":8.5,"rank":2,"is_current_model":false},{"model_id":"qwen3-vl-32b-instruct","model_name":"Qwen3 VL 32B Instruct","score":8.4,"rank":4,"is_current_model":false},{"model_id":"qwen3-vl-32b-thinking","model_name":"Qwen3 VL 32B Thinking","score":8.3,"rank":5,"is_current_model":false},{"model_id":"qwen3-vl-30b-a3b-instruct","model_name":"Qwen3 VL 30B A3B Instruct","score":8.1,"rank":6,"is_current_model":false},{"model_id":"qwen3-vl-8b-thinking","model_name":"Qwen3 VL 8B Thinking","score":8.0,"rank":7,"is_current_model":false},{"model_id":"qwen3-vl-30b-a3b-thinking","model_name":"Qwen3 VL 30B A3B Thinking","score":7.9,"rank":8,"is_current_model":false},{"model_id":"qwen3-vl-8b-instruct","model_name":"Qwen3 VL 8B Instruct","score":7.7,"rank":9,"is_current_model":false},{"model_id":"qwen3-vl-4b-thinking","model_name":"Qwen3 VL 4B Thinking","score":7.7,"rank":9,"is_current_model":false},{"model_id":"qwen3-vl-4b-instruct","model_name":"Qwen3 VL 4B Instruct","score":7.5,"rank":11,"is_current_model":true}]},{"benchmark_id":"mmmu-pro","benchmark_name":"MMMU-Pro","models":[{"model_id":"gemini-3-flash-preview","model_name":"Gemini 3 Flash","score":0.812,"rank":1,"is_current_model":false},{"model_id":"gpt-5.4","model_name":"GPT-5.4","score":0.812,"rank":1,"is_current_model":false},{"model_id":"gemini-3-pro-preview","model_name":"Gemini 3 Pro","score":0.81,"rank":3,"is_current_model":false},{"model_id":"gemini-3.1-pro-preview","model_name":"Gemini 3.1 
Pro","score":0.805,"rank":4,"is_current_model":false},{"model_id":"gpt-5.2-2025-12-11","model_name":"GPT-5.2","score":0.795,"rank":5,"is_current_model":false},{"model_id":"qwen3.6-plus","model_name":"Qwen3.6 Plus","score":0.788,"rank":6,"is_current_model":false},{"model_id":"kimi-k2.5","model_name":"Kimi K2.5","score":0.785,"rank":7,"is_current_model":false},{"model_id":"gpt-5-2025-08-07","model_name":"GPT-5","score":0.784,"rank":8,"is_current_model":false},{"model_id":"claude-opus-4-6","model_name":"Claude Opus 4.6","score":0.773,"rank":9,"is_current_model":false},{"model_id":"qwen3.5-122b-a10b","model_name":"Qwen3.5-122B-A10B","score":0.769,"rank":10,"is_current_model":false},{"model_id":"gemma-4-31b-it","model_name":"Gemma 4 31B","score":0.769,"rank":10,"is_current_model":false},{"model_id":"qwen3-vl-4b-instruct","model_name":"Qwen3 VL 4B Instruct","score":0.532,"rank":32,"is_current_model":true}]},{"benchmark_id":"mmmu-(val)","benchmark_name":"MMMU (val)","models":[{"model_id":"qwen3-vl-32b-thinking","model_name":"Qwen3 VL 32B Thinking","score":0.781,"rank":1,"is_current_model":false},{"model_id":"qwen3-vl-32b-instruct","model_name":"Qwen3 VL 32B Instruct","score":0.76,"rank":2,"is_current_model":false},{"model_id":"qwen3-vl-30b-a3b-thinking","model_name":"Qwen3 VL 30B A3B Thinking","score":0.76,"rank":2,"is_current_model":false},{"model_id":"qwen3-vl-30b-a3b-instruct","model_name":"Qwen3 VL 30B A3B Instruct","score":0.742,"rank":4,"is_current_model":false},{"model_id":"qwen3-vl-8b-thinking","model_name":"Qwen3 VL 8B Thinking","score":0.741,"rank":5,"is_current_model":false},{"model_id":"qwen3-vl-4b-thinking","model_name":"Qwen3 VL 4B Thinking","score":0.708,"rank":6,"is_current_model":false},{"model_id":"qwen3-vl-8b-instruct","model_name":"Qwen3 VL 8B Instruct","score":0.696,"rank":7,"is_current_model":false},{"model_id":"qwen3-vl-4b-instruct","model_name":"Qwen3 VL 4B 
Instruct","score":0.674,"rank":8,"is_current_model":true},{"model_id":"gemma-3-27b-it","model_name":"Gemma 3 27B","score":0.649,"rank":9,"is_current_model":false},{"model_id":"gemma-3-12b-it","model_name":"Gemma 3 12B","score":0.596,"rank":10,"is_current_model":false}]},{"benchmark_id":"mmstar","benchmark_name":"MMStar","models":[{"model_id":"qwen3.6-plus","model_name":"Qwen3.6 Plus","score":0.833,"rank":1,"is_current_model":false},{"model_id":"qwen3.5-122b-a10b","model_name":"Qwen3.5-122B-A10B","score":0.829,"rank":2,"is_current_model":false},{"model_id":"qwen3.5-35b-a3b","model_name":"Qwen3.5-35B-A3B","score":0.819,"rank":3,"is_current_model":false},{"model_id":"qwen3.5-27b","model_name":"Qwen3.5-27B","score":0.81,"rank":4,"is_current_model":false},{"model_id":"qwen3-vl-32b-thinking","model_name":"Qwen3 VL 32B Thinking","score":0.794,"rank":5,"is_current_model":false},{"model_id":"qwen3-vl-235b-a22b-thinking","model_name":"Qwen3 VL 235B A22B Thinking","score":0.787,"rank":6,"is_current_model":false},{"model_id":"qwen3-vl-235b-a22b-instruct","model_name":"Qwen3 VL 235B A22B Instruct","score":0.784,"rank":7,"is_current_model":false},{"model_id":"qwen3-vl-32b-instruct","model_name":"Qwen3 VL 32B Instruct","score":0.777,"rank":8,"is_current_model":false},{"model_id":"qwen3-vl-30b-a3b-thinking","model_name":"Qwen3 VL 30B A3B Thinking","score":0.755,"rank":9,"is_current_model":false},{"model_id":"qwen3-vl-8b-thinking","model_name":"Qwen3 VL 8B Thinking","score":0.753,"rank":10,"is_current_model":false},{"model_id":"qwen3-vl-4b-instruct","model_name":"Qwen3 VL 4B Instruct","score":0.698,"rank":15,"is_current_model":true}]},{"benchmark_id":"muirbench","benchmark_name":"MuirBench","models":[{"model_id":"qwen3-vl-32b-thinking","model_name":"Qwen3 VL 32B Thinking","score":0.803,"rank":1,"is_current_model":false},{"model_id":"qwen3-vl-235b-a22b-thinking","model_name":"Qwen3 VL 235B A22B 
Thinking","score":0.801,"rank":2,"is_current_model":false},{"model_id":"qwen3-vl-30b-a3b-thinking","model_name":"Qwen3 VL 30B A3B Thinking","score":0.776,"rank":3,"is_current_model":false},{"model_id":"qwen3-vl-8b-thinking","model_name":"Qwen3 VL 8B Thinking","score":0.768,"rank":4,"is_current_model":false},{"model_id":"qwen3-vl-4b-thinking","model_name":"Qwen3 VL 4B Thinking","score":0.75,"rank":5,"is_current_model":false},{"model_id":"qwen3-vl-32b-instruct","model_name":"Qwen3 VL 32B Instruct","score":0.728,"rank":6,"is_current_model":false},{"model_id":"qwen3-vl-235b-a22b-instruct","model_name":"Qwen3 VL 235B A22B Instruct","score":0.728,"rank":6,"is_current_model":false},{"model_id":"qwen3-vl-8b-instruct","model_name":"Qwen3 VL 8B Instruct","score":0.644,"rank":8,"is_current_model":false},{"model_id":"qwen3-vl-4b-instruct","model_name":"Qwen3 VL 4B Instruct","score":0.638,"rank":9,"is_current_model":true},{"model_id":"qwen3-vl-30b-a3b-instruct","model_name":"Qwen3 VL 30B A3B Instruct","score":0.629,"rank":10,"is_current_model":false}]},{"benchmark_id":"mvbench","benchmark_name":"MVBench","models":[{"model_id":"qwen3.5-122b-a10b","model_name":"Qwen3.5-122B-A10B","score":0.766,"rank":1,"is_current_model":false},{"model_id":"qwen3.5-35b-a3b","model_name":"Qwen3.5-35B-A3B","score":0.748,"rank":2,"is_current_model":false},{"model_id":"qwen3.5-27b","model_name":"Qwen3.5-27B","score":0.746,"rank":3,"is_current_model":false},{"model_id":"qwen2-vl-72b","model_name":"Qwen2-VL-72B-Instruct","score":0.736,"rank":4,"is_current_model":false},{"model_id":"qwen3-vl-32b-thinking","model_name":"Qwen3 VL 32B Thinking","score":0.732,"rank":5,"is_current_model":false},{"model_id":"qwen3-vl-32b-instruct","model_name":"Qwen3 VL 32B Instruct","score":0.728,"rank":6,"is_current_model":false},{"model_id":"qwen3-vl-30b-a3b-instruct","model_name":"Qwen3 VL 30B A3B Instruct","score":0.723,"rank":7,"is_current_model":false},{"model_id":"qwen3-vl-30b-a3b-thinking","model_name":"Qwen3 VL 30B 
A3B Thinking","score":0.72,"rank":8,"is_current_model":false},{"model_id":"qwen2.5-vl-72b","model_name":"Qwen2.5 VL 72B Instruct","score":0.704,"rank":9,"is_current_model":false},{"model_id":"qwen2.5-omni-7b","model_name":"Qwen2.5-Omni-7B","score":0.703,"rank":10,"is_current_model":false},{"model_id":"qwen3-vl-4b-instruct","model_name":"Qwen3 VL 4B Instruct","score":0.689,"rank":14,"is_current_model":true}]},{"benchmark_id":"ocrbench","benchmark_name":"OCRBench","models":[{"model_id":"qwen3-vl-235b-a22b-instruct","model_name":"Qwen3 VL 235B A22B Instruct","score":920.0,"rank":1,"is_current_model":false},{"model_id":"qwen3-vl-30b-a3b-instruct","model_name":"Qwen3 VL 30B A3B Instruct","score":903.0,"rank":2,"is_current_model":false},{"model_id":"qwen3-vl-8b-instruct","model_name":"Qwen3 VL 8B Instruct","score":896.0,"rank":3,"is_current_model":false},{"model_id":"qwen3-vl-32b-instruct","model_name":"Qwen3 VL 32B Instruct","score":895.0,"rank":4,"is_current_model":false},{"model_id":"qwen3-vl-4b-instruct","model_name":"Qwen3 VL 4B Instruct","score":881.0,"rank":5,"is_current_model":true},{"model_id":"qwen3-vl-235b-a22b-thinking","model_name":"Qwen3 VL 235B A22B Thinking","score":875.0,"rank":6,"is_current_model":false},{"model_id":"qwen3-vl-32b-thinking","model_name":"Qwen3 VL 32B Thinking","score":855.0,"rank":7,"is_current_model":false},{"model_id":"qwen3-vl-30b-a3b-thinking","model_name":"Qwen3 VL 30B A3B Thinking","score":839.0,"rank":8,"is_current_model":false},{"model_id":"qwen3-vl-8b-thinking","model_name":"Qwen3 VL 8B Thinking","score":819.0,"rank":9,"is_current_model":false},{"model_id":"qwen3-vl-4b-thinking","model_name":"Qwen3 VL 4B Thinking","score":808.0,"rank":10,"is_current_model":false}]},{"benchmark_id":"ocrbench-v2-(en)","benchmark_name":"OCRBench-V2 (en)","models":[{"model_id":"qwen3-vl-32b-thinking","model_name":"Qwen3 VL 32B Thinking","score":0.684,"rank":1,"is_current_model":false},{"model_id":"qwen3-vl-32b-instruct","model_name":"Qwen3 VL 32B 
Instruct","score":0.674,"rank":2,"is_current_model":false},{"model_id":"qwen3-vl-235b-a22b-instruct","model_name":"Qwen3 VL 235B A22B Instruct","score":0.671,"rank":3,"is_current_model":false},{"model_id":"qwen3-vl-235b-a22b-thinking","model_name":"Qwen3 VL 235B A22B Thinking","score":0.668,"rank":4,"is_current_model":false},{"model_id":"qwen3-vl-8b-instruct","model_name":"Qwen3 VL 8B Instruct","score":0.654,"rank":5,"is_current_model":false},{"model_id":"qwen3-vl-8b-thinking","model_name":"Qwen3 VL 8B Thinking","score":0.639,"rank":6,"is_current_model":false},{"model_id":"qwen3-vl-4b-instruct","model_name":"Qwen3 VL 4B Instruct","score":0.637,"rank":7,"is_current_model":true},{"model_id":"qwen3-vl-30b-a3b-instruct","model_name":"Qwen3 VL 30B A3B Instruct","score":0.632,"rank":8,"is_current_model":false},{"model_id":"qwen3-vl-30b-a3b-thinking","model_name":"Qwen3 VL 30B A3B Thinking","score":0.626,"rank":9,"is_current_model":false},{"model_id":"qwen3-vl-4b-thinking","model_name":"Qwen3 VL 4B Thinking","score":0.618,"rank":10,"is_current_model":false}]},{"benchmark_id":"ocrbench-v2-(zh)","benchmark_name":"OCRBench-V2 (zh)","models":[{"model_id":"qwen3-vl-235b-a22b-thinking","model_name":"Qwen3 VL 235B A22B Thinking","score":0.635,"rank":1,"is_current_model":false},{"model_id":"qwen3-vl-32b-thinking","model_name":"Qwen3 VL 32B Thinking","score":0.621,"rank":2,"is_current_model":false},{"model_id":"qwen3-vl-235b-a22b-instruct","model_name":"Qwen3 VL 235B A22B Instruct","score":0.618,"rank":3,"is_current_model":false},{"model_id":"qwen3-vl-8b-instruct","model_name":"Qwen3 VL 8B Instruct","score":0.612,"rank":4,"is_current_model":false},{"model_id":"qwen3-vl-30b-a3b-thinking","model_name":"Qwen3 VL 30B A3B Thinking","score":0.604,"rank":5,"is_current_model":false},{"model_id":"qwen3-vl-32b-instruct","model_name":"Qwen3 VL 32B Instruct","score":0.592,"rank":6,"is_current_model":false},{"model_id":"qwen3-vl-8b-thinking","model_name":"Qwen3 VL 8B 
Thinking","score":0.592,"rank":6,"is_current_model":false},{"model_id":"qwen2.5-vl-32b","model_name":"Qwen2.5 VL 32B Instruct","score":0.591,"rank":8,"is_current_model":false},{"model_id":"qwen3-vl-30b-a3b-instruct","model_name":"Qwen3 VL 30B A3B Instruct","score":0.578,"rank":9,"is_current_model":false},{"model_id":"qwen3-vl-4b-instruct","model_name":"Qwen3 VL 4B Instruct","score":0.576,"rank":10,"is_current_model":true}]},{"benchmark_id":"odinw","benchmark_name":"ODinW","models":[{"model_id":"qwen3.6-plus","model_name":"Qwen3.6 Plus","score":0.518,"rank":1,"is_current_model":false},{"model_id":"qwen3-vl-235b-a22b-instruct","model_name":"Qwen3 VL 235B A22B Instruct","score":0.486,"rank":2,"is_current_model":false},{"model_id":"qwen3-vl-4b-instruct","model_name":"Qwen3 VL 4B Instruct","score":0.482,"rank":3,"is_current_model":true},{"model_id":"qwen3-vl-30b-a3b-instruct","model_name":"Qwen3 VL 30B A3B Instruct","score":0.475,"rank":4,"is_current_model":false},{"model_id":"qwen3-vl-32b-instruct","model_name":"Qwen3 VL 32B Instruct","score":0.466,"rank":5,"is_current_model":false},{"model_id":"qwen3-vl-8b-instruct","model_name":"Qwen3 VL 8B Instruct","score":0.447,"rank":6,"is_current_model":false},{"model_id":"qwen3.5-122b-a10b","model_name":"Qwen3.5-122B-A10B","score":0.445,"rank":7,"is_current_model":false},{"model_id":"qwen3-vl-235b-a22b-thinking","model_name":"Qwen3 VL 235B A22B Thinking","score":0.432,"rank":8,"is_current_model":false},{"model_id":"qwen3.5-35b-a3b","model_name":"Qwen3.5-35B-A3B","score":0.426,"rank":9,"is_current_model":false},{"model_id":"qwen2.5-omni-7b","model_name":"Qwen2.5-Omni-7B","score":0.424,"rank":10,"is_current_model":false}]},{"benchmark_id":"osworld","benchmark_name":"OSWorld","models":[{"model_id":"claude-opus-4-6","model_name":"Claude Opus 4.6","score":0.727,"rank":1,"is_current_model":false},{"model_id":"claude-sonnet-4-6","model_name":"Claude Sonnet 
4.6","score":0.725,"rank":2,"is_current_model":false},{"model_id":"qwen3-vl-235b-a22b-instruct","model_name":"Qwen3 VL 235B A22B Instruct","score":0.667,"rank":3,"is_current_model":false},{"model_id":"claude-opus-4-5-20251101","model_name":"Claude Opus 4.5","score":0.663,"rank":4,"is_current_model":false},{"model_id":"glm-5v-turbo","model_name":"GLM-5V-Turbo","score":0.623,"rank":5,"is_current_model":false},{"model_id":"claude-sonnet-4-5-20250929","model_name":"Claude Sonnet 4.5","score":0.614,"rank":6,"is_current_model":false},{"model_id":"claude-haiku-4-5-20251001","model_name":"Claude Haiku 4.5","score":0.507,"rank":7,"is_current_model":false},{"model_id":"qwen3-vl-32b-thinking","model_name":"Qwen3 VL 32B Thinking","score":0.41,"rank":8,"is_current_model":false},{"model_id":"qwen3-vl-235b-a22b-thinking","model_name":"Qwen3 VL 235B A22B Thinking","score":0.381,"rank":9,"is_current_model":false},{"model_id":"qwen3-vl-8b-thinking","model_name":"Qwen3 VL 8B Thinking","score":0.339,"rank":10,"is_current_model":false},{"model_id":"qwen3-vl-8b-instruct","model_name":"Qwen3 VL 8B Instruct","score":0.339,"rank":10,"is_current_model":false},{"model_id":"qwen3-vl-4b-instruct","model_name":"Qwen3 VL 4B Instruct","score":0.262,"rank":16,"is_current_model":true}]},{"benchmark_id":"polymath","benchmark_name":"PolyMATH","models":[{"model_id":"qwen3.6-plus","model_name":"Qwen3.6 
Plus","score":0.774,"rank":1,"is_current_model":false},{"model_id":"qwen3.5-397b-a17b","model_name":"Qwen3.5-397B-A17B","score":0.733,"rank":2,"is_current_model":false},{"model_id":"qwen3.5-27b","model_name":"Qwen3.5-27B","score":0.712,"rank":3,"is_current_model":false},{"model_id":"qwen3.5-122b-a10b","model_name":"Qwen3.5-122B-A10B","score":0.689,"rank":4,"is_current_model":false},{"model_id":"qwen3.5-35b-a3b","model_name":"Qwen3.5-35B-A3B","score":0.644,"rank":5,"is_current_model":false},{"model_id":"qwen3-235b-a22b-thinking-2507","model_name":"Qwen3-235B-A22B-Thinking-2507","score":0.601,"rank":6,"is_current_model":false},{"model_id":"qwen3.5-9b","model_name":"Qwen3.5-9B","score":0.573,"rank":7,"is_current_model":false},{"model_id":"qwen3-next-80b-a3b-thinking","model_name":"Qwen3-Next-80B-A3B-Thinking","score":0.563,"rank":8,"is_current_model":false},{"model_id":"qwen3-vl-32b-thinking","model_name":"Qwen3 VL 32B Thinking","score":0.52,"rank":9,"is_current_model":false},{"model_id":"qwen3-vl-30b-a3b-thinking","model_name":"Qwen3 VL 30B A3B Thinking","score":0.517,"rank":10,"is_current_model":false},{"model_id":"qwen3-vl-4b-instruct","model_name":"Qwen3 VL 4B Instruct","score":0.288,"rank":19,"is_current_model":true}]},{"benchmark_id":"realworldqa","benchmark_name":"RealWorldQA","models":[{"model_id":"qwen3.6-plus","model_name":"Qwen3.6 Plus","score":0.854,"rank":1,"is_current_model":false},{"model_id":"qwen3.5-122b-a10b","model_name":"Qwen3.5-122B-A10B","score":0.851,"rank":2,"is_current_model":false},{"model_id":"qwen3.5-35b-a3b","model_name":"Qwen3.5-35B-A3B","score":0.841,"rank":3,"is_current_model":false},{"model_id":"qwen3.5-27b","model_name":"Qwen3.5-27B","score":0.837,"rank":4,"is_current_model":false},{"model_id":"qwen3-vl-235b-a22b-thinking","model_name":"Qwen3 VL 235B A22B Thinking","score":0.813,"rank":5,"is_current_model":false},{"model_id":"qwen3-vl-235b-a22b-instruct","model_name":"Qwen3 VL 235B A22B 
Instruct","score":0.793,"rank":6,"is_current_model":false},{"model_id":"qwen3-vl-32b-instruct","model_name":"Qwen3 VL 32B Instruct","score":0.79,"rank":7,"is_current_model":false},{"model_id":"qwen3-vl-32b-thinking","model_name":"Qwen3 VL 32B Thinking","score":0.784,"rank":8,"is_current_model":false},{"model_id":"qwen2-vl-72b","model_name":"Qwen2-VL-72B-Instruct","score":0.778,"rank":9,"is_current_model":false},{"model_id":"qwen3-vl-30b-a3b-thinking","model_name":"Qwen3 VL 30B A3B Thinking","score":0.774,"rank":10,"is_current_model":false},{"model_id":"qwen3-vl-4b-instruct","model_name":"Qwen3 VL 4B Instruct","score":0.709,"rank":15,"is_current_model":true}]},{"benchmark_id":"screenspot","benchmark_name":"ScreenSpot","models":[{"model_id":"qwen3-vl-32b-instruct","model_name":"Qwen3 VL 32B Instruct","score":0.958,"rank":1,"is_current_model":false},{"model_id":"qwen3-vl-32b-thinking","model_name":"Qwen3 VL 32B Thinking","score":0.957,"rank":2,"is_current_model":false},{"model_id":"qwen3-vl-235b-a22b-thinking","model_name":"Qwen3 VL 235B A22B Thinking","score":0.954,"rank":3,"is_current_model":false},{"model_id":"qwen3-vl-235b-a22b-instruct","model_name":"Qwen3 VL 235B A22B Instruct","score":0.954,"rank":3,"is_current_model":false},{"model_id":"qwen3-vl-30b-a3b-instruct","model_name":"Qwen3 VL 30B A3B Instruct","score":0.947,"rank":5,"is_current_model":false},{"model_id":"qwen3-vl-30b-a3b-thinking","model_name":"Qwen3 VL 30B A3B Thinking","score":0.947,"rank":5,"is_current_model":false},{"model_id":"qwen3-vl-8b-instruct","model_name":"Qwen3 VL 8B Instruct","score":0.944,"rank":7,"is_current_model":false},{"model_id":"qwen3-vl-4b-instruct","model_name":"Qwen3 VL 4B Instruct","score":0.94,"rank":8,"is_current_model":true},{"model_id":"qwen3-vl-8b-thinking","model_name":"Qwen3 VL 8B Thinking","score":0.936,"rank":9,"is_current_model":false},{"model_id":"qwen3-vl-4b-thinking","model_name":"Qwen3 VL 4B 
Thinking","score":0.929,"rank":10,"is_current_model":false}]},{"benchmark_id":"screenspot-pro","benchmark_name":"ScreenSpot Pro","models":[{"model_id":"gpt-5.2-2025-12-11","model_name":"GPT-5.2","score":0.863,"rank":1,"is_current_model":false},{"model_id":"gemini-3-pro-preview","model_name":"Gemini 3 Pro","score":0.727,"rank":2,"is_current_model":false},{"model_id":"qwen3.5-122b-a10b","model_name":"Qwen3.5-122B-A10B","score":0.704,"rank":3,"is_current_model":false},{"model_id":"qwen3.5-27b","model_name":"Qwen3.5-27B","score":0.703,"rank":4,"is_current_model":false},{"model_id":"gemini-3-flash-preview","model_name":"Gemini 3 Flash","score":0.691,"rank":5,"is_current_model":false},{"model_id":"qwen3.5-35b-a3b","model_name":"Qwen3.5-35B-A3B","score":0.686,"rank":6,"is_current_model":false},{"model_id":"qwen3.6-plus","model_name":"Qwen3.6 Plus","score":0.682,"rank":7,"is_current_model":false},{"model_id":"qwen3-vl-235b-a22b-instruct","model_name":"Qwen3 VL 235B A22B Instruct","score":0.62,"rank":8,"is_current_model":false},{"model_id":"qwen3-vl-235b-a22b-thinking","model_name":"Qwen3 VL 235B A22B Thinking","score":0.618,"rank":9,"is_current_model":false},{"model_id":"qwen3-vl-30b-a3b-instruct","model_name":"Qwen3 VL 30B A3B Instruct","score":0.605,"rank":10,"is_current_model":false},{"model_id":"qwen3-vl-4b-instruct","model_name":"Qwen3 VL 4B Instruct","score":0.595,"rank":11,"is_current_model":true}]},{"benchmark_id":"simpleqa","benchmark_name":"SimpleQA","models":[{"model_id":"deepseek-v3.2-exp","model_name":"DeepSeek-V3.2-Exp","score":0.971,"rank":1,"is_current_model":false},{"model_id":"grok-4-fast","model_name":"Grok 4 Fast","score":0.95,"rank":2,"is_current_model":false},{"model_id":"deepseek-v3.1","model_name":"DeepSeek-V3.1","score":0.934,"rank":3,"is_current_model":false},{"model_id":"deepseek-r1-0528","model_name":"DeepSeek-R1-0528","score":0.923,"rank":4,"is_current_model":false},{"model_id":"ernie-5.0","model_name":"ERNIE 
5.0","score":0.75,"rank":5,"is_current_model":false},{"model_id":"gemini-3-pro-preview","model_name":"Gemini 3 Pro","score":0.721,"rank":6,"is_current_model":false},{"model_id":"gemini-3-flash-preview","model_name":"Gemini 3 Flash","score":0.687,"rank":7,"is_current_model":false},{"model_id":"gpt-4.5","model_name":"GPT-4.5","score":0.625,"rank":8,"is_current_model":false},{"model_id":"qwen3-vl-32b-thinking","model_name":"Qwen3 VL 32B Thinking","score":0.554,"rank":9,"is_current_model":false},{"model_id":"qwen3-235b-a22b-instruct-2507","model_name":"Qwen3-235B-A22B-Instruct-2507","score":0.543,"rank":10,"is_current_model":false},{"model_id":"qwen3-vl-4b-instruct","model_name":"Qwen3 VL 4B Instruct","score":0.48,"rank":15,"is_current_model":true}]},{"benchmark_id":"supergpqa","benchmark_name":"SuperGPQA","models":[{"model_id":"qwen3.6-plus","model_name":"Qwen3.6 Plus","score":0.716,"rank":1,"is_current_model":false},{"model_id":"qwen3.5-397b-a17b","model_name":"Qwen3.5-397B-A17B","score":0.704,"rank":2,"is_current_model":false},{"model_id":"qwen3.5-122b-a10b","model_name":"Qwen3.5-122B-A10B","score":0.671,"rank":3,"is_current_model":false},{"model_id":"qwen3.5-27b","model_name":"Qwen3.5-27B","score":0.656,"rank":4,"is_current_model":false},{"model_id":"qwen3-max","model_name":"Qwen3 Max","score":0.651,"rank":5,"is_current_model":false},{"model_id":"qwen3-235b-a22b-thinking-2507","model_name":"Qwen3-235B-A22B-Thinking-2507","score":0.649,"rank":6,"is_current_model":false},{"model_id":"qwen3-vl-235b-a22b-thinking","model_name":"Qwen3 VL 235B A22B 
Thinking","score":0.643,"rank":7,"is_current_model":false},{"model_id":"qwen3.5-35b-a3b","model_name":"Qwen3.5-35B-A3B","score":0.634,"rank":8,"is_current_model":false},{"model_id":"qwen3-235b-a22b-instruct-2507","model_name":"Qwen3-235B-A22B-Instruct-2507","score":0.626,"rank":9,"is_current_model":false},{"model_id":"qwen3-next-80b-a3b-thinking","model_name":"Qwen3-Next-80B-A3B-Thinking","score":0.608,"rank":10,"is_current_model":false},{"model_id":"qwen3-vl-4b-instruct","model_name":"Qwen3 VL 4B Instruct","score":0.403,"rank":26,"is_current_model":true}]},{"benchmark_id":"videommmu","benchmark_name":"VideoMMMU","models":[{"model_id":"gemini-3-pro-preview","model_name":"Gemini 3 Pro","score":0.876,"rank":1,"is_current_model":false},{"model_id":"gemini-3-flash-preview","model_name":"Gemini 3 Flash","score":0.869,"rank":2,"is_current_model":false},{"model_id":"kimi-k2.5","model_name":"Kimi K2.5","score":0.866,"rank":3,"is_current_model":false},{"model_id":"gpt-5.2-2025-12-11","model_name":"GPT-5.2","score":0.859,"rank":4,"is_current_model":false},{"model_id":"gemini-3.1-flash-lite-preview","model_name":"Gemini 3.1 Flash-Lite","score":0.848,"rank":5,"is_current_model":false},{"model_id":"gpt-5-2025-08-07","model_name":"GPT-5","score":0.846,"rank":6,"is_current_model":false},{"model_id":"qwen3.6-plus","model_name":"Qwen3.6 Plus","score":0.84,"rank":7,"is_current_model":false},{"model_id":"gemini-2.5-pro-preview-06-05","model_name":"Gemini 2.5 Pro Preview 06-05","score":0.836,"rank":8,"is_current_model":false},{"model_id":"o3-2025-04-16","model_name":"o3","score":0.833,"rank":9,"is_current_model":false},{"model_id":"qwen3.5-27b","model_name":"Qwen3.5-27B","score":0.823,"rank":10,"is_current_model":false},{"model_id":"qwen3-vl-4b-instruct","model_name":"Qwen3 VL 4B 
Instruct","score":0.562,"rank":22,"is_current_model":true}]},{"benchmark_id":"writingbench","benchmark_name":"WritingBench","models":[{"model_id":"qwen3-235b-a22b-thinking-2507","model_name":"Qwen3-235B-A22B-Thinking-2507","score":0.883,"rank":1,"is_current_model":false},{"model_id":"qwen3-next-80b-a3b-instruct","model_name":"Qwen3-Next-80B-A3B-Instruct","score":0.873,"rank":2,"is_current_model":false},{"model_id":"qwen3-vl-235b-a22b-thinking","model_name":"Qwen3 VL 235B A22B Thinking","score":0.867,"rank":3,"is_current_model":false},{"model_id":"qwen3-vl-32b-thinking","model_name":"Qwen3 VL 32B Thinking","score":0.862,"rank":4,"is_current_model":false},{"model_id":"qwen3-vl-235b-a22b-instruct","model_name":"Qwen3 VL 235B A22B Instruct","score":0.855,"rank":5,"is_current_model":false},{"model_id":"qwen3-vl-8b-thinking","model_name":"Qwen3 VL 8B Thinking","score":0.855,"rank":5,"is_current_model":false},{"model_id":"qwen3-vl-30b-a3b-thinking","model_name":"Qwen3 VL 30B A3B Thinking","score":0.852,"rank":7,"is_current_model":false},{"model_id":"qwen3-235b-a22b-instruct-2507","model_name":"Qwen3-235B-A22B-Instruct-2507","score":0.852,"rank":7,"is_current_model":false},{"model_id":"qwen3-next-80b-a3b-thinking","model_name":"Qwen3-Next-80B-A3B-Thinking","score":0.846,"rank":9,"is_current_model":false},{"model_id":"qwen3-vl-4b-thinking","model_name":"Qwen3 VL 4B Thinking","score":0.84,"rank":10,"is_current_model":false},{"model_id":"qwen3-vl-4b-instruct","model_name":"Qwen3 VL 4B Instruct","score":0.825,"rank":14,"is_current_model":true}]}],"comparison_model":{"model_id":"gpt-5.2-pro-2025-12-11","name":"GPT-5.2 
Pro","organization_name":"OpenAI","release_date":"2025-12-11","announcement_date":"2025-12-11","knowledge_cutoff":null,"param_count":null,"multimodal":true,"license":{"name":"Proprietary","allow_commercial":false},"benchmarks":{"aime-2025":1.0,"arc-agi":0.905,"arc-agi-v2":0.542,"browsecomp":0.779,"gpqa":0.932,"hmmt-2025":1.0,"humanity's-last-exam":0.366},"provider":{"name":"OpenAI","input_cost":21.0,"output_cost":168.0,"max_input_tokens":400000,"max_output_tokens":128000,"modalities":{"input":{"text":false,"image":true,"audio":false,"video":false},"output":{"text":true,"image":false,"audio":false,"video":false}}}}}