{"benchmark_id":"perceptiontest","name":"PerceptionTest","parent_benchmark":null,"categories":["multimodal","physics","reasoning","spatial_reasoning","video","vision"],"modality":"multimodal","multilingual":false,"max_score":1.0,"language":"en","description":"A novel multimodal video benchmark designed to evaluate perception and reasoning skills of pre-trained models across video, audio, and text modalities. Contains 11.6k real-world videos (average 23 seconds) filmed by participants worldwide, densely annotated with six types of labels. Focuses on skills (Memory, Abstraction, Physics, Semantics) and reasoning types (descriptive, explanatory, predictive, counterfactual). Shows significant performance gap between human baseline (91.4%) and state-of-the-art video QA models (46.2%).","paper_link":"https://arxiv.org/abs/2305.13786","implementation_link":null,"verified":false,"created_at":"2026-05-07T16:53:25.736881+00:00","updated_at":"2026-07-05T18:27:58.530726+00:00","statistics":{"total_models":2,"average_score":0.7184999999999999,"min_score":0.705,"max_score":0.732,"score_stddev":0.019091883092036722,"verified_count":0,"self_reported_count":2},"child_benchmarks":[],"linked_dataset":null,"models":[{"rank":1,"model_id":"qwen2.5-vl-72b","model_name":"Qwen2.5 VL 72B Instruct","organization_id":"qwen","organization_name":"Alibaba Cloud / Qwen Team","organization_country":"CN","score":0.732,"normalized_score":0.732,"verified":false,"self_reported":true,"self_reported_source":"https://huggingface.co/Qwen/Qwen2.5-VL-72B-Instruct","analysis_method":"Score","verification_date":null,"provider_id":null,"input_cost_per_million":null,"output_cost_per_million":null,"context_window":null,"announcement_date":"2025-01-26","param_count":72000000000,"is_open_source":true,"is_new":false,"best_latency":null,"latency_provider":null,"best_throughput":null,"throughput_provider":null,"context_provider":null},{"rank":2,"model_id":"qwen2.5-vl-7b","model_name":"Qwen2.5 VL 7B Instruct","organization_id":"qwen","organization_name":"Alibaba Cloud / Qwen Team","organization_country":"CN","score":0.705,"normalized_score":0.705,"verified":false,"self_reported":true,"self_reported_source":"https://huggingface.co/Qwen/Qwen2.5-VL-7B-Instruct","analysis_method":"Score","verification_date":null,"provider_id":null,"input_cost_per_million":null,"output_cost_per_million":null,"context_window":null,"announcement_date":"2025-01-26","param_count":8290000000,"is_open_source":true,"is_new":false,"best_latency":null,"latency_provider":null,"best_throughput":null,"throughput_provider":null,"context_provider":null}]}