{"benchmark_id":"ruler","name":"RULER","parent_benchmark":null,"categories":["reasoning","long_context"],"modality":"text","multilingual":false,"max_score":1.0,"language":"en","description":"RULER v1 is a synthetic long-context benchmark for measuring how model quality degrades as input length increases. This packaging follows the public standalone NVIDIA RULER implementation with 13 official tasks spanning retrieval, multi-hop tracing, aggregation, and QA.","paper_link":"https://arxiv.org/abs/2404.06654","implementation_link":"https://github.com/NVIDIA/RULER","verified":false,"created_at":"2026-05-07T16:53:25.954974+00:00","updated_at":"2026-07-05T18:27:59.890849+00:00","statistics":{"total_models":4,"average_score":0.894125,"min_score":0.841,"max_score":0.947,"score_stddev":0.04725529070908352,"verified_count":0,"self_reported_count":4},"child_benchmarks":[{"benchmark_id":"ruler-1000k","name":"RULER 1000K","categories":["reasoning","long_context"],"modality":"text","max_score":1.0,"description":"RULER 1000K evaluates the official 13-task RULER v1 suite at a 1048576-token (1M) context budget."},{"benchmark_id":"ruler-128k","name":"RULER 128k","categories":["reasoning","long_context"],"modality":"text","max_score":1.0,"description":"RULER 128k evaluates the official 13-task RULER v1 suite at a 131072-token context budget."},{"benchmark_id":"ruler-16k","name":"RULER 16k","categories":["reasoning","long_context"],"modality":"text","max_score":1.0,"description":"RULER 16k evaluates the official 13-task RULER v1 suite at a 16384-token context budget."},{"benchmark_id":"ruler-2048k","name":"RULER 2048K","categories":["reasoning","long_context"],"modality":"text","max_score":1.0,"description":"RULER 2048K evaluates the official 13-task RULER v1 suite at a 2097152-token (2M) context budget."},{"benchmark_id":"ruler-32k","name":"RULER 32k","categories":["reasoning","long_context"],"modality":"text","max_score":1.0,"description":"RULER 32k evaluates the official 13-task RULER v1 suite at a 32768-token context budget."},{"benchmark_id":"ruler-4k","name":"RULER 4k","categories":["reasoning","long_context"],"modality":"text","max_score":1.0,"description":"RULER 4k evaluates the official 13-task RULER v1 suite at a 4096-token context budget."},{"benchmark_id":"ruler-512k","name":"RULER 512K","categories":["reasoning","long_context"],"modality":"text","max_score":1.0,"description":"RULER 512K evaluates the official 13-task RULER v1 suite at a 524288-token context budget."},{"benchmark_id":"ruler-64k","name":"RULER 64k","categories":["reasoning","long_context"],"modality":"text","max_score":1.0,"description":"RULER 64k evaluates the official 13-task RULER v1 suite at a 65536-token context budget."},{"benchmark_id":"ruler-8k","name":"RULER 8k","categories":["reasoning","long_context"],"modality":"text","max_score":1.0,"description":"RULER 8k evaluates the official 13-task RULER v1 suite at an 8192-token context budget."}],"linked_dataset":null,"models":[{"rank":1,"model_id":"nemotron-3-ultra-550b-a55b","model_name":"Nemotron 3 Ultra (550B A55B)","organization_id":"nvidia","organization_name":"NVIDIA","organization_country":"US","score":0.947,"normalized_score":0.947,"verified":false,"self_reported":true,"self_reported_source":"https://huggingface.co/nvidia/NVIDIA-Nemotron-3-Ultra-550B-A55B-BF16","analysis_method":"1M","verification_date":null,"provider_id":null,"input_cost_per_million":null,"output_cost_per_million":null,"context_window":null,"announcement_date":"2026-06-04","param_count":550000000000,"is_open_source":true,"is_new":false,"best_latency":null,"latency_provider":null,"best_throughput":null,"throughput_provider":null,"context_provider":null},{"rank":2,"model_id":"nemotron-3-super-120b-a12b","model_name":"Nemotron 3 Super (120B A12B)","organization_id":"nvidia","organization_name":"NVIDIA","organization_country":"US","score":0.9175,"normalized_score":0.9175,"verified":false,"self_reported":true,"self_reported_source":"https://build.nvidia.com/nvidia/nemotron-3-super-120b-a12b/modelcard","analysis_method":"100 @ 1M","verification_date":null,"provider_id":null,"input_cost_per_million":null,"output_cost_per_million":null,"context_window":null,"announcement_date":"2026-03-11","param_count":120000000000,"is_open_source":true,"is_new":false,"best_latency":null,"latency_provider":null,"best_throughput":null,"throughput_provider":null,"context_provider":null},{"rank":3,"model_id":"phi-3.5-moe-instruct","model_name":"Phi-3.5-MoE-instruct","organization_id":"microsoft","organization_name":"Microsoft","organization_country":"US","score":0.871,"normalized_score":0.871,"verified":false,"self_reported":true,"self_reported_source":"https://huggingface.co/microsoft/Phi-3.5-MoE-instruct","analysis_method":"long context (128K) evaluation","verification_date":null,"provider_id":null,"input_cost_per_million":null,"output_cost_per_million":null,"context_window":null,"announcement_date":"2024-08-23","param_count":60000000000,"is_open_source":true,"is_new":false,"best_latency":null,"latency_provider":null,"best_throughput":null,"throughput_provider":null,"context_provider":null},{"rank":4,"model_id":"phi-3.5-mini-instruct","model_name":"Phi-3.5-mini-instruct","organization_id":"microsoft","organization_name":"Microsoft","organization_country":"US","score":0.841,"normalized_score":0.841,"verified":false,"self_reported":true,"self_reported_source":"https://huggingface.co/microsoft/Phi-3.5-mini-instruct","analysis_method":"128k","verification_date":null,"provider_id":null,"input_cost_per_million":null,"output_cost_per_million":null,"context_window":null,"announcement_date":"2024-08-23","param_count":3800000000,"is_open_source":true,"is_new":false,"best_latency":null,"latency_provider":null,"best_throughput":null,"throughput_provider":null,"context_provider":null}]}