{"benchmark_id":"graphwalks-parents->128k","name":"Graphwalks parents >128k","parent_benchmark":null,"categories":["reasoning","spatial_reasoning","long_context"],"modality":"text","multilingual":false,"max_score":1.0,"language":"en","description":"A graph reasoning benchmark that evaluates language models' ability to find parent nodes in graphs with context length over 128k tokens, testing long-context reasoning and graph structure understanding.","paper_link":null,"implementation_link":null,"verified":false,"created_at":"2026-05-07T16:53:23.623483+00:00","updated_at":"2026-07-05T18:27:49.390746+00:00","statistics":{"total_models":7,"average_score":0.4445714285714285,"min_score":0.056,"max_score":0.954,"score_stddev":0.3524759363620242,"verified_count":0,"self_reported_count":7},"child_benchmarks":[],"linked_dataset":null,"models":[{"rank":1,"model_id":"claude-opus-4-6","model_name":"Claude Opus 4.6","organization_id":"anthropic","organization_name":"Anthropic","organization_country":"US","score":0.954,"normalized_score":0.954,"verified":false,"self_reported":true,"self_reported_source":"https://www-cdn.anthropic.com/0dd865075ad3132672ee0ab40b05a53f14cf5288.pdf","analysis_method":"GraphWalks Parents 256K subset of 1M. F1 score with max output tokens, 1M context window, average of 5 trials. At 64k output tokens: 95.1. Full 1M variant: 72.0 (max), 71.1 (64k).","verification_date":null,"provider_id":"anthropic","input_cost_per_million":5.0,"output_cost_per_million":25.0,"context_window":1000000,"announcement_date":"2026-02-05","param_count":null,"is_open_source":false,"is_new":false,"best_latency":0.5,"latency_provider":"Anthropic","best_throughput":42.0,"throughput_provider":"Anthropic","context_provider":"Anthropic"},{"rank":2,"model_id":"claude-opus-4-8","model_name":"Claude Opus 4.8","organization_id":"anthropic","organization_name":"Anthropic","organization_country":"US","score":0.833,"normalized_score":0.833,"verified":false,"self_reported":true,"self_reported_source":"https://www.anthropic.com/news/claude-opus-4-8","analysis_method":"F1 score on the 1M-token subset, averaged over 5 trials. 256K subset: 99.3%.","verification_date":null,"provider_id":"anthropic","input_cost_per_million":5.0,"output_cost_per_million":25.0,"context_window":1000000,"announcement_date":"2026-05-28","param_count":null,"is_open_source":false,"is_new":false,"best_latency":0.5,"latency_provider":"Anthropic","best_throughput":42.0,"throughput_provider":"Anthropic","context_provider":"Anthropic"},{"rank":3,"model_id":"gpt-5.5","model_name":"GPT-5.5","organization_id":"openai","organization_name":"OpenAI","organization_country":"US","score":0.585,"normalized_score":0.585,"verified":false,"self_reported":true,"self_reported_source":"https://openai.com/index/introducing-gpt-5-5/","analysis_method":"Graphwalks parents 1M f1. Reasoning effort xhigh.","verification_date":null,"provider_id":"openai","input_cost_per_million":5.0,"output_cost_per_million":30.0,"context_window":1050000,"announcement_date":"2026-04-23","param_count":null,"is_open_source":false,"is_new":false,"best_latency":null,"latency_provider":"OpenAI","best_throughput":null,"throughput_provider":"OpenAI","context_provider":"OpenAI"},{"rank":4,"model_id":"gpt-5.4","model_name":"GPT-5.4","organization_id":"openai","organization_name":"OpenAI","organization_country":"US","score":0.324,"normalized_score":0.324,"verified":false,"self_reported":true,"self_reported_source":"https://openai.com/index/introducing-gpt-5-4/","analysis_method":"Graphwalks parents 256K-1M (accuracy). Reasoning effort xhigh.","verification_date":null,"provider_id":"openai","input_cost_per_million":2.5,"output_cost_per_million":15.0,"context_window":1000000,"announcement_date":"2026-03-05","param_count":null,"is_open_source":false,"is_new":false,"best_latency":3.0,"latency_provider":"OpenAI","best_throughput":50.0,"throughput_provider":"OpenAI","context_provider":"OpenAI"},{"rank":5,"model_id":"gpt-4.1-2025-04-14","model_name":"GPT-4.1","organization_id":"openai","organization_name":"OpenAI","organization_country":"US","score":0.25,"normalized_score":0.25,"verified":false,"self_reported":true,"self_reported_source":"https://openai.com/index/introducing-gpt-5-for-developers/","analysis_method":"Internal benchmark","verification_date":null,"provider_id":"openai","input_cost_per_million":2.0,"output_cost_per_million":8.0,"context_window":1047576,"announcement_date":"2025-04-14","param_count":null,"is_open_source":false,"is_new":false,"best_latency":10.0,"latency_provider":"OpenAI","best_throughput":100.0,"throughput_provider":"OpenAI","context_provider":"OpenAI"},{"rank":6,"model_id":"gpt-4.1-mini-2025-04-14","model_name":"GPT-4.1 mini","organization_id":"openai","organization_name":"OpenAI","organization_country":"US","score":0.11,"normalized_score":0.11,"verified":false,"self_reported":true,"self_reported_source":"https://openai.com/index/introducing-gpt-5-for-developers/","analysis_method":"Internal benchmark","verification_date":null,"provider_id":"openai","input_cost_per_million":0.4,"output_cost_per_million":1.6,"context_window":1047576,"announcement_date":"2025-04-14","param_count":null,"is_open_source":false,"is_new":false,"best_latency":5.0,"latency_provider":"OpenAI","best_throughput":150.0,"throughput_provider":"OpenAI","context_provider":"OpenAI"},{"rank":7,"model_id":"gpt-4.1-nano-2025-04-14","model_name":"GPT-4.1 nano","organization_id":"openai","organization_name":"OpenAI","organization_country":"US","score":0.056,"normalized_score":0.056,"verified":false,"self_reported":true,"self_reported_source":"https://openai.com/index/gpt-4-1/","analysis_method":"Internal benchmark","verification_date":null,"provider_id":"openai","input_cost_per_million":0.1,"output_cost_per_million":0.4,"context_window":1047576,"announcement_date":"2025-04-14","param_count":null,"is_open_source":false,"is_new":false,"best_latency":2.0,"latency_provider":"OpenAI","best_throughput":200.0,"throughput_provider":"OpenAI","context_provider":"OpenAI"}]}