{"benchmark_id":"graphwalks-bfs->128k","name":"Graphwalks BFS >128k","parent_benchmark":null,"categories":["reasoning","spatial_reasoning","long_context"],"modality":"text","multilingual":false,"max_score":1.0,"language":"en","description":"A graph reasoning benchmark that evaluates language models' ability to perform breadth-first search (BFS) operations on graphs with context length over 128k tokens, testing long-context reasoning capabilities.","paper_link":null,"implementation_link":null,"verified":false,"created_at":"2026-05-07T16:53:23.596480+00:00","updated_at":"2026-07-05T18:27:49.310791+00:00","statistics":{"total_models":8,"average_score":0.391625,"min_score":0.029,"max_score":0.8,"score_stddev":0.28444880910279796,"verified_count":0,"self_reported_count":8},"child_benchmarks":[],"linked_dataset":null,"models":[{"rank":1,"model_id":"claude-mythos-preview","model_name":"Claude Mythos Preview","organization_id":"anthropic","organization_name":"Anthropic","organization_country":"US","score":0.8,"normalized_score":0.8,"verified":false,"self_reported":true,"self_reported_source":"https://www.anthropic.com/claude-mythos-preview-system-card","analysis_method":"GraphWalks BFS 256K–1M. Opus 4.6: 38.7%, GPT-5.4: 21.4%.","verification_date":null,"provider_id":null,"input_cost_per_million":null,"output_cost_per_million":null,"context_window":null,"announcement_date":"2026-04-07","param_count":null,"is_open_source":false,"is_new":false,"best_latency":null,"latency_provider":null,"best_throughput":null,"throughput_provider":null,"context_provider":null},{"rank":2,"model_id":"claude-opus-4-8","model_name":"Claude Opus 4.8","organization_id":"anthropic","organization_name":"Anthropic","organization_country":"US","score":0.681,"normalized_score":0.681,"verified":false,"self_reported":true,"self_reported_source":"https://www.anthropic.com/news/claude-opus-4-8","analysis_method":"F1 score on the 1M-token subset, averaged over 5 trials. 256K subset: 85.9%.","verification_date":null,"provider_id":"anthropic","input_cost_per_million":5.0,"output_cost_per_million":25.0,"context_window":1000000,"announcement_date":"2026-05-28","param_count":null,"is_open_source":false,"is_new":false,"best_latency":0.5,"latency_provider":"Anthropic","best_throughput":42.0,"throughput_provider":"Anthropic","context_provider":"Anthropic"},{"rank":3,"model_id":"claude-opus-4-6","model_name":"Claude Opus 4.6","organization_id":"anthropic","organization_name":"Anthropic","organization_country":"US","score":0.615,"normalized_score":0.615,"verified":false,"self_reported":true,"self_reported_source":"https://www-cdn.anthropic.com/0dd865075ad3132672ee0ab40b05a53f14cf5288.pdf","analysis_method":"GraphWalks BFS 256K subset of 1M. F1 score with 64k output tokens, 1M context window, average of 5 trials. Max output tokens: 61.1. Full 1M variant: 41.2 (64k output).","verification_date":null,"provider_id":"anthropic","input_cost_per_million":5.0,"output_cost_per_million":25.0,"context_window":1000000,"announcement_date":"2026-02-05","param_count":null,"is_open_source":false,"is_new":false,"best_latency":0.5,"latency_provider":"Anthropic","best_throughput":42.0,"throughput_provider":"Anthropic","context_provider":"Anthropic"},{"rank":4,"model_id":"gpt-5.5","model_name":"GPT-5.5","organization_id":"openai","organization_name":"OpenAI","organization_country":"US","score":0.454,"normalized_score":0.454,"verified":false,"self_reported":true,"self_reported_source":"https://openai.com/index/introducing-gpt-5-5/","analysis_method":"Graphwalks BFS 1M f1. Reasoning effort xhigh.","verification_date":null,"provider_id":"openai","input_cost_per_million":5.0,"output_cost_per_million":30.0,"context_window":1050000,"announcement_date":"2026-04-23","param_count":null,"is_open_source":false,"is_new":false,"best_latency":null,"latency_provider":"OpenAI","best_throughput":null,"throughput_provider":"OpenAI","context_provider":"OpenAI"},{"rank":5,"model_id":"gpt-5.4","model_name":"GPT-5.4","organization_id":"openai","organization_name":"OpenAI","organization_country":"US","score":0.214,"normalized_score":0.214,"verified":false,"self_reported":true,"self_reported_source":"https://openai.com/index/introducing-gpt-5-4/","analysis_method":"Graphwalks BFS 256K-1M. Reasoning effort xhigh.","verification_date":null,"provider_id":"openai","input_cost_per_million":2.5,"output_cost_per_million":15.0,"context_window":1000000,"announcement_date":"2026-03-05","param_count":null,"is_open_source":false,"is_new":false,"best_latency":3.0,"latency_provider":"OpenAI","best_throughput":50.0,"throughput_provider":"OpenAI","context_provider":"OpenAI"},{"rank":6,"model_id":"gpt-4.1-2025-04-14","model_name":"GPT-4.1","organization_id":"openai","organization_name":"OpenAI","organization_country":"US","score":0.19,"normalized_score":0.19,"verified":false,"self_reported":true,"self_reported_source":"https://openai.com/index/introducing-gpt-5-for-developers/","analysis_method":"Internal benchmark","verification_date":null,"provider_id":"openai","input_cost_per_million":2.0,"output_cost_per_million":8.0,"context_window":1047576,"announcement_date":"2025-04-14","param_count":null,"is_open_source":false,"is_new":false,"best_latency":10.0,"latency_provider":"OpenAI","best_throughput":100.0,"throughput_provider":"OpenAI","context_provider":"OpenAI"},{"rank":7,"model_id":"gpt-4.1-mini-2025-04-14","model_name":"GPT-4.1 mini","organization_id":"openai","organization_name":"OpenAI","organization_country":"US","score":0.15,"normalized_score":0.15,"verified":false,"self_reported":true,"self_reported_source":"https://openai.com/index/introducing-gpt-5-for-developers/","analysis_method":"Internal benchmark","verification_date":null,"provider_id":"openai","input_cost_per_million":0.4,"output_cost_per_million":1.6,"context_window":1047576,"announcement_date":"2025-04-14","param_count":null,"is_open_source":false,"is_new":false,"best_latency":5.0,"latency_provider":"OpenAI","best_throughput":150.0,"throughput_provider":"OpenAI","context_provider":"OpenAI"},{"rank":8,"model_id":"gpt-4.1-nano-2025-04-14","model_name":"GPT-4.1 nano","organization_id":"openai","organization_name":"OpenAI","organization_country":"US","score":0.029,"normalized_score":0.029,"verified":false,"self_reported":true,"self_reported_source":"https://openai.com/index/gpt-4-1/","analysis_method":"Internal benchmark","verification_date":null,"provider_id":"openai","input_cost_per_million":0.1,"output_cost_per_million":0.4,"context_window":1047576,"announcement_date":"2025-04-14","param_count":null,"is_open_source":false,"is_new":false,"best_latency":2.0,"latency_provider":"OpenAI","best_throughput":200.0,"throughput_provider":"OpenAI","context_provider":"OpenAI"}]}