{"benchmark_id":"t2-bench","name":"t2-bench","parent_benchmark":null,"categories":["reasoning","agents","tool_calling"],"modality":"text","multilingual":false,"max_score":1.0,"language":"en","description":"t2-bench is a benchmark for evaluating agentic tool use capabilities, measuring how well models can select, sequence, and utilize tools to solve complex tasks. It tests autonomous planning and execution in multi-step scenarios.","paper_link":null,"implementation_link":null,"verified":false,"created_at":"2026-05-07T16:53:26.386840+00:00","updated_at":"2026-07-05T18:28:02.368757+00:00","statistics":{"total_models":23,"average_score":0.7296086956521737,"min_score":0.116,"max_score":0.993,"score_stddev":0.2044673210454279,"verified_count":0,"self_reported_count":23},"child_benchmarks":[],"linked_dataset":null,"models":[{"rank":1,"model_id":"gemini-3.1-pro-preview","model_name":"Gemini 3.1 Pro","organization_id":"google","organization_name":"Google","organization_country":"US","score":0.993,"normalized_score":0.993,"verified":false,"self_reported":true,"self_reported_source":"https://deepmind.google/models/evals-methodology/gemini-3-1-pro","analysis_method":"Telecom","verification_date":null,"provider_id":"google","input_cost_per_million":2.5,"output_cost_per_million":15.0,"context_window":1048576,"announcement_date":"2026-02-19","param_count":null,"is_open_source":false,"is_new":false,"best_latency":0.6,"latency_provider":"Google","best_throughput":90.0,"throughput_provider":"Google","context_provider":"Google"},{"rank":2,"model_id":"gemini-3-flash-preview","model_name":"Gemini 3 Flash","organization_id":"google","organization_name":"Google","organization_country":"US","score":0.902,"normalized_score":0.902,"verified":false,"self_reported":true,"self_reported_source":"https://blog.google/products/gemini/gemini-3-flash/","analysis_method":null,"verification_date":null,"provider_id":"google","input_cost_per_million":0.5,"output_cost_per_million":3.0,"context_window":1000000,"announcement_date":"2025-12-17","param_count":null,"is_open_source":false,"is_new":false,"best_latency":null,"latency_provider":"Google","best_throughput":null,"throughput_provider":"Google","context_provider":"Google"},{"rank":3,"model_id":"glm-5","model_name":"GLM-5","organization_id":"zai-org","organization_name":"Zhipu AI","organization_country":"CN","score":0.897,"normalized_score":0.897,"verified":false,"self_reported":true,"self_reported_source":"https://docs.z.ai/guides/llm/glm-5","analysis_method":null,"verification_date":null,"provider_id":"z","input_cost_per_million":1.0,"output_cost_per_million":3.2,"context_window":200000,"announcement_date":"2026-02-11","param_count":744000000000,"is_open_source":true,"is_new":false,"best_latency":3.0,"latency_provider":"ZAI","best_throughput":30.0,"throughput_provider":"ZAI","context_provider":"FriendliAI"},{"rank":4,"model_id":"qwen3.5-397b-a17b","model_name":"Qwen3.5-397B-A17B","organization_id":"qwen","organization_name":"Alibaba Cloud / Qwen Team","organization_country":"CN","score":0.867,"normalized_score":0.867,"verified":false,"self_reported":true,"self_reported_source":"https://qwenlm.github.io/blog/qwen3.5/","analysis_method":null,"verification_date":null,"provider_id":null,"input_cost_per_million":null,"output_cost_per_million":null,"context_window":null,"announcement_date":"2026-02-16","param_count":397000000000,"is_open_source":true,"is_new":false,"best_latency":null,"latency_provider":null,"best_throughput":null,"throughput_provider":null,"context_provider":null},{"rank":5,"model_id":"gemma-4-31b-it","model_name":"Gemma 4 31B","organization_id":"google","organization_name":"Google","organization_country":"US","score":0.864,"normalized_score":0.864,"verified":false,"self_reported":true,"self_reported_source":"https://blog.google/innovation-and-ai/technology/developers-tools/gemma-4/","analysis_method":"Retail","verification_date":null,"provider_id":"deepinfra","input_cost_per_million":0.13,"output_cost_per_million":0.38,"context_window":262144,"announcement_date":"2026-04-02","param_count":30700000000,"is_open_source":true,"is_new":false,"best_latency":null,"latency_provider":"DeepInfra","best_throughput":null,"throughput_provider":"DeepInfra","context_provider":"Together"},{"rank":6,"model_id":"gemma-4-26b-a4b-it","model_name":"Gemma 4 26B-A4B","organization_id":"google","organization_name":"Google","organization_country":"US","score":0.855,"normalized_score":0.855,"verified":false,"self_reported":true,"self_reported_source":"https://blog.google/innovation-and-ai/technology/developers-tools/gemma-4/","analysis_method":"Retail","verification_date":null,"provider_id":"novita","input_cost_per_million":0.13,"output_cost_per_million":0.4,"context_window":262144,"announcement_date":"2026-04-02","param_count":25200000000,"is_open_source":true,"is_new":false,"best_latency":null,"latency_provider":"Novita","best_throughput":null,"throughput_provider":"Novita","context_provider":"Novita"},{"rank":7,"model_id":"gemini-3-pro-preview","model_name":"Gemini 3 Pro","organization_id":"google","organization_name":"Google","organization_country":"US","score":0.854,"normalized_score":0.854,"verified":false,"self_reported":true,"self_reported_source":"https://blog.google/products/gemini/gemini-3","analysis_method":null,"verification_date":null,"provider_id":null,"input_cost_per_million":null,"output_cost_per_million":null,"context_window":null,"announcement_date":"2025-11-18","param_count":null,"is_open_source":false,"is_new":false,"best_latency":null,"latency_provider":null,"best_throughput":null,"throughput_provider":null,"context_provider":null},{"rank":8,"model_id":"qwen3.5-35b-a3b","model_name":"Qwen3.5-35B-A3B","organization_id":"qwen","organization_name":"Alibaba Cloud / Qwen Team","organization_country":"CN","score":0.812,"normalized_score":0.812,"verified":false,"self_reported":true,"self_reported_source":"https://qwen.ai/blog?id=qwen3.5","analysis_method":null,"verification_date":null,"provider_id":null,"input_cost_per_million":null,"output_cost_per_million":null,"context_window":null,"announcement_date":"2026-02-24","param_count":35000000000,"is_open_source":true,"is_new":false,"best_latency":null,"latency_provider":null,"best_throughput":null,"throughput_provider":null,"context_provider":null},{"rank":9,"model_id":"deepseek-v3.2-speciale","model_name":"DeepSeek-V3.2-Speciale","organization_id":"deepseek","organization_name":"DeepSeek","organization_country":"CN","score":0.803,"normalized_score":0.803,"verified":false,"self_reported":true,"self_reported_source":"https://huggingface.co/deepseek-ai/DeepSeek-V3.2/resolve/main/assets/paper.pdf","analysis_method":"Pass@1","verification_date":null,"provider_id":null,"input_cost_per_million":null,"output_cost_per_million":null,"context_window":null,"announcement_date":"2025-12-01","param_count":685000000000,"is_open_source":true,"is_new":false,"best_latency":null,"latency_provider":null,"best_throughput":null,"throughput_provider":null,"context_provider":null},{"rank":9,"model_id":"deepseek-v3.2","model_name":"DeepSeek-V3.2","organization_id":"deepseek","organization_name":"DeepSeek","organization_country":"CN","score":0.803,"normalized_score":0.803,"verified":false,"self_reported":true,"self_reported_source":"https://huggingface.co/deepseek-ai/DeepSeek-V3.2/resolve/main/assets/paper.pdf","analysis_method":"Pass@1 (Agentic Tool Use)","verification_date":null,"provider_id":null,"input_cost_per_million":null,"output_cost_per_million":null,"context_window":null,"announcement_date":"2025-12-01","param_count":685000000000,"is_open_source":true,"is_new":false,"best_latency":null,"latency_provider":null,"best_throughput":null,"throughput_provider":null,"context_provider":null},{"rank":11,"model_id":"deepseek-reasoner","model_name":"DeepSeek-V3.2 (Thinking)","organization_id":"deepseek","organization_name":"DeepSeek","organization_country":"CN","score":0.802,"normalized_score":0.802,"verified":false,"self_reported":true,"self_reported_source":"https://huggingface.co/deepseek-ai/DeepSeek-V3.2/resolve/main/assets/paper.pdf","analysis_method":"Pass@1","verification_date":null,"provider_id":null,"input_cost_per_million":null,"output_cost_per_million":null,"context_window":null,"announcement_date":"2025-12-01","param_count":685000000000,"is_open_source":true,"is_new":false,"best_latency":null,"latency_provider":null,"best_throughput":null,"throughput_provider":null,"context_provider":null},{"rank":12,"model_id":"qwen3.5-4b","model_name":"Qwen3.5-4B","organization_id":"qwen","organization_name":"Alibaba Cloud / Qwen Team","organization_country":"CN","score":0.799,"normalized_score":0.799,"verified":false,"self_reported":true,"self_reported_source":"https://huggingface.co/Qwen/Qwen3.5-4B","analysis_method":"Thinking mode","verification_date":null,"provider_id":null,"input_cost_per_million":null,"output_cost_per_million":null,"context_window":null,"announcement_date":"2026-03-02","param_count":4000000000,"is_open_source":true,"is_new":false,"best_latency":null,"latency_provider":null,"best_throughput":null,"throughput_provider":null,"context_provider":null},{"rank":13,"model_id":"qwen3.5-122b-a10b","model_name":"Qwen3.5-122B-A10B","organization_id":"qwen","organization_name":"Alibaba Cloud / Qwen Team","organization_country":"CN","score":0.795,"normalized_score":0.795,"verified":false,"self_reported":true,"self_reported_source":"https://qwen.ai/blog?id=qwen3.5","analysis_method":null,"verification_date":null,"provider_id":null,"input_cost_per_million":null,"output_cost_per_million":null,"context_window":null,"announcement_date":"2026-02-24","param_count":122000000000,"is_open_source":true,"is_new":false,"best_latency":null,"latency_provider":null,"best_throughput":null,"throughput_provider":null,"context_provider":null},{"rank":14,"model_id":"qwen3.5-9b","model_name":"Qwen3.5-9B","organization_id":"qwen","organization_name":"Alibaba Cloud / Qwen Team","organization_country":"CN","score":0.791,"normalized_score":0.791,"verified":false,"self_reported":true,"self_reported_source":"https://huggingface.co/Qwen/Qwen3.5-9B","analysis_method":"Thinking mode","verification_date":null,"provider_id":null,"input_cost_per_million":null,"output_cost_per_million":null,"context_window":null,"announcement_date":"2026-03-02","param_count":9000000000,"is_open_source":true,"is_new":false,"best_latency":null,"latency_provider":null,"best_throughput":null,"throughput_provider":null,"context_provider":null},{"rank":15,"model_id":"qwen3.5-27b","model_name":"Qwen3.5-27B","organization_id":"qwen","organization_name":"Alibaba Cloud / Qwen Team","organization_country":"CN","score":0.79,"normalized_score":0.79,"verified":false,"self_reported":true,"self_reported_source":"https://qwen.ai/blog?id=qwen3.5","analysis_method":null,"verification_date":null,"provider_id":"novita","input_cost_per_million":0.3,"output_cost_per_million":2.4,"context_window":262144,"announcement_date":"2026-02-24","param_count":27000000000,"is_open_source":true,"is_new":false,"best_latency":null,"latency_provider":"Novita","best_throughput":null,"throughput_provider":"Novita","context_provider":"Novita"},{"rank":16,"model_id":"qwen3-max","model_name":"Qwen3 Max","organization_id":"qwen","organization_name":"Alibaba Cloud / Qwen Team","organization_country":"CN","score":0.748,"normalized_score":0.748,"verified":false,"self_reported":true,"self_reported_source":"https://qwenlm.github.io/blog/qwen3/","analysis_method":null,"verification_date":null,"provider_id":null,"input_cost_per_million":null,"output_cost_per_million":null,"context_window":null,"announcement_date":"2025-12-15","param_count":1000000000000,"is_open_source":false,"is_new":false,"best_latency":null,"latency_provider":null,"best_throughput":null,"throughput_provider":null,"context_provider":null},{"rank":17,"model_id":"k-exaone-236b-a23b","model_name":"K-EXAONE-236B-A23B","organization_id":"lg","organization_name":"LG AI Research","organization_country":"KR","score":0.732,"normalized_score":0.732,"verified":false,"self_reported":true,"self_reported_source":"https://friendli.ai/blog/k-exaone-on-serverless","analysis_method":null,"verification_date":null,"provider_id":null,"input_cost_per_million":null,"output_cost_per_million":null,"context_window":null,"announcement_date":"2025-12-31","param_count":236000000000,"is_open_source":false,"is_new":false,"best_latency":null,"latency_provider":null,"best_throughput":null,"throughput_provider":null,"context_provider":null},{"rank":18,"model_id":"gpt-oss-120b-high","model_name":"GPT OSS 120B High","organization_id":"openai","organization_name":"OpenAI","organization_country":"US","score":0.639,"normalized_score":0.639,"verified":false,"self_reported":true,"self_reported_source":"https://openai.com/","analysis_method":null,"verification_date":null,"provider_id":"openai","input_cost_per_million":0.1,"output_cost_per_million":0.5,"context_window":131072,"announcement_date":"2025-08-05","param_count":116800000000,"is_open_source":true,"is_new":false,"best_latency":6.5,"latency_provider":"OpenAI","best_throughput":100.0,"throughput_provider":"OpenAI","context_provider":"OpenAI"},{"rank":19,"model_id":"gemma-4-e4b-it","model_name":"Gemma 4 E4B","organization_id":"google","organization_name":"Google","organization_country":"US","score":0.575,"normalized_score":0.575,"verified":false,"self_reported":true,"self_reported_source":"https://blog.google/innovation-and-ai/technology/developers-tools/gemma-4/","analysis_method":"Retail","verification_date":null,"provider_id":null,"input_cost_per_million":null,"output_cost_per_million":null,"context_window":null,"announcement_date":"2026-04-02","param_count":8000000000,"is_open_source":true,"is_new":false,"best_latency":null,"latency_provider":null,"best_throughput":null,"throughput_provider":null,"context_provider":null},{"rank":20,"model_id":"diffusiongemma-26b-a4b-it","model_name":"DiffusionGemma 26B-A4B","organization_id":"google","organization_name":"Google","organization_country":"US","score":0.562,"normalized_score":0.562,"verified":false,"self_reported":true,"self_reported_source":"https://huggingface.co/google/diffusiongemma-26B-A4B-it","analysis_method":"Tau2 average over 3","verification_date":null,"provider_id":null,"input_cost_per_million":null,"output_cost_per_million":null,"context_window":null,"announcement_date":"2026-06-10","param_count":25200000000,"is_open_source":true,"is_new":false,"best_latency":null,"latency_provider":null,"best_throughput":null,"throughput_provider":null,"context_provider":null},{"rank":21,"model_id":"qwen3.5-2b","model_name":"Qwen3.5-2B","organization_id":"qwen","organization_name":"Alibaba Cloud / Qwen Team","organization_country":"CN","score":0.488,"normalized_score":0.488,"verified":false,"self_reported":true,"self_reported_source":"https://huggingface.co/Qwen/Qwen3.5-2B","analysis_method":"Thinking mode","verification_date":null,"provider_id":null,"input_cost_per_million":null,"output_cost_per_million":null,"context_window":null,"announcement_date":"2026-03-02","param_count":2000000000,"is_open_source":true,"is_new":false,"best_latency":null,"latency_provider":null,"best_throughput":null,"throughput_provider":null,"context_provider":null},{"rank":22,"model_id":"gemma-4-e2b-it","model_name":"Gemma 4 E2B","organization_id":"google","organization_name":"Google","organization_country":"US","score":0.294,"normalized_score":0.294,"verified":false,"self_reported":true,"self_reported_source":"https://blog.google/innovation-and-ai/technology/developers-tools/gemma-4/","analysis_method":"Retail","verification_date":null,"provider_id":null,"input_cost_per_million":null,"output_cost_per_million":null,"context_window":null,"announcement_date":"2026-04-02","param_count":5100000000,"is_open_source":true,"is_new":false,"best_latency":null,"latency_provider":null,"best_throughput":null,"throughput_provider":null,"context_provider":null},{"rank":23,"model_id":"qwen3.5-0.8b","model_name":"Qwen3.5-0.8B","organization_id":"qwen","organization_name":"Alibaba Cloud / Qwen Team","organization_country":"CN","score":0.116,"normalized_score":0.116,"verified":false,"self_reported":true,"self_reported_source":"https://huggingface.co/Qwen/Qwen3.5-0.8B","analysis_method":"Thinking mode","verification_date":null,"provider_id":null,"input_cost_per_million":null,"output_cost_per_million":null,"context_window":null,"announcement_date":"2026-03-02","param_count":800000000,"is_open_source":true,"is_new":false,"best_latency":null,"latency_provider":null,"best_throughput":null,"throughput_provider":null,"context_provider":null}]}