textModels:
readonly
[
{
cachedInputTokenCost:
0.075;
description:
"GPT-4o mini ('o' for 'omni') is a fast,
affordable small model for focused tasks. It accepts both text and
image inputs, and produces text outputs (including Structured
Outputs). It is ideal for fine-tuning, and model outputs from a
larger model like GPT-4o can be distilled to GPT-4o-mini to produce
similar results at lower cost and latency. Knowledge cutoff: October
2023.";
inputTokenCost:
0.15;
maxInputTokens:
128000;
maxOutputTokens:
16384;
modelName:
"gpt-4o-mini";
outputTokenCost:
0.6;
outputTokensPerSecond:
65;
provider:
"openai";
type:
"text";
},
{
cachedInputTokenCost:
1.25;
description:
"GPT-4o ('o' for 'omni') is our versatile,
high-intelligence flagship model. It accepts both text and image
inputs, and produces text outputs (including Structured Outputs).
Knowledge cutoff: April 2024.";
inputTokenCost:
2.5;
maxInputTokens:
128000;
maxOutputTokens:
16384;
modelName:
"gpt-4o";
outputTokenCost:
10;
outputTokensPerSecond:
143;
provider:
"openai";
type:
"text";
},
{
cachedInputTokenCost:
0.5;
description:
"o3 is a reasoning model that sets a new standard for math,
science, coding, visual reasoning tasks, and technical writing. Part
of the o-series of reasoning models. Knowledge cutoff: June
2024.";
inputTokenCost:
2;
maxInputTokens:
200000;
maxOutputTokens:
100000;
modelName:
"o3";
outputTokenCost:
8;
outputTokensPerSecond:
94;
provider:
"openai";
reasoning:
{
canDisable:
false;
defaultLevel:
"medium";
levels:
readonly
["low",
"medium",
"high"];
outputsSignatures:
false;
outputsThinking:
false;
};
type:
"text";
},
{
cachedInputTokenCost:
0.55;
description:
"o3-mini is our most recent small reasoning model, providing
high intelligence at the same cost and latency targets of o1-mini.
o3-mini also supports key developer features, like Structured
Outputs, function calling, Batch API, and more. Like other models in
the o-series, it is designed to excel at science, math, and coding
tasks. Knowledge cutoff: June 2024.";
inputTokenCost:
1.1;
maxInputTokens:
200000;
maxOutputTokens:
100000;
modelName:
"o3-mini";
outputTokenCost:
4.4;
outputTokensPerSecond:
214;
provider:
"openai";
reasoning:
{
canDisable:
false;
defaultLevel:
"medium";
levels:
readonly
["low",
"medium",
"high"];
outputsSignatures:
false;
outputsThinking:
false;
};
type:
"text";
},
{
cachedInputTokenCost:
0.3;
description:
"Latest small o-series model optimized for fast, effective
reasoning with exceptional performance in coding and visual tasks.
Knowledge cutoff: June 2024.";
inputTokenCost:
0.6;
maxInputTokens:
200000;
maxOutputTokens:
100000;
modelName:
"o4-mini";
outputTokenCost:
2.4;
outputTokensPerSecond:
135;
provider:
"openai";
reasoning:
{
canDisable:
false;
defaultLevel:
"medium";
levels:
readonly
["low",
"medium",
"high"];
outputsSignatures:
false;
outputsThinking:
false;
};
type:
"text";
},
{
description:
"o3-pro uses more compute for complex reasoning tasks.
Available via Responses API only. Requests may take several minutes.
Knowledge cutoff: June 2024.";
inputTokenCost:
20;
maxInputTokens:
200000;
maxOutputTokens:
100000;
modelName:
"o3-pro";
outputTokenCost:
80;
provider:
"openai";
reasoning:
{
canDisable:
false;
outputsSignatures:
false;
outputsThinking:
false;
};
type:
"text";
},
{
cachedInputTokenCost:
7.5;
description:
"o1 is a reasoning model designed to excel at complex
reasoning tasks including science, math, and coding. The knowledge
cutoff for o1 models is October, 2023.";
inputTokenCost:
15;
maxInputTokens:
200000;
maxOutputTokens:
100000;
modelName:
"o1";
outputTokenCost:
60;
outputTokensPerSecond:
100;
provider:
"openai";
reasoning:
{
canDisable:
false;
defaultLevel:
"medium";
levels:
readonly
["low",
"medium",
"high"];
outputsSignatures:
false;
outputsThinking:
false;
};
type:
"text";
},
{
description:
"GPT-4 is an older version of a high-intelligence GPT model,
usable in Chat Completions. Learn more in the text generation guide.
The knowledge cutoff for the latest GPT-4 Turbo version is December,
2023.";
disabled:
true;
inputTokenCost:
10;
maxInputTokens:
128000;
maxOutputTokens:
4096;
modelName:
"gpt-4-turbo";
outputTokenCost:
30;
provider:
"openai";
type:
"text";
},
{
description:
"GPT-4 is an older version of a high-intelligence GPT model,
usable in Chat Completions. Learn more in the text generation guide.
The knowledge cutoff for the latest GPT-4 Turbo version is December,
2023.";
disabled:
true;
inputTokenCost:
30;
maxInputTokens:
8192;
maxOutputTokens:
8192;
modelName:
"gpt-4";
outputTokenCost:
60;
provider:
"openai";
type:
"text";
},
{
description:
"GPT-3.5 Turbo models can understand and generate natural
language or code and have been optimized for chat using the Chat
Completions API but work well for non-chat tasks as well.
gpt-4o-mini should be used in place of gpt-3.5-turbo, as it is
cheaper, more capable, multimodal, and just as fast.";
disabled:
true;
inputTokenCost:
0.5;
maxInputTokens:
16385;
maxOutputTokens:
4096;
modelName:
"gpt-3.5-turbo";
outputTokenCost:
1.5;
provider:
"openai";
type:
"text";
},
{
cachedInputTokenCost:
0.5;
description:
"GPT-4.1 excels at instruction following and tool calling with
1M token context window. Knowledge cutoff: June 2024.";
inputTokenCost:
2;
maxInputTokens:
1047576;
maxOutputTokens:
32768;
modelName:
"gpt-4.1";
outputTokenCost:
8;
outputTokensPerSecond:
105;
provider:
"openai";
type:
"text";
},
{
cachedInputTokenCost:
0.1;
description:
"GPT-4.1 mini excels at instruction following and tool calling
with 1M token context window and low latency. Knowledge cutoff: June
2024.";
inputTokenCost:
0.4;
maxInputTokens:
1047576;
maxOutputTokens:
32768;
modelName:
"gpt-4.1-mini";
outputTokenCost:
1.6;
outputTokensPerSecond:
78;
provider:
"openai";
type:
"text";
},
{
cachedInputTokenCost:
0.025;
description:
"GPT-4.1 nano is the fastest and most affordable GPT-4.1
variant with 1M token context window. Knowledge cutoff: June
2024.";
inputTokenCost:
0.1;
maxInputTokens:
1047576;
maxOutputTokens:
32768;
modelName:
"gpt-4.1-nano";
outputTokenCost:
0.4;
outputTokensPerSecond:
142;
provider:
"openai";
type:
"text";
},
{
cachedInputTokenCost:
0.125;
description:
"GPT-5 is a frontier reasoning model with 400K context window.
Supports reasoning tokens. Knowledge cutoff: September
2024.";
inputTokenCost:
1.25;
maxInputTokens:
400000;
maxOutputTokens:
128000;
modelName:
"gpt-5";
outputTokenCost:
10;
outputTokensPerSecond:
72;
provider:
"openai";
reasoning:
{
canDisable:
false;
defaultLevel:
"medium";
levels:
readonly
["minimal",
"low",
"medium",
"high"];
outputsSignatures:
false;
outputsThinking:
false;
};
type:
"text";
},
{
cachedInputTokenCost:
0.025;
description:
"GPT-5 mini is a faster, more cost-efficient version of GPT-5
with 400K context window. Knowledge cutoff: May 2024.";
inputTokenCost:
0.25;
maxInputTokens:
400000;
maxOutputTokens:
128000;
modelName:
"gpt-5-mini";
outputTokenCost:
2;
outputTokensPerSecond:
69;
provider:
"openai";
reasoning:
{
canDisable:
false;
defaultLevel:
"medium";
levels:
readonly
["minimal",
"low",
"medium",
"high"];
outputsSignatures:
false;
outputsThinking:
false;
};
type:
"text";
},
{
cachedInputTokenCost:
0.005;
description:
"GPT-5 nano is the fastest and most affordable GPT-5 variant
with 400K context window. Knowledge cutoff: May 2024.";
inputTokenCost:
0.05;
maxInputTokens:
400000;
maxOutputTokens:
128000;
modelName:
"gpt-5-nano";
outputTokenCost:
0.4;
outputTokensPerSecond:
140;
provider:
"openai";
reasoning:
{
canDisable:
false;
defaultLevel:
"medium";
levels:
readonly
["minimal",
"low",
"medium",
"high"];
outputsSignatures:
false;
outputsThinking:
false;
};
type:
"text";
},
{
cachedInputTokenCost:
0.125;
description:
"GPT-5.1 is the flagship model for coding and agentic tasks
with configurable reasoning effort. 400K context window. Knowledge
cutoff: September 2024.";
inputTokenCost:
1.25;
maxInputTokens:
400000;
maxOutputTokens:
128000;
modelName:
"gpt-5.1";
outputTokenCost:
10;
provider:
"openai";
reasoning:
{
canDisable:
true;
defaultLevel:
"none";
levels:
readonly
["none",
"low",
"medium",
"high"];
outputsSignatures:
false;
outputsThinking:
false;
};
type:
"text";
},
{
cachedInputTokenCost:
0.175;
description:
"GPT-5.2 is the flagship model for coding and agentic tasks
across industries. 400K context window. Knowledge cutoff: August
2025.";
inputTokenCost:
1.75;
maxInputTokens:
400000;
maxOutputTokens:
128000;
modelName:
"gpt-5.2";
outputTokenCost:
14;
outputTokensPerSecond:
61;
provider:
"openai";
reasoning:
{
canDisable:
true;
defaultLevel:
"none";
levels:
readonly
["none",
"low",
"medium",
"high"];
outputsSignatures:
false;
outputsThinking:
false;
};
type:
"text";
},
{
description:
"GPT-5.2 Pro uses more compute for complex reasoning tasks.
400K context window. Knowledge cutoff: August 2025.";
inputTokenCost:
21;
maxInputTokens:
400000;
maxOutputTokens:
128000;
modelName:
"gpt-5.2-pro";
outputTokenCost:
168;
provider:
"openai";
reasoning:
{
canDisable:
false;
outputsSignatures:
false;
outputsThinking:
false;
};
type:
"text";
},
{
cachedInputTokenCost:
0.25;
description:
"GPT-5.4 is the most capable and efficient frontier model for
complex professional work. 1M context window, state-of-the-art
coding and tool use. Standard pricing for ≤272K tokens, 2x
input/1.5x output for >272K. Knowledge cutoff: August
2025.";
inputTokenCost:
2.5;
maxInputTokens:
1050000;
maxOutputTokens:
128000;
modelName:
"gpt-5.4";
outputTokenCost:
15;
provider:
"openai";
reasoning:
{
canDisable:
true;
defaultLevel:
"none";
levels:
readonly
["none",
"low",
"medium",
"high",
"xhigh"];
outputsSignatures:
false;
outputsThinking:
false;
};
type:
"text";
},
{
description:
"GPT-5.4 Pro uses more compute for complex reasoning tasks. 1M
context window. Standard pricing for ≤272K tokens. Knowledge cutoff:
August 2025.";
inputTokenCost:
30;
maxInputTokens:
1050000;
maxOutputTokens:
128000;
modelName:
"gpt-5.4-pro";
outputTokenCost:
180;
provider:
"openai";
reasoning:
{
canDisable:
false;
defaultLevel:
"medium";
levels:
readonly
["medium",
"high",
"xhigh"];
outputsSignatures:
false;
outputsThinking:
false;
};
type:
"text";
},
{
description:
"Latest Gemini 3.1 Pro with 1M context window and 64K output.
Standard pricing for ≤200k tokens ($2.00 input/$12.00 output),
higher rates for >200k tokens ($4.00 input/$18.00 output).
Released Feb 2026.";
inputTokenCost:
2;
maxInputTokens:
1048576;
maxOutputTokens:
65536;
modelName:
"gemini-3.1-pro-preview";
outputTokenCost:
12;
outputTokensPerSecond:
112;
provider:
"google";
reasoning:
{
canDisable:
false;
defaultLevel:
"high";
levels:
readonly
["low",
"medium",
"high"];
outputsSignatures:
true;
outputsThinking:
true;
};
type:
"text";
},
{
description:
"DEPRECATED: Shut down March 9, 2026. Use
gemini-3.1-pro-preview instead.";
disabled:
true;
inputTokenCost:
2;
maxInputTokens:
1048576;
maxOutputTokens:
65536;
modelName:
"gemini-3-pro-preview";
outputTokenCost:
12;
provider:
"google";
type:
"text";
},
{
description:
"Latest Gemini 3 flash model with 1M context window and 64K
output. Outperforms 2.5 Pro while being 3x faster. Optimized for
agentic workflows and coding. Includes context caching for 90% cost
reductions.";
inputTokenCost:
0.5;
maxInputTokens:
1048576;
maxOutputTokens:
65536;
modelName:
"gemini-3-flash-preview";
outputTokenCost:
3;
outputTokensPerSecond:
146;
provider:
"google";
reasoning:
{
canDisable:
false;
defaultLevel:
"high";
levels:
readonly
["minimal",
"low",
"medium",
"high"];
outputsSignatures:
true;
outputsThinking:
true;
};
type:
"text";
},
{
description:
"Most cost-effective Gemini 3.1 model with thinking support
and 1M context window. 2.5x faster TTFA and 45% faster output than
2.5 Flash. Released March 2026.";
inputTokenCost:
0.25;
maxInputTokens:
1048576;
maxOutputTokens:
65536;
modelName:
"gemini-3.1-flash-lite-preview";
outputTokenCost:
1.5;
outputTokensPerSecond:
379;
provider:
"google";
reasoning:
{
canDisable:
false;
defaultLevel:
"minimal";
levels:
readonly
["minimal",
"low",
"medium",
"high"];
outputsSignatures:
true;
outputsThinking:
true;
};
type:
"text";
},
{
description:
"High-performance Gemini 2.5 model with 2M context window.
Adaptive thinking for complex reasoning and coding. Standard pricing
for ≤200k tokens ($1.25 input/$10.00 output), higher rates for
>200k tokens ($2.50 input/$15.00 output). Batch API: 50%
discount.";
inputTokenCost:
1.25;
maxInputTokens:
2097152;
maxOutputTokens:
65536;
modelName:
"gemini-2.5-pro";
outputTokenCost:
10;
outputTokensPerSecond:
134;
provider:
"google";
reasoning:
{
canDisable:
false;
outputsSignatures:
true;
outputsThinking:
true;
};
type:
"text";
},
{
description:
"Balanced Gemini 2.5 model with excellent performance-to-cost
ratio. Lightning-fast with controllable thinking budgets. 1M context
window. Context caching available for up to 75% cost
reduction.";
inputTokenCost:
0.3;
maxInputTokens:
1048576;
maxOutputTokens:
65536;
modelName:
"gemini-2.5-flash";
outputTokenCost:
2.5;
outputTokensPerSecond:
245;
provider:
"google";
reasoning:
{
canDisable:
true;
outputsSignatures:
true;
outputsThinking:
true;
};
type:
"text";
},
{
description:
"Most cost-effective Gemini 2.5 option for high-throughput
applications. 1M context window.";
inputTokenCost:
0.1;
maxInputTokens:
1048576;
maxOutputTokens:
65536;
modelName:
"gemini-2.5-flash-lite";
outputTokenCost:
0.4;
outputTokensPerSecond:
400;
provider:
"google";
reasoning:
{
canDisable:
true;
outputsSignatures:
false;
outputsThinking:
true;
};
type:
"text";
},
{
description:
"Workhorse model for all daily tasks. Strong overall
performance and supports real-time streaming Live API. 1M context
window. DEPRECATED: Will be shut down on March 31, 2026.";
disabled:
true;
inputTokenCost:
0.1;
maxInputTokens:
1048576;
maxOutputTokens:
8192;
modelName:
"gemini-2.0-flash";
outputTokenCost:
0.4;
outputTokensPerSecond:
213;
provider:
"google";
type:
"text";
},
{
description:
"Strongest model quality, especially for code & world
knowledge; 2M long context. In private beta.";
disabled:
true;
inputTokenCost:
0.5;
maxInputTokens:
2097152;
maxOutputTokens:
8192;
modelName:
"gemini-2.0-pro-exp-02-05";
outputTokenCost:
1.5;
provider:
"google";
type:
"text";
},
{
description:
"Cost effective offering to support high throughput.
DEPRECATED: Will be shut down on March 31, 2026. Use
gemini-2.5-flash-lite instead.";
disabled:
true;
inputTokenCost:
0.075;
maxInputTokens:
1048576;
maxOutputTokens:
8192;
modelName:
"gemini-2.0-flash-lite";
outputTokenCost:
0.3;
provider:
"google";
type:
"text";
},
{
costUnit:
"characters";
description:
"RETIRED: No longer available. Use gemini-2.5-flash
instead.";
disabled:
true;
inputTokenCost:
0.01875;
maxInputTokens:
1048576;
maxOutputTokens:
8192;
modelName:
"gemini-1.5-flash";
outputTokenCost:
0.075;
outputTokensPerSecond:
178;
provider:
"google";
type:
"text";
},
{
costUnit:
"characters";
description:
"RETIRED: No longer available. Use gemini-2.5-pro
instead.";
disabled:
true;
inputTokenCost:
0.3125;
maxInputTokens:
2097152;
maxOutputTokens:
8192;
modelName:
"gemini-1.5-pro";
outputTokenCost:
1.25;
outputTokensPerSecond:
59;
provider:
"google";
type:
"text";
},
{
costUnit:
"characters";
description:
"RETIRED: No longer available. Use gemini-2.5-flash
instead.";
disabled:
true;
inputTokenCost:
0.125;
maxInputTokens:
32760;
maxOutputTokens:
8192;
modelName:
"gemini-1.0-pro";
outputTokenCost:
0.375;
provider:
"google";
type:
"text";
},
{
cachedInputTokenCost:
0.5;
description:
"The most intelligent Claude model for building agents and
coding. 200K context window (1M in beta), 128K max
output.";
inputTokenCost:
5;
maxInputTokens:
200000;
maxOutputTokens:
131072;
modelName:
"claude-opus-4-6";
outputTokenCost:
25;
outputTokensPerSecond:
53;
provider:
"anthropic";
reasoning:
{
canDisable:
true;
outputsSignatures:
true;
outputsThinking:
true;
};
type:
"text";
},
{
cachedInputTokenCost:
0.3;
description:
"The best combination of speed and intelligence. 200K context
window (1M in beta), 64K max output.";
inputTokenCost:
3;
maxInputTokens:
200000;
maxOutputTokens:
64000;
modelName:
"claude-sonnet-4-6";
outputTokenCost:
15;
provider:
"anthropic";
reasoning:
{
canDisable:
true;
outputsSignatures:
true;
outputsThinking:
true;
};
type:
"text";
},
{
cachedInputTokenCost:
0.1;
description:
"The fastest Claude model with near-frontier intelligence.
200K context window, 64K max output.";
inputTokenCost:
1;
maxInputTokens:
200000;
maxOutputTokens:
64000;
modelName:
"claude-haiku-4-5-20251001";
outputTokenCost:
5;
outputTokensPerSecond:
97;
provider:
"anthropic";
reasoning:
{
canDisable:
true;
outputsSignatures:
true;
outputsThinking:
true;
};
type:
"text";
},
{
description:
"Claude 3.7 Sonnet — legacy model. Use claude-sonnet-4-6
instead.";
disabled:
true;
inputTokenCost:
3;
maxInputTokens:
200000;
maxOutputTokens:
8192;
modelName:
"claude-3-7-sonnet-latest";
outputTokenCost:
15;
outputTokensPerSecond:
78;
provider:
"anthropic";
reasoning:
{
canDisable:
true;
outputsSignatures:
true;
outputsThinking:
true;
};
type:
"text";
},
{
description:
"Claude 3.5 Haiku — legacy model. Use
claude-haiku-4-5-20251001 instead.";
disabled:
true;
inputTokenCost:
0.8;
maxInputTokens:
200000;
maxOutputTokens:
8192;
modelName:
"claude-3-5-haiku-latest";
outputTokenCost:
4;
outputTokensPerSecond:
66;
provider:
"anthropic";
type:
"text";
},
{
description:
"Runs via ollama";
maxInputTokens:
128000;
maxOutputTokens:
128000;
modelName:
"deepseek-r1:8b";
provider:
"ollama";
type:
"text";
},
{
description:
"Runs via ollama";
maxInputTokens:
128000;
maxOutputTokens:
128000;
modelName:
"mistral:latest";
provider:
"ollama";
type:
"text";
},
{
description:
"Fine tuned Mistral 7B model, chunked into parts of 50 chars
each, 100 iterations.";
maxInputTokens:
8192;
maxOutputTokens:
8192;
modelName:
"mistral-adapters-chunk50-iters100";
provider:
"local";
type:
"text";
},
{
maxInputTokens:
256;
maxOutputTokens:
256;
modelName:
"llama-7b";
provider:
"replicate";
type:
"text";
},
] = ...