Optimizes OpenRouter context with token estimation, model selection, conversation trimming, and chunking for RAG systems. Keeps long chats within model limits to cut costs.
```bash
npx claudepluginhub jeremylongshore/claude-code-plugins-plus-skills --plugin openrouter-pack
```
OpenRouter models have varying context windows (4K to 1M+ tokens). Since pricing is per-token, stuffing unnecessary context wastes money and can degrade output quality. This skill covers context window lookup, token estimation, conversation trimming, chunking strategies, and Anthropic prompt caching for large contexts.
```bash
# Check context window for specific models
curl -s https://openrouter.ai/api/v1/models | jq '[.data[] | select(
  .id == "anthropic/claude-3.5-sonnet" or
  .id == "openai/gpt-4o" or
  .id == "google/gemini-2.0-flash-001" or
  .id == "meta-llama/llama-3.1-70b-instruct"
) | {id, context_length, prompt_per_M: ((.pricing.prompt|tonumber)*1000000)}]'
```
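Each returned entry includes the model's `id`, its `context_length` in tokens, and the prompt price converted to dollars per million tokens (OpenRouter's `pricing.prompt` field is per-token, hence the `*1000000`).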
```python
import os

import requests
from openai import OpenAI

client = OpenAI(
    base_url="https://openrouter.ai/api/v1",
    api_key=os.environ["OPENROUTER_API_KEY"],
    default_headers={"HTTP-Referer": "https://my-app.com", "X-Title": "my-app"},
)

# Cache model metadata at startup
MODELS = {m["id"]: m for m in requests.get("https://openrouter.ai/api/v1/models").json()["data"]}

def estimate_tokens(text: str) -> int:
    """Rough estimate: 1 token ~ 4 characters for English text."""
    return len(text) // 4
```
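The 4-chars-per-token heuristic is deliberately crude. When accuracy matters (e.g. when a prompt sits right at a model's limit), a real tokenizer such as `tiktoken` gives a much closer count -- note the encoding below is OpenAI's, so treat the result as an approximation for other providers:

```python
import tiktoken

def count_tokens_openai(text: str, encoding: str = "cl100k_base") -> int:
    """Exact token count for OpenAI-family models; approximate for others."""
    return len(tiktoken.get_encoding(encoding).encode(text))
```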
```python
def select_model_for_context(messages: list, preferred: str = "anthropic/claude-3.5-sonnet") -> str:
    """Pick a model that fits the context, falling back to larger windows."""
    estimated_tokens = sum(estimate_tokens(m.get("content", "")) for m in messages)

    # Defaults are used only if the cached /models data lacks an entry
    FALLBACK_CHAIN = [
        ("openai/gpt-4o-mini", 128_000),
        ("anthropic/claude-3.5-sonnet", 200_000),
        ("google/gemini-2.0-flash-001", 1_000_000),
    ]

    # Try the preferred model first, keeping a 20% safety margin for the reply
    preferred_ctx = MODELS.get(preferred, {}).get("context_length", 0)
    if estimated_tokens < preferred_ctx * 0.8:
        return preferred

    for model_id, default_ctx in FALLBACK_CHAIN:
        ctx = MODELS.get(model_id, {}).get("context_length", default_ctx)
        if estimated_tokens < ctx * 0.8:
            return model_id

    raise ValueError(f"Content too large ({estimated_tokens} est. tokens)")
```
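A quick usage sketch (the oversized input here is hypothetical):

```python
messages = [
    {"role": "system", "content": "You are a code reviewer."},
    {"role": "user", "content": open("big_diff.txt").read()},  # hypothetical large input
]
model = select_model_for_context(messages)  # may fall back to a larger window
response = client.chat.completions.create(model=model, messages=messages, max_tokens=1024)
```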
```python
def trim_conversation(
    messages: list[dict],
    max_tokens: int = 100_000,
    keep_system: bool = True,
    keep_last_n: int = 4,
) -> list[dict]:
    """Trim conversation history to fit the context window.

    Strategy: keep the system prompt + the last N messages.
    If still too large, reduce to the last 2 messages.
    """
    system = [m for m in messages if m["role"] == "system"] if keep_system else []
    non_system = [m for m in messages if m["role"] != "system"]

    kept = non_system[-keep_last_n:]
    total_est = sum(estimate_tokens(m.get("content", "")) for m in system + kept)
    if total_est > max_tokens and keep_last_n > 2:
        kept = non_system[-2:]

    # Compute trimmed AFTER any reduction so the note's count is accurate
    trimmed = non_system[: len(non_system) - len(kept)]

    result = system + kept
    if trimmed:
        summary_note = {
            "role": "system",
            "content": f"[Previous {len(trimmed)} messages trimmed for context limits]",
        }
        result = system + [summary_note] + kept
    return result
```
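For example, with an illustrative long back-and-forth:

```python
history = [{"role": "system", "content": "You are a helpful assistant."}]
for i in range(40):  # 40 alternating user/assistant turns
    history.append({"role": "user" if i % 2 == 0 else "assistant", "content": f"turn {i}"})

trimmed = trim_conversation(history, keep_last_n=4)
# -> system prompt, a "[Previous 36 messages trimmed...]" note, then the last 4 turns
```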
```python
def chunk_and_process(document: str, question: str, model: str = "openai/gpt-4o-mini",
                      chunk_size: int = 8000, overlap: int = 500) -> str:
    """Process a large document in overlapping chunks, then synthesize."""
    # Split into overlapping character windows
    chunks = []
    start = 0
    while start < len(document):
        chunks.append(document[start:start + chunk_size])
        start += chunk_size - overlap

    # Answer the question against each chunk independently
    results = []
    for i, chunk in enumerate(chunks):
        response = client.chat.completions.create(
            model=model,
            messages=[
                {"role": "system", "content": f"Analyzing chunk {i+1}/{len(chunks)}."},
                {"role": "user", "content": f"Document:\n{chunk}\n\nQuestion: {question}"},
            ],
            max_tokens=1024, temperature=0,
        )
        results.append(response.choices[0].message.content)

    # Synthesize the per-chunk answers into one final answer
    synthesis = client.chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": "Synthesize these partial analyses."},
            {"role": "user", "content": f"Question: {question}\n\nResults:\n" + "\n---\n".join(results)},
        ],
        max_tokens=2048, temperature=0,
    )
    return synthesis.choices[0].message.content
```
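This is a map-reduce pattern: with the defaults, each 8,000-character chunk is roughly 2K tokens at the 4-chars-per-token heuristic, and the 500-character overlap keeps facts that straddle a chunk boundary visible to at least one pass.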
```python
# Anthropic models support prompt caching -- mark large static blocks.
# Subsequent requests with the same cached block cost 90% less for input tokens.
large_reference_document = open("reference_docs.md").read()  # e.g. 50K+ tokens of static context

response = client.chat.completions.create(
    model="anthropic/claude-3.5-sonnet",
    messages=[
        {
            "role": "system",
            "content": [
                {
                    "type": "text",
                    "text": large_reference_document,
                    "cache_control": {"type": "ephemeral"},
                }
            ],
        },
        {"role": "user", "content": "Summarize section 3."},
    ],
    max_tokens=1024,
)
# First request: cache_creation_input_tokens billed at 1.25x the input rate
# Subsequent requests: cache_read_input_tokens at 0.1x rate (90% savings)
```
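Two caveats: the cached block must be byte-identical across requests (any edit invalidates the cache), and Anthropic caches the prompt prefix up to the `cache_control` breakpoint, so keep variable content such as the user turn after the static block. Anthropic also enforces a minimum cacheable size (on the order of 1K tokens), so small blocks won't be cached at all.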
| Error | Cause | Fix |
|---|---|---|
| `400 context_length_exceeded` | Input + `max_tokens` > model limit | Trim messages or use a larger-context model |
| `400 max_tokens too large` | `max_tokens` alone exceeds limit | Reduce `max_tokens` |
| Slow responses | Very large context | Use streaming; consider chunking |
| Degraded quality | Too much irrelevant context | Trim to relevant content only |
- Fetch `/api/v1/models` at startup to cache context limits -- don't hardcode them (they change)
- Set `max_tokens` on every request to prevent runaway completion costs on large contexts
- Watch `prompt_tokens` in responses to detect context bloat before it hits limits (see the sketch below)
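A minimal sketch of that last tip, using the OpenAI-compatible `usage` field on each response (the 80% threshold and `logging` call are illustrative choices):

```python
import logging

def check_context_usage(response, model_id: str, warn_ratio: float = 0.8) -> None:
    """Warn when a request's prompt approaches the model's context limit."""
    ctx = MODELS.get(model_id, {}).get("context_length")
    prompt_tokens = response.usage.prompt_tokens
    if ctx and prompt_tokens > ctx * warn_ratio:
        logging.warning(
            "Prompt used %d of %d tokens (%.0f%%) on %s -- consider trimming",
            prompt_tokens, ctx, 100 * prompt_tokens / ctx, model_id,
        )
```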