Provides decision framework and Python code for hybrid regex-LLM parsing of structured text like quizzes, forms, and invoices—prioritize regex, LLM for edge cases.
npx claudepluginhub siniyayousuf/everything_claudecodeThis skill uses the workspace's default tool permissions.
A practical decision framework for parsing structured text (quizzes, forms, invoices, documents). The key insight: regex handles 95-98% of cases cheaply and deterministically. Reserve expensive LLM calls for the remaining edge cases.
Provides Ktor server patterns for routing DSL, plugins (auth, CORS, serialization), Koin DI, WebSockets, services, and testApplication testing.
Conducts multi-source web research with firecrawl and exa MCPs: searches, scrapes pages, synthesizes cited reports. For deep dives, competitive analysis, tech evaluations, or due diligence.
Provides demand forecasting, safety stock optimization, replenishment planning, and promotional lift estimation for multi-location retailers managing 300-800 SKUs.
A practical decision framework for parsing structured text (quizzes, forms, invoices, documents). The key insight: regex handles 95-98% of cases cheaply and deterministically. Reserve expensive LLM calls for the remaining edge cases.
Is the text format consistent and repeating?
├── Yes (>90% follows a pattern) → Start with Regex
│ ├── Regex handles 95%+ → Done, no LLM needed
│ └── Regex handles <95% → Add LLM for edge cases only
└── No (free-form, highly variable) → Use LLM directly
Source Text
│
▼
[Regex Parser] ─── Extracts structure (95-98% accuracy)
│
▼
[Text Cleaner] ─── Removes noise (markers, page numbers, artifacts)
│
▼
[Confidence Scorer] ─── Flags low-confidence extractions
│
├── High confidence (≥0.95) → Direct output
│
└── Low confidence (<0.95) → [LLM Validator] → Output
import re
from dataclasses import dataclass
@dataclass(frozen=True)
class ParsedItem:
id: str
text: str
choices: tuple[str, ...]
answer: str
confidence: float = 1.0
def parse_structured_text(content: str) -> list[ParsedItem]:
"""Parse structured text using regex patterns."""
pattern = re.compile(
r"(?P<id>\d+)\.\s*(?P<text>.+?)\n"
r"(?P<choices>(?:[A-D]\..+?\n)+)"
r"Answer:\s*(?P<answer>[A-D])",
re.MULTILINE | re.DOTALL,
)
items = []
for match in pattern.finditer(content):
choices = tuple(
c.strip() for c in re.findall(r"[A-D]\.\s*(.+)", match.group("choices"))
)
items.append(ParsedItem(
id=match.group("id"),
text=match.group("text").strip(),
choices=choices,
answer=match.group("answer"),
))
return items
Flag items that may need LLM review:
@dataclass(frozen=True)
class ConfidenceFlag:
item_id: str
score: float
reasons: tuple[str, ...]
def score_confidence(item: ParsedItem) -> ConfidenceFlag:
"""Score extraction confidence and flag issues."""
reasons = []
score = 1.0
if len(item.choices) < 3:
reasons.append("few_choices")
score -= 0.3
if not item.answer:
reasons.append("missing_answer")
score -= 0.5
if len(item.text) < 10:
reasons.append("short_text")
score -= 0.2
return ConfidenceFlag(
item_id=item.id,
score=max(0.0, score),
reasons=tuple(reasons),
)
def identify_low_confidence(
items: list[ParsedItem],
threshold: float = 0.95,
) -> list[ConfidenceFlag]:
"""Return items below confidence threshold."""
flags = [score_confidence(item) for item in items]
return [f for f in flags if f.score < threshold]
def validate_with_llm(
item: ParsedItem,
original_text: str,
client,
) -> ParsedItem:
"""Use LLM to fix low-confidence extractions."""
response = client.messages.create(
model="claude-haiku-4-5-20251001", # Cheapest model for validation
max_tokens=500,
messages=[{
"role": "user",
"content": (
f"Extract the question, choices, and answer from this text.\n\n"
f"Text: {original_text}\n\n"
f"Current extraction: {item}\n\n"
f"Return corrected JSON if needed, or 'CORRECT' if accurate."
),
}],
)
# Parse LLM response and return corrected item...
return corrected_item
def process_document(
content: str,
*,
llm_client=None,
confidence_threshold: float = 0.95,
) -> list[ParsedItem]:
"""Full pipeline: regex -> confidence check -> LLM for edge cases."""
# Step 1: Regex extraction (handles 95-98%)
items = parse_structured_text(content)
# Step 2: Confidence scoring
low_confidence = identify_low_confidence(items, confidence_threshold)
if not low_confidence or llm_client is None:
return items
# Step 3: LLM validation (only for flagged items)
low_conf_ids = {f.item_id for f in low_confidence}
result = []
for item in items:
if item.id in low_conf_ids:
result.append(validate_with_llm(item, content, llm_client))
else:
result.append(item)
return result
From a production quiz parsing pipeline (410 items):
| Metric | Value |
|---|---|
| Regex success rate | 98.0% |
| Low confidence items | 8 (2.0%) |
| LLM calls needed | ~5 |
| Cost savings vs all-LLM | ~95% |
| Test coverage | 93% |