From sciagent-skills
Queries NCBI ClinVar via E-utilities for genetic variant clinical significance, pathogenicity, disease associations. Searches by gene/rsID/condition/review status; returns ClinSig, submitter data, conditions, HGVS.
npx claudepluginhub jaechang-hits/sciagent-skills --plugin sciagent-skills
This skill uses the workspace's default tool permissions.
ClinVar is NCBI's public archive of interpretations of variants submitted by clinical laboratories, researchers, and expert panels. It contains 2M+ variants with clinical significance classifications (Pathogenic, Likely Pathogenic, VUS, Likely Benign, Benign) for over 6,000 conditions. Access is free and requires no authentication via NCBI E-utilities.
Queries NCBI ClinVar for genetic variant clinical significance by gene, position, or condition. Interprets pathogenicity classifications, accesses via E-utilities API or FTP, annotates VCFs.
Queries NCBI ClinVar for genetic variant clinical significance by gene, position, or condition. Interprets pathogenicity classifications, accesses via E-utilities API or FTP, annotates VCFs for genomic medicine.
Queries NCBI dbSNP for SNP records by rsID, gene, or region via E-utilities and Variation Services REST API. Retrieves alleles, MAF, variant class, clinical links, cross-DB IDs like ClinVar.
Share bugs, ideas, or general feedback.
ClinVar is NCBI's public archive of interpretations of variants submitted by clinical laboratories, researchers, and expert panels. It contains 2M+ variants with clinical significance classifications (Pathogenic, Likely Pathogenic, VUS, Likely Benign, Benign) for over 6,000 conditions. Access is free and requires no authentication via NCBI E-utilities.
cosmic-database; for GWAS associations use gwas-database
Python packages: requests, xml.etree.ElementTree (stdlib)
NCBI E-utilities access (email parameter)
pip install requests
# No additional packages required; xml.etree is part of Python stdlib
import requests
EMAIL = "your@email.com" # required by NCBI policy
def clinvar_search(query, retmax=10):
    """Search ClinVar and return a list of ClinVar Variation IDs.

    Args:
        query: Entrez query string, e.g. ``"BRCA1[gene] AND pathogenic[clinsig]"``.
        retmax: Maximum number of IDs to return.

    Returns:
        The ``idlist`` entry of the ESearch JSON response.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    response = requests.get(
        "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi",
        params={
            "db": "clinvar",
            "term": query,
            "retmax": retmax,
            "retmode": "json",
            "email": EMAIL,
        },
        timeout=30,  # do not hang forever on a stalled connection
    )
    response.raise_for_status()
    return response.json()["esearchresult"]["idlist"]
# Example: the first five pathogenic BRCA1 variants
brca1_query = "BRCA1[gene] AND pathogenic[clinsig]"
ids = clinvar_search(brca1_query, retmax=5)
print(f"Found variation IDs: {ids}")
Use ESearch to find ClinVar Variation IDs matching a structured query.
import requests
EMAIL = "your@email.com"
BASE = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils"
def esearch(query, retmax=200):
    """Run an ESearch query against ClinVar.

    Args:
        query: Entrez query string (field tags such as ``[gene]`` or ``[clinsig]``).
        retmax: Maximum number of IDs to return in this call.

    Returns:
        A ``(idlist, count)`` tuple: the IDs returned by this call and the
        total number of matches reported by ESearch (which may exceed
        ``len(idlist)`` when more than ``retmax`` records match).

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    response = requests.get(
        f"{BASE}/esearch.fcgi",
        params={
            "db": "clinvar",
            "term": query,
            "retmax": retmax,
            "retmode": "json",
            "email": EMAIL,
        },
        timeout=30,  # do not hang forever on a stalled connection
    )
    response.raise_for_status()
    result = response.json()["esearchresult"]
    return result["idlist"], int(result["count"])
# Pathogenic or likely-pathogenic variants of a single gene
brca2_query = "BRCA2[gene] AND (pathogenic[clinsig] OR likely pathogenic[clinsig])"
ids, total = esearch(brca2_query)
print(f"Pathogenic/LP BRCA2 variants: {total} total, retrieved {len(ids)}")
print(f"First 5 IDs: {ids[:5]}")

# Lookup through a dbSNP rsID
ids, _ = esearch("rs80357906[rs]")
print(f"Variant IDs for rs80357906: {ids}")

# Lookup through a condition (disease) name
breast_cancer_query = "breast cancer[dis] AND pathogenic[clinsig]"
ids, total = esearch(breast_cancer_query)
print(f"Pathogenic variants for breast cancer: {total}")
Retrieve structured summary data (JSON) for a list of Variation IDs.
import requests, json
EMAIL = "your@email.com"
BASE = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils"
def esummary(ids):
    """Fetch ESummary records for a list of ClinVar variation IDs.

    Args:
        ids: Iterable of ClinVar Variation IDs (strings or integers).

    Returns:
        The ``result`` object of the ESummary JSON response: a dict with a
        ``"uids"`` list plus one entry per ID. An empty dict is returned
        when ``ids`` is empty, without contacting the server.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    id_param = ",".join(str(i) for i in ids)
    if not id_param:
        # An empty "id" parameter is not a valid ESummary request.
        return {}
    # POST keeps long ID lists out of the URL.
    response = requests.post(
        f"{BASE}/esummary.fcgi",
        data={
            "db": "clinvar",
            "id": id_param,
            "retmode": "json",
            "email": EMAIL,
        },
        timeout=30,
    )
    response.raise_for_status()
    return response.json()["result"]
def esearch_func(q):
    """Return up to five ClinVar Variation IDs matching Entrez query *q*."""
    response = requests.get(
        f"{BASE}/esearch.fcgi",
        params={"db": "clinvar", "term": q, "retmax": 5, "retmode": "json", "email": EMAIL},
        timeout=30,
    )
    response.raise_for_status()
    return response.json()["esearchresult"]["idlist"]


# Manual example with known IDs
sample_ids = ["12375", "17684", "54270"]
result = esummary(sample_ids)
for vid in result.get("uids", []):
    rec = result[vid]
    # NOTE(review): newer ESummary payloads appear to report the classification
    # under "germline_classification"; fall back to the legacy key. Confirm
    # against a live response.
    classification = rec.get("germline_classification") or rec.get("clinical_significance") or {}
    # A record may carry an empty gene list; avoid indexing into it.
    genes = rec.get("genes") or [{}]
    print(f"\nVariation {vid}: {rec.get('title')}")
    print(f" ClinSig : {classification.get('description')}")
    print(f" Review : {classification.get('review_status')}")
    print(f" Gene : {genes[0].get('symbol')}")
Retrieve the complete variant record in XML for detailed submitter and condition data.
import requests
import xml.etree.ElementTree as ET
EMAIL = "your@email.com"
BASE = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils"
def efetch_xml(ids):
    """Fetch full ClinVar records as XML and return the parsed root element.

    Args:
        ids: Iterable of ClinVar IDs (strings or integers).

    Returns:
        The root ``xml.etree.ElementTree.Element`` of the EFetch response.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        xml.etree.ElementTree.ParseError: If the body is not well-formed XML
            (for example a truncated response).
    """
    response = requests.post(
        f"{BASE}/efetch.fcgi",
        data={
            "db": "clinvar",
            "id": ",".join(str(i) for i in ids),
            "rettype": "clinvarset",
            "retmode": "xml",
            "email": EMAIL,
        },
        timeout=30,  # a stalled download would otherwise block indefinitely
    )
    response.raise_for_status()
    # Parse the raw bytes so the XML declaration, not a guessed text encoding,
    # decides how the document is decoded.
    return ET.fromstring(response.content)
root = efetch_xml(["12375"])

# Walk every submitter-level assertion in the record
for ca in root.iter("ClinVarAssertion"):
    clin_sig = ca.find(".//ClinicalSignificance/Description")
    submission = ca.find(".//ClinVarSubmissionID")
    if clin_sig is None or submission is None:
        continue
    # The label says "Submitter", so print the submitter rather than the
    # submission date.
    # NOTE(review): assumes ClinVarSubmissionID carries a "submitter"
    # attribute next to "submitterDate" — confirm against the ClinVar XSD.
    print(f"Submitter: {submission.get('submitter', 'n/a')} | ClinSig: {clin_sig.text}")
For large-scale queries, download and parse the full variant summary file.
import urllib.request
import gzip, csv, io
# Full summary (tab-separated, ~300 MB compressed)
URL = "https://ftp.ncbi.nlm.nih.gov/pub/clinvar/tab_delimited/variant_summary.txt.gz"

# Decompress and parse the response as a stream: the archive is read once,
# front to back, and never written to disk.
pathogenic_brca1 = []
seen_alleles = set()
with urllib.request.urlopen(URL) as resp, \
        gzip.open(resp, "rt", encoding="utf-8", newline="") as f:
    for row in csv.DictReader(f, delimiter="\t"):
        if row["GeneSymbol"] != "BRCA1" or "Pathogenic" not in row["ClinicalSignificance"]:
            continue
        # The file has an "Assembly" column, i.e. the same allele can be listed
        # once per genome build; count each allele only once.
        allele_id = row["#AlleleID"]
        if allele_id in seen_alleles:
            continue
        seen_alleles.add(allele_id)
        pathogenic_brca1.append({
            "name": row["Name"],
            "clinsig": row["ClinicalSignificance"],
            "condition": row["PhenotypeList"],
            "rsid": row["RS# (dbSNP)"],
        })

print(f"Pathogenic BRCA1 variants: {len(pathogenic_brca1)}")
for v in pathogenic_brca1[:3]:
    # NOTE(review): the file appears to use -1 when no dbSNP ID exists —
    # confirm; such rows would otherwise print as "rs-1".
    rs_label = f"rs{v['rsid']}" if v["rsid"] not in ("", "-1") else "no rsID"
    print(f" {v['name']} | {v['clinsig']} | {rs_label}")
Filter variants by review status (evidence quality) and find conflicts.
import requests
EMAIL = "your@email.com"
BASE = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils"
# Stars correspond to review levels:
# 0 = no assertion criteria, 1 = criteria provided (single),
# 2 = criteria provided (multiple), 3 = expert panel, 4 = practice guideline
def search_by_review_stars(gene, min_stars=2):
    """Search for variants with at least min_stars review status.

    Args:
        gene: Gene symbol used with the ``[gene]`` field tag.
        min_stars: Lowest star level to include. Levels 1-4 map to review
            status phrases; values below 1 behave like 1.

    Returns:
        The ``esearchresult`` object of the ESearch JSON response (includes
        ``"count"`` and ``"idlist"``).

    Raises:
        ValueError: If ``min_stars`` is above 4, which would leave the query
            with an empty review-status clause.
        requests.HTTPError: If the server answers with an error status.
    """
    star_terms = {
        1: "criteria provided, single submitter",
        2: "criteria provided, multiple submitters, no conflicts",
        3: "reviewed by expert panel",
        4: "practice guideline",
    }
    terms = [
        f'"{phrase}"[review status]'
        for stars, phrase in star_terms.items()
        if stars >= min_stars
    ]
    if not terms:
        # Without this guard the query would end in "AND ()".
        raise ValueError(f"min_stars must be at most 4, got {min_stars!r}")
    query = f"{gene}[gene] AND (" + " OR ".join(terms) + ")"
    response = requests.get(
        f"{BASE}/esearch.fcgi",
        params={
            "db": "clinvar",
            "term": query,
            "retmax": 100,
            "retmode": "json",
            "email": EMAIL,
        },
        timeout=30,
    )
    # Surface HTTP errors (e.g. rate limiting) instead of a JSON decode failure.
    response.raise_for_status()
    return response.json()["esearchresult"]
# Three stars and up: expert panel or practice guideline
result = search_by_review_stars(gene="BRCA1", min_stars=3)
print(f"Expert-reviewed BRCA1 variants: {result['count']}")
Extract condition (phenotype) data from ClinVar records.
import requests, json
EMAIL = "your@email.com"
BASE = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils"
def get_conditions(variation_ids):
    """Return condition data for a list of ClinVar variation IDs.

    Args:
        variation_ids: Iterable of ClinVar Variation IDs (strings or integers).

    Returns:
        A dict mapping each returned UID to the list of ``trait_name`` values
        of its trait set (entries are ``None`` where a trait has no name).
        An empty dict is returned for empty input without contacting the
        server.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    id_param = ",".join(str(v) for v in variation_ids)
    if not id_param:
        # An empty "id" parameter is not a valid ESummary request.
        return {}
    response = requests.post(
        f"{BASE}/esummary.fcgi",
        data={
            "db": "clinvar",
            "id": id_param,
            "retmode": "json",
            "email": EMAIL,
        },
        timeout=30,
    )
    response.raise_for_status()
    result = response.json()["result"]

    conditions = {}
    for vid in result.get("uids", []):
        rec = result[vid]
        trait_set = rec.get("trait_set")
        if trait_set is None:
            # NOTE(review): newer ESummary payloads appear to nest the trait
            # set under "germline_classification" — confirm against a live
            # response.
            trait_set = (rec.get("germline_classification") or {}).get("trait_set", [])
        conditions[vid] = [t.get("trait_name") for t in trait_set]
    return conditions
sample_ids = ["12375", "17684", "54270"]
cond_map = get_conditions(sample_ids)
for vid, conds in cond_map.items():
    # get_conditions stores None for traits without a name; str.join would
    # raise TypeError on those, so skip them.
    print(f"Variation {vid}: {', '.join(c for c in conds if c)}")
ClinVar assigns its own stable Variation ID (integer) to each interpreted variant record. This differs from dbSNP rsIDs. A single rsID can correspond to multiple ClinVar Variation IDs if different alleles or interpretations are submitted separately.
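ClinVar's "review status" encodes the level of evidence as 0–4 stars: 0 = no assertion criteria provided; 1 = criteria provided, single submitter; 2 = criteria provided, multiple submitters, no conflicts; 3 = reviewed by expert panel; 4 = practice guideline.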
Goal: Retrieve all high-confidence pathogenic variants in a gene and export to CSV.
import requests, json, time, pandas as pd
EMAIL = "your@email.com"
BASE = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils"
def search_gene_pathogenic(gene, clinsig="pathogenic"):
    """Return ClinVar Variation IDs for a gene with a given clinical significance.

    Args:
        gene: Gene symbol used with the ``[gene]`` field tag.
        clinsig: Value used with the ``[clinsig]`` field tag.

    Returns:
        The ``idlist`` of the ESearch response. At most 500 IDs are returned
        (``retmax``), so genes with more matches are truncated.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    query = f"{gene}[gene] AND {clinsig}[clinsig]"
    response = requests.get(
        f"{BASE}/esearch.fcgi",
        params={
            "db": "clinvar",
            "term": query,
            "retmax": 500,
            "retmode": "json",
            "email": EMAIL,
        },
        timeout=30,
    )
    # Surface HTTP errors (e.g. rate limiting) instead of a JSON decode failure.
    response.raise_for_status()
    return response.json()["esearchresult"]["idlist"]
def fetch_summaries(ids):
    """Fetch ESummary records in batches and flatten them into row dicts.

    Args:
        ids: List of ClinVar Variation IDs (strings).

    Returns:
        A list of dicts with the keys ``variation_id``, ``name``, ``clinsig``,
        ``review_status``, ``gene`` and ``conditions``; empty for empty input.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    batch_size = 100
    # 0.35 s between calls stays under the 3 requests/second allowed
    # without an API key.
    request_delay = 0.35

    records = []
    for start in range(0, len(ids), batch_size):
        batch = ids[start:start + batch_size]
        response = requests.post(
            f"{BASE}/esummary.fcgi",
            data={
                "db": "clinvar",
                "id": ",".join(batch),
                "retmode": "json",
                "email": EMAIL,
            },
            timeout=30,
        )
        response.raise_for_status()
        result = response.json()["result"]
        for vid in result.get("uids", []):
            rec = result[vid]
            # NOTE(review): newer ESummary payloads appear to use
            # "germline_classification" (with the trait set nested inside);
            # fall back to the legacy keys. Confirm against a live response.
            clinsig = rec.get("germline_classification") or rec.get("clinical_significance") or {}
            traits = rec.get("trait_set") or clinsig.get("trait_set") or []
            records.append({
                "variation_id": vid,
                "name": rec.get("title"),
                "clinsig": clinsig.get("description"),
                "review_status": clinsig.get("review_status"),
                # "or ''" guards against explicit null values, which would
                # make str.join raise TypeError.
                "gene": ",".join(g.get("symbol") or "" for g in rec.get("genes", [])),
                "conditions": "; ".join(t.get("trait_name") or "" for t in traits),
            })
        time.sleep(request_delay)
    return records
gene = "BRCA1"

# Step 1: collect the variation IDs
ids = search_gene_pathogenic(gene)
print(f"Found {len(ids)} pathogenic variants in {gene}")

# Step 2: summaries -> DataFrame -> CSV on disk
records = fetch_summaries(ids)
df = pd.DataFrame(records)
df.to_csv(f"{gene}_pathogenic_variants.csv", index=False)
print(f"Saved {len(df)} records → {gene}_pathogenic_variants.csv")

# Step 3: quick preview of the key columns
preview_columns = ["name", "clinsig", "review_status"]
print(df[preview_columns].head())
Goal: Check ClinVar status for a list of user-provided rsIDs or HGVS notations.
import requests, time, pandas as pd
EMAIL = "your@email.com"
BASE = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils"
# 0.35 s between calls stays under the 3 requests/second allowed without
# an API key.
REQUEST_DELAY = 0.35

variants = ["rs80357906", "rs80357220", "rs28897672"]
results = []
for rsid in variants:
    # Step 1: map the rsID to ClinVar Variation IDs
    r = requests.get(
        f"{BASE}/esearch.fcgi",
        params={"db": "clinvar", "term": f"{rsid}[rs]",
                "retmax": 5, "retmode": "json", "email": EMAIL},
        timeout=30,
    )
    r.raise_for_status()
    time.sleep(REQUEST_DELAY)  # pause after every request, including misses
    ids = r.json()["esearchresult"]["idlist"]
    if not ids:
        # Same keys as a hit so every row has the same columns.
        results.append({"rsid": rsid, "variation_id": None,
                        "clinsig": "Not in ClinVar", "review_status": None})
        continue

    # Step 2: summarise the first matching variation only
    r2 = requests.post(
        f"{BASE}/esummary.fcgi",
        data={"db": "clinvar", "id": ids[0],
              "retmode": "json", "email": EMAIL},
        timeout=30,
    )
    r2.raise_for_status()
    time.sleep(REQUEST_DELAY)
    rec = r2.json()["result"][ids[0]]
    # NOTE(review): newer ESummary payloads appear to use
    # "germline_classification"; fall back to the legacy key. Confirm against
    # a live response.
    clinsig = rec.get("germline_classification") or rec.get("clinical_significance") or {}
    results.append({
        "rsid": rsid,
        "variation_id": ids[0],
        "clinsig": clinsig.get("description", "Unknown"),
        "review_status": clinsig.get("review_status"),
    })

df = pd.DataFrame(results)
print(df.to_string(index=False))
| Parameter | Module | Default | Range / Options | Effect |
|---|---|---|---|---|
| retmax | ESearch | 20 | 1–10000 | Max records returned per query |
| retmode | ESearch/ESummary | "xml" | "json", "xml" | Response format |
| rettype | EFetch | "clinvarset" | "clinvarset", "vcv" | Record type for XML fetch |
| clinsig query field | ESearch | — | "pathogenic", "likely pathogenic", "VUS" | Filter by clinical significance |
| review status query field | ESearch | — | 0–4 star terms | Filter by evidence quality |
| email | All | required | valid email | NCBI policy; prevents blocking |
Always set email: NCBI requires an email in all E-utility calls for rate-limit attribution and policy compliance.
Use FTP bulk download for large queries: For more than ~1000 variants, download variant_summary.txt.gz from the ClinVar FTP rather than looping over EFetch — it's faster and avoids rate limits.
Filter by review status: Automated pipelines should filter to ≥2-star variants to reduce noise from single-submitter assertions without peer review.
Use API key for production: Register at https://www.ncbi.nlm.nih.gov/account/ to get a free API key (api_key parameter) and triple your rate limit (3 → 10 req/s).
Handle VUS separately: "Conflicting interpretations of pathogenicity" is its own ClinSig category — don't combine it with "VUS" in filters; they have different implications for clinical decision-making.
When to use: Quick lookup for a single known variant.
import requests
EMAIL = "your@email.com"
rsid = "rs80357906"
r = requests.get(
    "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi",
    params={"db": "clinvar", "term": f"{rsid}[rs]",
            "retmax": 1, "retmode": "json", "email": EMAIL},
    timeout=30,  # do not hang forever on a stalled connection
)
# Fail loudly on HTTP errors (e.g. 429 rate limiting) instead of on the
# JSON decoding below.
r.raise_for_status()
count = int(r.json()["esearchresult"]["count"])
print(f"{rsid}: {'found' if count else 'NOT'} in ClinVar ({count} records)")
When to use: Bulk analysis — load entire ClinVar into a pandas DataFrame.
import pandas as pd
url = "https://ftp.ncbi.nlm.nih.gov/pub/clinvar/tab_delimited/variant_summary.txt.gz"

# Load only the columns needed, then keep GRCh38 rows whose significance
# mentions "Pathogenic".
wanted_columns = [
    "#AlleleID",
    "Name",
    "GeneSymbol",
    "ClinicalSignificance",
    "ReviewStatus",
    "PhenotypeList",
    "Assembly",
    "RS# (dbSNP)",
]
df = pd.read_csv(url, sep="\t", compression="gzip", usecols=wanted_columns)

on_grch38 = df["Assembly"] == "GRCh38"
is_pathogenic = df["ClinicalSignificance"].str.contains("Pathogenic", na=False)
df = df[on_grch38 & is_pathogenic]

print(f"Pathogenic variants (GRCh38): {len(df)}")
df.to_csv("clinvar_pathogenic_grch38.csv", index=False)
When to use: Find all ClinVar variants associated with a specific OMIM condition.
import requests
EMAIL = "your@email.com"
omim_id = "604370"  # BRCA1-associated breast-ovarian cancer
r = requests.get(
    "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi",
    params={"db": "clinvar", "term": f"{omim_id}[MIM]",
            "retmax": 20, "retmode": "json", "email": EMAIL},
    timeout=30,  # do not hang forever on a stalled connection
)
# Fail loudly on HTTP errors (e.g. 429 rate limiting) instead of on the
# JSON decoding below.
r.raise_for_status()
result = r.json()["esearchresult"]
print(f"Variants for OMIM {omim_id}: {result['count']} total")
print(f"First IDs: {result['idlist'][:5]}")
| Problem | Cause | Solution |
|---|---|---|
| HTTP 429 or no response | Rate limit exceeded | Add time.sleep(0.35) between requests; use API key |
| Empty idlist for rsID query | rsID not indexed in ClinVar | Try HGVS notation or gene+position query instead |
| Missing clinsig in summary | Variant has no interpretation | Check review_status; "no interpretation for the single variant" means no ClinSig yet |
| XML parse error in EFetch | Incomplete response (timeout) | Set requests.get(..., timeout=30) and retry once |
| Conflicting results for same rsID | Multiple submissions with different interpretations | Group by review_status and prefer higher-star entries |
| FTP download fails | Large file / slow connection | Use pandas.read_csv with chunksize=100000 or pre-filter with grep |
- gwas-database — GWAS Catalog for population-level SNP-trait associations (complement to ClinVar's clinical assertions)
- ensembl-database — Ensembl VEP for predicting variant consequences without requiring prior clinical curation
- cosmic-database — Somatic cancer variant database (complementary to ClinVar's germline focus)
- pubmed-database — Retrieve supporting publications cited in ClinVar submissions