From sciagent-skills
Queries bioRxiv/medRxiv preprints via REST API by DOI, category, or date range. Retrieves metadata (title, abstract, authors, DOI) and PDFs for tracking unpublished biomedical research.
npx claudepluginhub jaechang-hits/sciagent-skills --plugin sciagent-skills

This skill uses the workspace's default tool permissions.
bioRxiv (biology) and medRxiv (health sciences) are free preprint servers hosting 200,000+ and 50,000+ manuscripts, respectively, before or alongside peer review. The unified REST API provides programmatic access to preprint metadata (title, abstract, authors, category, DOI, version history) without authentication. Preprints are available as PDF and can be retrieved by DOI, date range, or category.
Searches bioRxiv preprints by keywords, authors, dates, categories; retrieves JSON metadata, DOIs, citations; downloads PDFs for life sciences literature reviews.
Searches bioRxiv preprints by keywords, authors, date ranges, categories; retrieves JSON metadata with titles, abstracts, DOIs, citations; downloads PDFs for life sciences literature reviews.
Provides programmatic access to PubMed via NCBI E-utilities REST API. Supports Boolean/MeSH queries, field searches, endpoints like ESearch/EFetch, batch processing, and systematic reviews for biomedical literature pipelines.
Share bugs, ideas, or general feedback.
bioRxiv (biology) and medRxiv (health sciences) are free preprint servers hosting 200,000+ and 50,000+ manuscripts, respectively, before or alongside peer review. The unified REST API provides programmatic access to preprint metadata (title, abstract, authors, category, DOI, version history) without authentication. Preprints are available as PDF and can be retrieved by DOI, date range, or category.
Related: pubmed-database; for all disciplines use openalex-database. Dependencies: requests, pandas (`pip install requests pandas`).
import requests

BASE = "https://api.biorxiv.org"

# Retrieve recent preprints, then filter by category client-side.
# NOTE: the details endpoint is path-only; it ignores query-string params
# such as "category", so filtering must happen on the returned collection.
r = requests.get(f"{BASE}/details/biorxiv/2024-01-01/2024-01-07/0")
r.raise_for_status()
data = r.json()
print(f"Total preprints: {data['messages'][0]['total']}")

# Keep only bioinformatics preprints (case-insensitive substring match).
bioinformatics = [a for a in data["collection"]
                  if "bioinformatics" in a.get("category", "").lower()]
for article in bioinformatics[:3]:
    print(f"\n{article['title'][:80]}")
    print(f"  Authors : {article['authors'][:60]}")
    print(f"  DOI     : {article['doi']}")
    print(f"  Category: {article['category']}")
Retrieve all preprints posted within a date range, optionally filtered by category.
import requests, pandas as pd

BASE = "https://api.biorxiv.org"


def get_preprints(server, date_from, date_to, cursor=0, category=None):
    """Fetch one page (100 records) of preprint metadata.

    server: 'biorxiv' or 'medrxiv'
    date_from, date_to: 'YYYY-MM-DD' strings
    cursor: page offset (increments of 100)
    category: optional category name; when given, the returned payload's
        "collection" is filtered client-side by case-insensitive substring
        match (the API itself has no category parameter).
    """
    url = f"{BASE}/details/{server}/{date_from}/{date_to}/{cursor}"
    r = requests.get(url)
    r.raise_for_status()
    data = r.json()
    if category:
        # Previously this parameter was accepted but silently ignored.
        needle = category.lower()
        data["collection"] = [a for a in data.get("collection", [])
                              if needle in a.get("category", "").lower()]
    return data
# One page of bioRxiv preprints for the first days of January 2024.
data = get_preprints("biorxiv", "2024-01-01", "2024-01-03")
total = data["messages"][0]["total"]
print(f"bioRxiv preprints Jan 1-3, 2024: {total}")

# Tabulate the first ten records with a comprehension instead of append().
df = pd.DataFrame(
    {
        "doi": article["doi"],
        "title": article["title"],
        "authors": article["authors"][:80],
        "category": article["category"],
        "date": article["date"],
        "version": article["version"],
    }
    for article in data["collection"][:10]
)
print(df[["title", "category", "date"]].head())
# Paginate through all results for a date range.
def get_all_preprints(server, date_from, date_to, max_results=500):
    """Collect up to max_results preprints by walking 100-record pages."""
    collected = []
    offset = 0
    while len(collected) < max_results:
        page = get_preprints(server, date_from, date_to, offset)
        batch = page["collection"]
        if not batch:
            break  # empty page: nothing left to fetch
        collected.extend(batch)
        offset += 100
        if offset >= page["messages"][0]["total"]:
            break  # walked past the reported total
    return collected[:max_results]


articles = get_all_preprints("biorxiv", "2024-01-01", "2024-01-07")
print(f"Retrieved {len(articles)} preprints from first week of 2024")
Retrieve full metadata and version history for a specific preprint by DOI.
import requests

BASE = "https://api.biorxiv.org"


def get_by_doi(server, doi):
    """Return the details payload for one preprint DOI (all versions).

    Raises requests.HTTPError on a non-2xx response.
    """
    r = requests.get(f"{BASE}/details/{server}/{doi}")
    r.raise_for_status()
    return r.json()


# Generic example using the bioRxiv DOI pattern.
# Fix: the original defined get_by_doi but never called it, duplicating
# the request inline instead.
doi = "10.1101/2021.01.01.425318"  # Replace with real DOI
try:
    data = get_by_doi("biorxiv", doi)
except requests.HTTPError as exc:
    print(f"Lookup failed for {doi}: {exc}")
else:
    articles = data.get("collection", [])
    if articles:
        art = articles[-1]  # Latest version (collection is oldest-first)
        print(f"Title   : {art['title']}")
        print(f"Authors : {art['authors'][:100]}")
        print(f"Category: {art['category']}")
        print(f"Date    : {art['date']}")
        print(f"Version : {art['version']}")
        print(f"DOI     : {art['doi']}")
        print(f"Abstract (first 300): {art['abstract'][:300]}")
Check if a preprint has been published in a peer-reviewed journal.
import requests

BASE = "https://api.biorxiv.org"


def check_published(server, doi):
    """Check if a preprint DOI has a corresponding published article."""
    resp = requests.get(f"{BASE}/publisher/{server}/{doi}")
    resp.raise_for_status()
    return resp.json().get("collection", [])


# Check one known preprint.
doi = "10.1101/2021.01.01.425318"
matches = check_published("biorxiv", doi)
if matches:
    first = matches[0]
    print(f"Published in: {first.get('published_journal')}")
    print(f"Published DOI: {first.get('published_doi')}")
else:
    print(f"Preprint {doi} has not been published yet (or not tracked)")
Monitor preprints by specific research category.
import requests, pandas as pd
from datetime import date, timedelta

BASE = "https://api.biorxiv.org"

# bioRxiv categories include: bioinformatics, genomics, neuroscience,
# immunology, cell-biology, biochemistry, microbiology, etc.


def weekly_category_digest(category, days_back=7):
    """Get preprints from last N days for a specific category.

    Pages through the details endpoint and keeps articles whose category
    contains `category` (case-insensitive). Returns a DataFrame with
    doi/title/authors/date columns, or an empty DataFrame if none matched.
    """
    today = date.today()
    date_from = (today - timedelta(days=days_back)).strftime("%Y-%m-%d")
    date_to = today.strftime("%Y-%m-%d")
    matched = []
    cursor = 0
    while True:
        r = requests.get(f"{BASE}/details/biorxiv/{date_from}/{date_to}/{cursor}")
        # Fix: fail loudly on HTTP errors instead of crashing with an
        # opaque KeyError when the payload lacks "collection".
        r.raise_for_status()
        page = r.json().get("collection", [])
        matched.extend(a for a in page
                       if category.lower() in a["category"].lower())
        if len(page) < 100:  # short page means last page
            break
        cursor += 100
    if not matched:
        return pd.DataFrame()
    return pd.DataFrame(matched)[["doi", "title", "authors", "date"]]


df = weekly_category_digest("genomics", days_back=3)
print(f"Recent genomics preprints: {len(df)}")
print(df[["title", "date"]].head())
Query medRxiv for health and clinical science preprints.
import requests, pandas as pd
from collections import Counter

BASE = "https://api.biorxiv.org"

# medRxiv categories: infectious diseases, epidemiology, oncology,
# cardiology, neurology, psychiatry, public and global health, etc.
resp = requests.get(f"{BASE}/details/medrxiv/2024-01-01/2024-01-07/0")
resp.raise_for_status()
payload = resp.json()
total = payload["messages"][0]["total"]
print(f"medRxiv preprints Jan 1-7, 2024: {total}")

# Group by category.
category_counts = Counter(article["category"] for article in payload["collection"])
print("\nTop categories:")
for cat, count in category_counts.most_common(5):
    print(f"  {cat}: {count}")
Retrieve abstracts for a list of bioRxiv DOIs.
import requests, time, pandas as pd

BASE = "https://api.biorxiv.org"

dois = [
    "10.1101/2023.01.01.000001",
    "10.1101/2023.02.01.000002",
    "10.1101/2023.03.01.000003",
]

records = []
for doi in dois:
    resp = requests.get(f"{BASE}/details/biorxiv/{doi}")
    if resp.ok:
        versions = resp.json().get("collection", [])
        if versions:
            latest = versions[-1]  # Latest version
            records.append({
                "doi": doi,
                "title": latest.get("title"),
                "category": latest.get("category"),
                "date": latest.get("date"),
                "abstract": latest.get("abstract", "")[:300],
            })
    time.sleep(0.2)  # be polite to the API between DOI lookups

df = pd.DataFrame(records)
if not df.empty:
    df.to_csv("preprint_abstracts.csv", index=False)
    print(df[["doi", "title", "category"]].to_string(index=False))
else:
    print("No valid preprints found for provided DOIs")
The bioRxiv API follows the pattern: https://api.biorxiv.org/details/{server}/{interval}/{cursor}
- server: biorxiv or medrxiv
- interval: either a DOI (for single record) or date_from/date_to (for date range)
- cursor: pagination offset (0, 100, 200…)

Preprints can be updated; each update creates a new version (v1, v2, v3…). The API returns all versions chronologically; the last item in collection is always the most recent.
Goal: Automatically collect last week's preprints in target categories and export for review.
import requests, time, pandas as pd
from datetime import date, timedelta

BASE = "https://api.biorxiv.org"
TARGET_CATEGORIES = ["bioinformatics", "genomics", "systems biology"]
DAYS_BACK = 7

today = date.today()
date_from = (today - timedelta(days=DAYS_BACK)).strftime("%Y-%m-%d")
date_to = today.strftime("%Y-%m-%d")
print(f"Fetching bioRxiv preprints from {date_from} to {date_to}")

# Page through the full date range (100 records per page).
all_articles = []
cursor = 0
while True:
    r = requests.get(f"{BASE}/details/biorxiv/{date_from}/{date_to}/{cursor}")
    r.raise_for_status()
    data = r.json()
    batch = data["collection"]
    if not batch:
        break
    all_articles.extend(batch)
    total = data["messages"][0]["total"]
    cursor += 100
    if cursor >= total:
        break
    time.sleep(0.1)  # gentle pacing between pages

# Filter by target categories (case-insensitive substring match).
filtered = [a for a in all_articles
            if any(cat in a.get("category", "").lower() for cat in TARGET_CATEGORIES)]

# Fix: pd.DataFrame([]) has no columns, so selecting them raised KeyError
# whenever no preprint matched; passing columns= keeps the schema stable.
columns = ["doi", "title", "authors", "category", "date"]
df = pd.DataFrame(filtered, columns=columns)
df = df.drop_duplicates(subset="doi")  # Remove duplicate versions

output_file = f"biorxiv_digest_{date_to}.csv"
df.to_csv(output_file, index=False)
print(f"\nSaved {len(df)} preprints across {len(TARGET_CATEGORIES)} categories → {output_file}")
print(df[["title", "category", "date"]].head(5).to_string(index=False))
Goal: For a list of preprint DOIs, check which have been published and retrieve publication details.
import requests, time, pandas as pd

BASE = "https://api.biorxiv.org"

preprint_dois = [
    "10.1101/2021.01.01.425318",
    "10.1101/2020.04.01.020370",
]

results = []
for doi in preprint_dois:
    # Get preprint metadata.
    # Fix: parse each response body once instead of calling .json() twice
    # (once in the condition, again in the body).
    r_meta = requests.get(f"{BASE}/details/biorxiv/{doi}")
    meta = {}
    if r_meta.ok:
        collection = r_meta.json().get("collection", [])
        if collection:
            art = collection[-1]  # Latest version
            meta = {"title": art["title"], "category": art["category"],
                    "preprint_date": art["date"]}
    # Check publication status.
    r_pub = requests.get(f"{BASE}/publisher/biorxiv/{doi}")
    published = {}
    if r_pub.ok:
        pub_collection = r_pub.json().get("collection", [])
        if pub_collection:
            pub = pub_collection[0]
            published = {"journal": pub.get("published_journal"),
                         "pub_doi": pub.get("published_doi")}
    results.append({"preprint_doi": doi, **meta, **published})
    time.sleep(0.25)  # rate-limit courtesy between DOIs

df = pd.DataFrame(results)
print(df.to_string(index=False))
df.to_csv("preprint_publication_status.csv", index=False)
| Parameter | Module | Default | Range / Options | Effect |
|---|---|---|---|---|
server | URL path | required | "biorxiv", "medrxiv" | Select preprint server |
date_from | URL path | required | "YYYY-MM-DD" | Start of date range |
date_to | URL path | required | "YYYY-MM-DD" | End of date range |
cursor | URL path | 0 | 0, 100, 200… | Pagination offset (100 per page) |
category | Filter | — | e.g., "bioinformatics" | Category name substring match (post-filter) |
version | — | all versions | — | API returns all versions; use [-1] for latest |
Always take the last element for latest version: The collection array is sorted oldest-to-newest version. Use collection[-1] to get the most current version of a preprint.
Post-filter by category: The API does not natively filter by category; retrieve all preprints for a date range and filter client-side using if category in article["category"].lower().
Respect server resources: Add time.sleep(0.2) between individual DOI lookups; avoid bulk hammering the API.
Cross-check with PubMed: The publisher endpoint reveals when a preprint is published; use pubmed-database to retrieve the full peer-reviewed article metadata.
Handle missing abstracts: Some preprints have empty abstract fields. Always guard with art.get("abstract", "") or "No abstract available".
When to use: Retrieve full-text PDF for a bioRxiv preprint.
import requests

doi = "10.1101/2021.01.01.425318"
# bioRxiv serves the full-text PDF at a predictable content URL.
pdf_url = f"https://www.biorxiv.org/content/{doi}.full.pdf"
resp = requests.get(pdf_url, headers={"User-Agent": "Mozilla/5.0"})
if not resp.ok:
    print(f"PDF not available: HTTP {resp.status_code}")
else:
    out_name = f"{doi.replace('/', '_')}.pdf"
    with open(out_name, "wb") as fh:
        fh.write(resp.content)
    print(f"Downloaded PDF ({len(resp.content)//1024} KB)")
When to use: Analyze the distribution of preprints across bioRxiv categories in a time window.
import requests, pandas as pd
from collections import Counter

# First page gives the total count plus the first 100 records.
r = requests.get("https://api.biorxiv.org/details/biorxiv/2024-01-01/2024-01-07/0")
r.raise_for_status()  # fix: fail loudly on HTTP errors
data = r.json()
total = data["messages"][0]["total"]

# Fetch subsequent pages. NOTE: capped at 1000 records to bound runtime;
# the original comment claimed "all pages", which the min() contradicted.
all_articles = data["collection"]
for cursor in range(100, min(total, 1000), 100):
    r2 = requests.get(f"https://api.biorxiv.org/details/biorxiv/2024-01-01/2024-01-07/{cursor}")
    r2.raise_for_status()
    all_articles.extend(r2.json()["collection"])

counts = Counter(a["category"] for a in all_articles)
df = pd.DataFrame(counts.most_common(), columns=["category", "count"])
print(df.head(10).to_string(index=False))
When to use: Quick single-preprint publication check.
import requests

doi = "10.1101/2021.01.01.425318"
resp = requests.get(f"https://api.biorxiv.org/publisher/biorxiv/{doi}")
hits = resp.json().get("collection", [])
if hits:
    hit = hits[0]
    print(f"Published: {hit['published_journal']} | DOI: {hit['published_doi']}")
else:
    print("Not published or not tracked")
| Problem | Cause | Solution |
|---|---|---|
collection is empty | DOI not found or date range has no results | Verify DOI format (starts with 10.1101/); check date range |
| Duplicate preprints in results | Multiple versions returned | Deduplicate by DOI: df.drop_duplicates(subset='doi', keep='last') |
| Missing abstract field | Some preprints don't have structured abstracts | Guard with art.get("abstract", "") or "N/A" |
total count vs retrieved mismatch | New preprints added during pagination | Accept approximate totals; preprints are added continuously |
| PDF download blocked | Anti-bot protection | Add a User-Agent header; respect robots.txt; use for research only |
| Slow pagination for large date ranges | Large number of preprints | Use narrower date windows (3-7 days) for busy periods |
pubmed-database — Peer-reviewed biomedical literature for verifying published versions of preprints
openalex-database — Broader scholarly index including bioRxiv content after indexing lag
literature-review — Guide for incorporating preprints into systematic reviews
scientific-brainstorming — Using preprint alerts as input for hypothesis generation