Manage scraped web content from FireCrawl pipelines. Covers content extraction filtering, HTML sanitization, markdown cleaning, structured data validation, and storage patterns for crawled content.
@mendable/firecrawl-js SDK

```typescript
import FirecrawlApp from '@mendable/firecrawl-js';
const firecrawl = new FirecrawlApp({
apiKey: process.env.FIRECRAWL_API_KEY!,
});
// Scrape with controlled output formats
async function scrapeClean(url: string) {
const result = await firecrawl.scrapeUrl(url, {
formats: ['markdown'], // Markdown is cleanest for LLMs
onlyMainContent: true, // Strip nav, footer, sidebar
excludeTags: ['script', 'style', 'nav', 'footer', 'iframe'],
waitFor: 2000, // wait 2 seconds (ms) for dynamic JS to render
});
return {
markdown: cleanMarkdown(result.markdown || ''),
metadata: result.metadata,
url,
scrapedAt: new Date().toISOString(),
};
}
function cleanMarkdown(md: string): string {
return md
.replace(/\n{3,}/g, '\n\n') // Collapse multiple newlines
.replace(/\[.*?\]\(javascript:.*?\)/g, '') // Remove JS links
.replace(/!\[.*?\]\(data:.*?\)/g, '') // Remove inline data URIs
.replace(/<!--[\s\S]*?-->/g, '') // Remove HTML comments
.trim();
}
```
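For pipelines that keep raw HTML rather than markdown, the HTML sanitization mentioned above can be handled before storage. A minimal sketch, assuming the sanitize-html package is available (the helper name is illustrative):

```typescript
import sanitizeHtml from 'sanitize-html';

// Assumption: raw HTML from a scrape should drop scripts, embeds, and
// javascript:/data: URLs before it is stored or converted to markdown.
function sanitizeScrapedHtml(html: string): string {
  return sanitizeHtml(html, {
    allowedTags: sanitizeHtml.defaults.allowedTags.concat(['img']),
    allowedAttributes: {
      a: ['href', 'title'],
      img: ['src', 'alt'],
    },
    allowedSchemes: ['http', 'https'], // rejects javascript: and data: URIs
  });
}
```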
```typescript
import { z } from 'zod';
const ArticleSchema = z.object({
title: z.string().min(1),
author: z.string().optional(),
publishedDate: z.string().optional(),
content: z.string().min(50),
wordCount: z.number(),
});
async function extractArticle(url: string) {
const result = await firecrawl.scrapeUrl(url, {
formats: ['extract'],
extract: {
schema: {
type: 'object',
properties: {
title: { type: 'string' },
author: { type: 'string' },
publishedDate: { type: 'string' },
content: { type: 'string' },
},
required: ['title', 'content'],
},
},
});
const extracted = result.extract;
if (!extracted) throw new Error('Extraction failed');
return ArticleSchema.parse({
...extracted,
wordCount: extracted.content?.split(/\s+/).length || 0,
});
}
```
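A hedged caller sketch for the validation step (the extractSafely helper and its logging are illustrative, not part of the SDK): catching ZodError separately lets pages that fail schema validation be skipped instead of aborting the whole batch.

```typescript
// Illustrative batch wrapper around extractArticle above.
async function extractSafely(urls: string[]) {
  const articles: Array<z.infer<typeof ArticleSchema>> = [];
  for (const url of urls) {
    try {
      articles.push(await extractArticle(url));
    } catch (err) {
      if (err instanceof z.ZodError) {
        console.warn(`Validation failed for ${url}:`, err.issues);
      } else {
        console.warn(`Extraction failed for ${url}:`, err);
      }
    }
  }
  return articles;
}
```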
```typescript
import { writeFileSync, mkdirSync } from 'fs';
import { join } from 'path';
async function crawlAndStore(
baseUrl: string,
outputDir: string,
options?: { maxPages?: number; includePaths?: string[] }
) {
mkdirSync(outputDir, { recursive: true });
const crawlResult = await firecrawl.crawlUrl(baseUrl, {
limit: options?.maxPages || 50,
includePaths: options?.includePaths,
scrapeOptions: {
formats: ['markdown'],
onlyMainContent: true,
},
});
const manifest: Array<{ url: string; path: string; size: number }> = [];
for (const page of crawlResult.data || []) {
const slug = new URL(page.metadata?.sourceURL || baseUrl)
.pathname.replace(/\//g, '_').replace(/^_|_$/g, '') || 'index';
const filename = `${slug}.md`;
const filePath = join(outputDir, filename);
const content = cleanMarkdown(page.markdown || '');
writeFileSync(filePath, content);
manifest.push({
url: page.metadata?.sourceURL || baseUrl,
path: filename,
size: content.length,
});
}
writeFileSync(join(outputDir, 'manifest.json'), JSON.stringify(manifest, null, 2));
return manifest;
}
```
```typescript
import { createHash } from 'crypto';
function contentHash(text: string): string {
return createHash('sha256').update(text.trim().toLowerCase()).digest('hex');
}
function deduplicatePages(pages: Array<{ url: string; content: string }>) {
const seen = new Map<string, string>(); // hash -> url
const unique: typeof pages = [];
const duplicates: Array<{ url: string; duplicateOf: string }> = [];
for (const page of pages) {
const hash = contentHash(page.content);
if (seen.has(hash)) {
duplicates.push({ url: page.url, duplicateOf: seen.get(hash)! });
} else {
seen.set(hash, page.url);
unique.push(page);
}
}
return { unique, duplicates };
}
```
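A hedged wiring sketch connecting deduplication to the crawl output above (the ./scraped-docs path and manifest shape mirror crawlAndStore; reading the manifest back this way is an assumption, not a required pattern):

```typescript
import { readFileSync } from 'fs';
import { join } from 'path';

// Assumption: manifest.json was written by crawlAndStore into ./scraped-docs.
const outputDir = './scraped-docs';
const manifest: Array<{ url: string; path: string }> = JSON.parse(
  readFileSync(join(outputDir, 'manifest.json'), 'utf-8')
);
const pages = manifest.map((entry) => ({
  url: entry.url,
  content: readFileSync(join(outputDir, entry.path), 'utf-8'),
}));
const { unique, duplicates } = deduplicatePages(pages);
console.log(`${unique.length} unique pages, ${duplicates.length} duplicates dropped`);
```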
| Issue | Cause | Solution |
|---|---|---|
| Empty content | Dynamic JS not loaded | Increase waitFor timeout |
| Garbage in markdown | Bad HTML cleanup | Use onlyMainContent and excludeTags |
| Duplicate pages | URL aliases or redirects | Hash content for deduplication |
| Large file sizes | Full HTML stored | Use markdown format only |
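For the empty-content case, one option is to retry with a longer waitFor on each attempt. A sketch, where the scrapeWithRetry helper and the backoff values are illustrative and the scrape options mirror scrapeClean above:

```typescript
// Illustrative retry loop for pages whose dynamic JS renders slowly.
async function scrapeWithRetry(url: string, waits = [2000, 5000, 10000]) {
  for (const waitFor of waits) {
    const result = await firecrawl.scrapeUrl(url, {
      formats: ['markdown'],
      onlyMainContent: true,
      waitFor,
    });
    if (result.markdown && result.markdown.trim().length > 0) {
      return result;
    }
  }
  throw new Error(`No content extracted from ${url} after ${waits.length} attempts`);
}
```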
```typescript
const docs = await crawlAndStore('https://docs.example.com', './scraped-docs', {
maxPages: 100,
includePaths: ['/docs/*', '/api/*'],
});
console.log(`Scraped ${docs.length} pages`);
```