mistral-data-handling | mistral-pack

Stats

Actions

Tags

mistral-data-handling | mistral-pack

Mistral Data Handling

Overview

Manage data flows through Mistral AI APIs with PII redaction, audit logging, fine-tuning dataset sanitization, and conversation retention policies. Mistral's data policy: API requests on La Plateforme are not used for training by default. Self-deployed models give full data sovereignty.

Prerequisites

Mistral API key configured
Understanding of data classification (PII, PHI, PCI)
Logging infrastructure for audit trails

Instructions

Step 1: PII Redaction Before API Calls

interface RedactionRule {
  pattern: RegExp;
  replacement: string;
  type: string;
}

const PII_RULES: RedactionRule[] = [
  { pattern: /\b[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,}\b/gi, replacement: '[EMAIL]', type: 'email' },
  { pattern: /\b\d{3}[-.]?\d{3}[-.]?\d{4}\b/g, replacement: '[PHONE]', type: 'phone' },
  { pattern: /\b\d{3}-\d{2}-\d{4}\b/g, replacement: '[SSN]', type: 'ssn' },
  { pattern: /\b\d{4}[\s-]?\d{4}[\s-]?\d{4}[\s-]?\d{4}\b/g, replacement: '[CARD]', type: 'credit_card' },
  { pattern: /\b\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}\b/g, replacement: '[IP]', type: 'ip_address' },
];

function redactPII(text: string): { cleaned: string; redactions: string[] } {
  const redactions: string[] = [];
  let cleaned = text;

  for (const rule of PII_RULES) {
    const matches = cleaned.match(rule.pattern);
    if (matches) {
      redactions.push(...matches.map(m => `${rule.type}: ${m.slice(0, 4)}***`));
      cleaned = cleaned.replace(rule.pattern, rule.replacement);
    }
  }
  return { cleaned, redactions };
}

Step 2: Safe Mistral API Wrapper

import { Mistral } from '@mistralai/mistralai';

const client = new Mistral({ apiKey: process.env.MISTRAL_API_KEY });

async function safeChatCompletion(
  messages: Array<{ role: string; content: string }>,
  options: { redactPII?: boolean; model?: string; auditLog?: boolean } = {},
) {
  const processed = messages.map(msg => {
    if (options.redactPII !== false) {
      const { cleaned, redactions } = redactPII(msg.content);
      if (redactions.length > 0 && options.auditLog) {
        console.warn(`Redacted ${redactions.length} PII items from ${msg.role} message`);
      }
      return { ...msg, content: cleaned };
    }
    return msg;
  });

  const response = await client.chat.complete({
    model: options.model ?? 'mistral-small-latest',
    messages: processed,
  });

  // Optionally redact PII in output too
  const output = response.choices?.[0]?.message?.content ?? '';
  if (options.redactPII !== false) {
    const { cleaned } = redactPII(output);
    if (response.choices?.[0]?.message) {
      response.choices[0].message.content = cleaned;
    }
  }

  return response;
}

Step 3: Fine-Tuning Dataset Sanitization

Mistral fine-tuning requires JSONL files. Sanitize before uploading:

import { createReadStream, createWriteStream } from 'fs';
import { createInterface } from 'readline';

async function sanitizeTrainingData(inputPath: string, outputPath: string) {
  const rl = createInterface({ input: createReadStream(inputPath) });
  const out = createWriteStream(outputPath);
  let lines = 0, redacted = 0;

  for await (const line of rl) {
    const record = JSON.parse(line);
    const sanitized = record.messages.map((msg: any) => {
      const { cleaned, redactions } = redactPII(msg.content);
      if (redactions.length > 0) redacted++;
      return { ...msg, content: cleaned };
    });

    out.write(JSON.stringify({ messages: sanitized }) + '\n');
    lines++;
  }

  out.end();
  console.log(`Processed ${lines} training examples, redacted PII in ${redacted}`);
  return { lines, redacted };
}

Step 4: Conversation History with TTL

class ConversationStore {
  private store = new Map<string, { messages: any[]; createdAt: number }>();
  private maxAgeMins: number;
  private maxMessages: number;

  constructor(maxAgeMins = 60, maxMessages = 100) {
    this.maxAgeMins = maxAgeMins;
    this.maxMessages = maxMessages;
  }

  get(sessionId: string): any[] {
    const entry = this.store.get(sessionId);
    if (!entry) return [];

    // Auto-expire
    if (Date.now() - entry.createdAt > this.maxAgeMins * 60_000) {
      this.store.delete(sessionId);
      return [];
    }

    return entry.messages;
  }

  append(sessionId: string, message: any): void {
    const entry = this.store.get(sessionId) ?? { messages: [], createdAt: Date.now() };
    entry.messages.push(message);

    // Cap message count
    if (entry.messages.length > this.maxMessages) {
      const system = entry.messages[0]?.role === 'system' ? [entry.messages[0]] : [];
      entry.messages = [...system, ...entry.messages.slice(-this.maxMessages)];
    }

    this.store.set(sessionId, entry);
  }

  destroy(sessionId: string): void {
    this.store.delete(sessionId);
  }

  // GDPR right-to-erasure
  eraseUser(userId: string): number {
    let count = 0;
    for (const [key] of this.store) {
      if (key.startsWith(userId)) {
        this.store.delete(key);
        count++;
      }
    }
    return count;
  }
}

Step 5: Audit Logging

interface AuditEntry {
  timestamp: string;
  sessionId: string;
  model: string;
  inputChars: number;
  outputChars: number;
  piiRedacted: number;
  tokensUsed: { prompt: number; completion: number };
}

function logAudit(entry: AuditEntry): void {
  // Log metadata only — never log actual message content
  console.log(JSON.stringify({
    ...entry,
    // Intentionally exclude message content for compliance
  }));
}

Error Handling

Issue	Cause	Solution
PII leak to API	Regex missed pattern	Add domain-specific rules (e.g., patient IDs)
Fine-tune rejected	Unsanitized data in JSONL	Run sanitization before `client.files.upload()`
Conversation too long	No retention policy	Set max age and message count limits
GDPR request	Right to erasure	Implement `eraseUser()` across all stores

Examples

Safe Embedding Generation

async function safeEmbed(texts: string[]) {
  const cleaned = texts.map(t => redactPII(t).cleaned);
  return client.embeddings.create({
    model: 'mistral-embed',
    inputs: cleaned,
  });
}

Batch API with PII Redaction

import json

def sanitize_batch_file(input_path: str, output_path: str):
    """Sanitize a Mistral batch JSONL file before submission."""
    with open(input_path) as f_in, open(output_path, "w") as f_out:
        for line in f_in:
            record = json.loads(line)
            for msg in record["body"]["messages"]:
                msg["content"] = redact_pii(msg["content"])
            f_out.write(json.dumps(record) + "\n")

Resources

Output

PII redaction layer for all API calls
Safe chat wrapper with audit logging
Fine-tuning dataset sanitization pipeline
Conversation store with TTL and GDPR erasure