Handle messy CSVs with encoding detection, delimiter inference, and malformed row recovery.
Automatically detects encoding, delimiter, and header rows for messy CSVs, then parses dates and cleans numeric columns. Use when encountering CSV parsing errors or when you need to handle files with inconsistent formatting.
/plugin marketplace add majesticlabs-dev/majestic-marketplace
/plugin install majestic-data@majestic-marketplace
Patterns for handling real-world messy CSV files.
import chardet
import pandas as pd

def detect_encoding(file_path: str, sample_size: int = 10000) -> str:
    """Detect file encoding from a sample of raw bytes."""
    with open(file_path, 'rb') as f:
        raw = f.read(sample_size)
    result = chardet.detect(raw)
    return result['encoding'] or 'utf-8'  # chardet returns None for empty samples

def read_with_encoding(path: str) -> pd.DataFrame:
    """Read CSV with auto-detected encoding."""
    encoding = detect_encoding(path)
    # Common fallback chain
    encodings = [encoding, 'utf-8', 'latin-1', 'cp1252']
    for enc in encodings:
        try:
            return pd.read_csv(path, encoding=enc)
        except UnicodeDecodeError:
            continue
    # Last resort: ignore undecodable bytes instead of failing
    return pd.read_csv(path, encoding='utf-8', encoding_errors='ignore')
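A quick usage sketch (the file path here is hypothetical):

# Legacy export in an unknown encoding
df = read_with_encoding('exports/customers_2019.csv')
print(df.shape)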
import csv

def detect_delimiter(file_path: str) -> str:
    """Detect CSV delimiter from a file sample."""
    with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
        sample = f.read(4096)
    sniffer = csv.Sniffer()
    try:
        dialect = sniffer.sniff(sample, delimiters=',;\t|')
        return dialect.delimiter
    except csv.Error:
        # Fall back: count occurrences and pick the most common candidate
        counts = {d: sample.count(d) for d in [',', ';', '\t', '|']}
        return max(counts, key=counts.get)

def read_with_delimiter_detection(path: str) -> pd.DataFrame:
    """Read CSV with auto-detected delimiter."""
    delimiter = detect_delimiter(path)
    return pd.read_csv(path, sep=delimiter)
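For example, a semicolon-delimited European export (hypothetical path) reads transparently:

df = read_with_delimiter_detection('data/sales_eu.csv')
print(df.columns.tolist())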
def read_with_error_handling(path: str) -> tuple[pd.DataFrame, list[dict]]:
    """Read CSV, capturing malformed rows separately."""
    good_rows = []
    bad_rows = []
    with open(path, 'r', encoding='utf-8', errors='replace') as f:
        reader = csv.reader(f)
        header = next(reader)
        expected_cols = len(header)
        for line_num, row in enumerate(reader, start=2):
            if len(row) == expected_cols:
                good_rows.append(row)
            else:
                bad_rows.append({
                    'line': line_num,
                    'expected': expected_cols,
                    'actual': len(row),
                    'data': row,
                })
    df = pd.DataFrame(good_rows, columns=header)
    return df, bad_rows
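A short sketch of inspecting what was rejected (hypothetical path):

df, bad = read_with_error_handling('data/survey.csv')
if bad:
    first = bad[0]
    print(f"Line {first['line']}: expected {first['expected']} columns, got {first['actual']}")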
# pandas' error_bad_lines is deprecated; use on_bad_lines instead
def read_skip_bad_lines(path: str) -> pd.DataFrame:
    """Read CSV, skipping malformed rows."""
    return pd.read_csv(
        path,
        on_bad_lines='skip',  # or 'warn' to see them
        engine='python',
    )
def read_quoted_csv(path: str) -> pd.DataFrame:
    """Handle CSVs with quoted fields containing delimiters."""
    return pd.read_csv(
        path,
        quotechar='"',
        doublequote=True,  # Handle "" as an escaped quote
        escapechar='\\',   # Handle \, as an escaped comma
        engine='python',
    )
# For really messy files
def read_regex_separated(path: str, pattern: str = r',(?=(?:[^"]*"[^"]*")*[^"]*$)') -> pd.DataFrame:
    """Split on the delimiter only outside quotes using a regex lookahead."""
    import re
    rows = []
    with open(path, 'r') as f:
        for line in f:
            rows.append(re.split(pattern, line.strip()))
    return pd.DataFrame(rows[1:], columns=rows[0])
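To see why the lookahead works, here is a self-contained demo: the pattern only splits on a comma followed by an even number of quotes, so commas inside quoted fields survive:

import re
pattern = r',(?=(?:[^"]*"[^"]*")*[^"]*$)'
print(re.split(pattern, 'Acme Corp,"Portland, OR",2024'))
# ['Acme Corp', '"Portland, OR"', '2024']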
def detect_header_row(path: str, max_rows: int = 10) -> int:
    """Find the header row in files with metadata at the top."""
    with open(path, 'r') as f:
        lines = [f.readline() for _ in range(max_rows)]
    delimiter = detect_delimiter(path)
    for i, line in enumerate(lines):
        parts = line.split(delimiter)
        # The header likely has more columns than metadata rows
        # and contains text-like values
        if len(parts) > 2 and all(p.strip().replace('_', '').isalnum() for p in parts[:3]):
            return i
    return 0  # Default to the first row

def read_with_header_detection(path: str) -> pd.DataFrame:
    """Read CSV, auto-detecting the header row."""
    header_row = detect_header_row(path)
    return pd.read_csv(path, skiprows=header_row)
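For instance, given a file whose first two lines are export metadata (contents hypothetical), detect_header_row returns 2 and those lines are skipped:

# report.csv:
#   Exported by: finance-app
#   Export date: 2024-01-15
#   region,revenue,units
#   west,1200,30
df = read_with_header_detection('report.csv')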
def infer_date_columns(df: pd.DataFrame) -> list[str]:
    """Identify columns that look like dates."""
    date_cols = []
    date_patterns = [
        r'\d{4}-\d{2}-\d{2}',  # 2024-01-15
        r'\d{2}/\d{2}/\d{4}',  # 01/15/2024
        r'\d{2}-\d{2}-\d{4}',  # 15-01-2024
        r'\d{4}/\d{2}/\d{2}',  # 2024/01/15
    ]
    for col in df.columns:
        if df[col].dtype == 'object':
            sample = df[col].dropna().head(100)
            for pattern in date_patterns:
                if sample.str.match(pattern).mean() > 0.8:
                    date_cols.append(col)
                    break
    return date_cols
def parse_dates_flexibly(df: pd.DataFrame, columns: list[str]) -> pd.DataFrame:
    """Parse dates with multiple format attempts."""
    for col in columns:
        try:
            # infer_datetime_format is deprecated; pandas infers formats by default
            df[col] = pd.to_datetime(df[col])
        except (ValueError, TypeError):
            # Try common formats explicitly
            for fmt in ['%Y-%m-%d', '%m/%d/%Y', '%d/%m/%Y', '%Y/%m/%d']:
                try:
                    df[col] = pd.to_datetime(df[col], format=fmt)
                    break
                except (ValueError, TypeError):
                    continue
    return df
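A minimal end-to-end sketch using the two helpers above:

df = pd.DataFrame({'order_date': ['2024-01-15', '2024-02-01', None]})
date_cols = infer_date_columns(df)   # ['order_date']
df = parse_dates_flexibly(df, date_cols)
print(df['order_date'].dtype)        # datetime64[ns]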
def clean_numeric_column(series: pd.Series) -> pd.Series:
    """Clean a numeric column with currency symbols, commas, etc."""
    if series.dtype == 'object':
        cleaned = (series
                   .astype(str)
                   .str.replace(r'[$€£¥,]', '', regex=True)       # Remove currency symbols/commas
                   .str.replace(r'\s+', '', regex=True)           # Remove whitespace
                   .str.replace(r'\((.+)\)', r'-\1', regex=True)  # (123) -> -123
                   )
        return pd.to_numeric(cleaned, errors='coerce')
    return series
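For example:

s = pd.Series(['$1,234.56', '(500)', '  42 ', 'n/a'])
print(clean_numeric_column(s).tolist())
# [1234.56, -500.0, 42.0, nan]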
def clean_all_numeric(df: pd.DataFrame) -> pd.DataFrame:
    """Attempt to convert object columns to numeric."""
    for col in df.select_dtypes(include=['object']).columns:
        converted = pd.to_numeric(df[col], errors='coerce')
        # Keep the conversion only if >80% of values succeed
        if converted.notna().mean() > 0.8:
            df[col] = converted
    return df
def wrangle_csv(path: str) -> tuple[pd.DataFrame, dict]:
    """Full pipeline for messy CSV handling."""
    report = {'issues': [], 'fixes': []}

    # Step 1: Detect encoding
    encoding = detect_encoding(path)
    report['encoding'] = encoding

    # Step 2: Detect delimiter
    delimiter = detect_delimiter(path)
    report['delimiter'] = delimiter

    # Step 3: Detect header row
    header_row = detect_header_row(path)
    if header_row > 0:
        report['fixes'].append(f"Skipped {header_row} metadata rows")

    # Step 4: Read with error handling
    try:
        df = pd.read_csv(
            path,
            encoding=encoding,
            sep=delimiter,
            skiprows=header_row,
            on_bad_lines='warn',
            engine='python',
        )
    except Exception as e:
        report['issues'].append(f"Read error: {e}")
        df = pd.read_csv(path, encoding='latin-1', on_bad_lines='skip')

    # Step 5: Clean column names
    df.columns = (df.columns
                  .str.strip()
                  .str.lower()
                  .str.replace(r'\s+', '_', regex=True)
                  .str.replace(r'[^\w]', '', regex=True)
                  )
    report['fixes'].append("Normalized column names")

    # Step 6: Parse dates
    date_cols = infer_date_columns(df)
    if date_cols:
        df = parse_dates_flexibly(df, date_cols)
        report['fixes'].append(f"Parsed dates: {date_cols}")

    # Step 7: Clean numeric columns
    df = clean_all_numeric(df)

    report['shape'] = df.shape
    report['dtypes'] = df.dtypes.to_dict()
    return df, report
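Typical usage (the input path is hypothetical):

df, report = wrangle_csv('raw/vendor_feed.csv')
print(report['encoding'], repr(report['delimiter']))
for fix in report['fixes']:
    print('-', fix)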
def read_excel_smart(path: str, sheet: str | int = 0) -> pd.DataFrame:
    """Read Excel with common cleanup."""
    df = pd.read_excel(
        path,
        sheet_name=sheet,
        engine='openpyxl',  # For .xlsx
        # engine='xlrd',    # For .xls
    )
    # Drop fully empty rows/columns
    df = df.dropna(how='all').dropna(axis=1, how='all')
    # Reset the index after dropping
    df = df.reset_index(drop=True)
    return df
def list_excel_sheets(path: str) -> list[str]:
    """List all sheets in an Excel file."""
    xl = pd.ExcelFile(path)
    return xl.sheet_names
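A sketch of iterating every sheet in a workbook (file name hypothetical):

for sheet in list_excel_sheets('workbook.xlsx'):
    df = read_excel_smart('workbook.xlsx', sheet=sheet)
    print(sheet, df.shape)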