**Description**: Perform systematic data analysis using Python, pandas, and visualization libraries
Use this skill when you need to clean datasets, create visualizations, run statistical analysis, or generate data reports from CSV/Excel files.
/plugin marketplace add samuelgarrett/claude-code-plugin-test
/plugin install data-science-skills@claude-skills-marketplace
import pandas as pd
import numpy as np
# Load data
df = pd.read_csv('data.csv')
# df = pd.read_excel('data.xlsx')
# df = pd.read_sql(query, connection)
# df = pd.read_json('data.json')
# Initial inspection
print(df.shape) # (rows, columns)
df.info() # Data types, null counts (prints directly; no need to wrap in print)
print(df.describe()) # Statistical summary
print(df.head()) # First 5 rows
print(df.columns.tolist()) # Column names
print(df.dtypes) # Data types
# Check for missing data
print(df.isnull().sum())
print(df.duplicated().sum())
# Handle missing values (alternative strategies; pick the one that fits, not all at once)
df = df.dropna() # Drop rows with any null
df = df.dropna(subset=['important_col']) # Drop if specific column null
df['col'] = df['col'].fillna(0) # Fill with value
df['col'] = df['col'].fillna(df['col'].mean()) # Fill with mean
# Remove duplicates
df = df.drop_duplicates()
df = df.drop_duplicates(subset=['id'], keep='first')
# Fix data types
df['date'] = pd.to_datetime(df['date'])
df['price'] = df['price'].astype(float)
df['category'] = df['category'].astype('category')
# Handle outliers
from scipy import stats
df = df[(np.abs(stats.zscore(df['value'])) < 3)] # Remove outliers
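# Robust alternative (not in the original snippet): filter by the IQR rule,
# which is less sensitive to extreme values than the z-score cutoff.
q1, q3 = df['value'].quantile([0.25, 0.75])
iqr = q3 - q1
df = df[df['value'].between(q1 - 1.5 * iqr, q3 + 1.5 * iqr)]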
# Rename columns
df = df.rename(columns={'old_name': 'new_name'})
df.columns = df.columns.str.lower().str.replace(' ', '_')
# Create new columns
df['total'] = df['quantity'] * df['price']
df['month'] = df['date'].dt.month
df['year'] = df['date'].dt.year
# Apply functions
df['normalized'] = df['value'] / df['value'].max()
df['category_encoded'] = df['category'].map({'A': 1, 'B': 2, 'C': 3})
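# Alternative encoding (illustrative sketch): one-hot encode with get_dummies,
# which avoids the silent NaNs that .map() produces for unmapped categories.
encoded = pd.get_dummies(df, columns=['category'], prefix='cat')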
# Groupby aggregations
summary = df.groupby('category').agg({
    'sales': ['sum', 'mean', 'count'],
    'profit': 'sum'
}).round(2)
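# Optional follow-up (assumes the 'summary' frame above): flatten the MultiIndex
# columns created by the multi-function agg for easier export and plotting.
summary.columns = ['_'.join(col).strip('_') for col in summary.columns]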
# Pivot tables
pivot = df.pivot_table(
    values='sales',
    index='category',
    columns='region',
    aggfunc='sum',
    fill_value=0
)
# Merge datasets
merged = pd.merge(df1, df2, on='id', how='left')
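# Optional sanity checks (illustrative; assumes 'id' should be unique in both frames):
# validate key cardinality and flag unmatched rows with an indicator column.
merged = pd.merge(df1, df2, on='id', how='left', validate='one_to_one', indicator=True)
print(merged['_merge'].value_counts())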
# Distribution analysis
print(df['value'].value_counts())
print(df['value'].value_counts(normalize=True)) # Relative frequencies (0-1), not percentages
# Correlation
correlation = df[['col1', 'col2', 'col3']].corr()
# Cross-tabulation
print(pd.crosstab(df['category'], df['region'], normalize='index'))
import matplotlib.pyplot as plt
import seaborn as sns
# Set style
sns.set_style('whitegrid')
plt.rcParams['figure.figsize'] = (12, 6)
# Histogram
plt.hist(df['value'], bins=30, edgecolor='black')
plt.xlabel('Value')
plt.ylabel('Frequency')
plt.title('Distribution of Values')
plt.show()
# Boxplot
sns.boxplot(data=df, x='category', y='value')
plt.title('Value Distribution by Category')
plt.show()
# Time series
df.set_index('date')['sales'].plot()
plt.title('Sales Over Time')
plt.show()
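# Optional (not in the original): aggregate noisy daily data to monthly totals first.
# 'M' is month-end frequency; pandas >= 2.2 prefers the alias 'ME'.
df.set_index('date')['sales'].resample('M').sum().plot()
plt.title('Monthly Sales Totals')
plt.show()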
# Heatmap
sns.heatmap(correlation, annot=True, cmap='coolwarm', center=0)
plt.title('Correlation Matrix')
plt.show()
# Scatter plot
sns.scatterplot(data=df, x='x_value', y='y_value', hue='category')
plt.title('X vs Y by Category')
plt.show()
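# For written reports (illustrative sketch): save figures to files instead of,
# or in addition to, showing them interactively. The filename is an example.
sns.scatterplot(data=df, x='x_value', y='y_value', hue='category')
plt.title('X vs Y by Category')
plt.tight_layout()
plt.savefig('x_vs_y_by_category.png', dpi=150)
plt.close()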
from scipy import stats
# T-test
group1 = df[df['category'] == 'A']['value']
group2 = df[df['category'] == 'B']['value']
t_stat, p_value = stats.ttest_ind(group1, group2)
print(f"T-statistic: {t_stat:.4f}, P-value: {p_value:.4f}")
# Chi-square test
contingency_table = pd.crosstab(df['category'], df['outcome'])
chi2, p_value, dof, expected = stats.chi2_contingency(contingency_table)
print(f"Chi-square: {chi2:.4f}, P-value: {p_value:.4f}")
# Linear regression
from sklearn.linear_model import LinearRegression
X = df[['feature1', 'feature2']]
y = df['target']
model = LinearRegression()
model.fit(X, y)
print(f"R² Score: {model.score(X, y):.4f}")
print(f"Coefficients: {model.coef_}")
# Set random seed
np.random.seed(42)
# Save processed data
df.to_csv('cleaned_data.csv', index=False)
# Document transformations
# Use Jupyter notebooks with markdown cells
def validate_data(df):
    """Validate data quality"""
    checks = {
        'null_values': df.isnull().sum().sum() == 0,
        'duplicates': df.duplicated().sum() == 0,
        'date_range_valid': df['date'].min() > pd.Timestamp('2000-01-01'),
        'no_negative_prices': (df['price'] >= 0).all()
    }
    return all(checks.values()), checks
is_valid, results = validate_data(df)
print(f"Data valid: {is_valid}")
print(results)
# Use vectorized operations
df['result'] = df['a'] * df['b'] # ✅ Fast
# Avoid loops
for i in range(len(df)): # ❌ Slow
    df.loc[i, 'result'] = df.loc[i, 'a'] * df.loc[i, 'b']
# Use appropriate data types
df['category'] = df['category'].astype('category') # Saves memory
# Chunk large files
for chunk in pd.read_csv('large_file.csv', chunksize=10000):
    process(chunk) # Placeholder for your per-chunk logic; see the concrete pattern below
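# Concrete chunked-aggregation pattern (assumed columns 'category' and 'sales'):
# accumulate per-chunk results instead of loading the whole file into memory.
totals = []
for chunk in pd.read_csv('large_file.csv', chunksize=10000):
    totals.append(chunk.groupby('category')['sales'].sum())
category_totals = pd.concat(totals).groupby(level=0).sum()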
# 1. LOAD DATA
df = pd.read_csv('data.csv')
# 2. INITIAL INSPECTION
print(f"Shape: {df.shape}")
print(f"Columns: {df.columns.tolist()}")
print(f"Nulls: \n{df.isnull().sum()}")
# 3. CLEAN DATA
df = df.drop_duplicates()
df = df.dropna(subset=['critical_column'])
df['date'] = pd.to_datetime(df['date'])
# 4. TRANSFORM
df['month'] = df['date'].dt.month
df['total'] = df['quantity'] * df['price']
# 5. EXPLORE
print(df['category'].value_counts())
print(df[['sales', 'profit']].describe())
# 6. VISUALIZE
df.groupby('month')['sales'].sum().plot(kind='bar')
plt.title('Monthly Sales')
plt.show()
# 7. ANALYZE
monthly_avg = df.groupby('month')['sales'].mean()
print(f"Average monthly sales: ${monthly_avg.mean():.2f}")
# 8. EXPORT RESULTS
results = df.groupby('category').agg({
    'sales': 'sum',
    'profit': 'sum'
})
results.to_csv('results.csv')
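# Optional (illustrative; requires openpyxl or xlsxwriter): bundle several result
# tables into one Excel report. Sheet names are examples.
with pd.ExcelWriter('analysis_report.xlsx') as writer:
    results.to_excel(writer, sheet_name='by_category')
    monthly_avg.to_excel(writer, sheet_name='monthly_avg')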
Remember: Good data analysis is methodical - inspect, clean, explore, analyze, visualize, report!