# Keep Learning: Python Seaborn - Categorical Plots

# Seaborn — Categorical Plots
# Categorical plot = one axis has categories (like Dept: HR, IT, Finance)
#                    other axis has numbers (like salary, score)
# Used to compare groups and see patterns across categories

import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd

# ── Sample data used throughout ───────────────────────────────────────────────
# One tuple per employee: (dept, salary, score, gender).
# Three employees in each of HR, IT and Finance.
df = pd.DataFrame(
    [
        ("HR", 45000, 60, "M"),
        ("HR", 48000, 65, "F"),
        ("HR", 52000, 70, "M"),
        ("IT", 80000, 85, "F"),
        ("IT", 85000, 88, "M"),
        ("IT", 90000, 92, "F"),
        ("Finance", 65000, 72, "M"),
        ("Finance", 70000, 75, "F"),
        ("Finance", 72000, 78, "M"),
    ],
    columns=["dept", "salary", "score", "gender"],
)

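# ══════════════════════════════════════════════════════════════════════════════
# ── 1. barplot — Compare an average value across categories ───────────────────
# Bar height = average (mean) of the y column for each category on x
# ══════════════════════════════════════════════════════════════════════════════

# ── color= — single color for all bars ───────────────────────────────────────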
# color= paints every bar with the same named color.
sns.barplot(data=df, x="dept", y="salary", color="steelblue").set_title(
    "Salary by Dept (single color)"
)
plt.show()
# Output: 3 bars — IT tallest (avg ~85k), HR shortest (avg ~48k)
# Output: all bars in same steel blue color
# ── hue= — split each bar further by another category ────────────────────────
# hue adds a color split — one bar becomes two bars (one per gender)
sns.barplot(x="dept", y="salary", data=df, hue="gender")
plt.title("Average Salary by Dept and Gender")
# seaborn already draws a legend headed with the hue column name. A bare
# plt.legend() rebuilds that legend without the heading, so pass title= to
# keep it labelled.
plt.legend(title="gender")
plt.show()          # Output: each dept has 2 bars — one for M, one for F
# ── estimator= — controls what value the bar height represents ────────────────
# By default barplot shows the MEAN (average) — estimator lets you change that
#
# ┌──────────────────────────┬──────────────────────────────────────────┐
# │ estimator=               │ Bar height shows                         │
# ├──────────────────────────┼──────────────────────────────────────────┤
# │ "mean"  (default)        │ average of values in that group          │
# │ "median"                 │ middle value of that group               │
# │ "sum"                    │ total sum of values in that group        │
# │ "min"                    │ smallest value in that group             │
# │ "max"                    │ largest value in that group              │
# └──────────────────────────┴──────────────────────────────────────────┘

# The same chart is drawn four times; only estimator= changes between calls,
# so only the bar heights differ.
sns.barplot(data=df, x="dept", y="salary", estimator="mean").set_title(
    "Bar height = Average Salary (default)"
)
plt.show()          # Output: bars show average salary per dept

sns.barplot(data=df, x="dept", y="salary", estimator="median").set_title(
    "Bar height = Median Salary"
)
plt.show()          # Output: bars show median salary — less affected by outliers

sns.barplot(data=df, x="dept", y="salary", estimator="sum").set_title(
    "Bar height = Total Salary (Sum)"
)
plt.show()          # Output: bars show total salary of all employees in each dept

sns.barplot(data=df, x="dept", y="salary", estimator="max").set_title(
    "Bar height = Highest Salary in each Dept"
)
plt.show()          # Output: bars show the highest salary value in each dept

# ══════════════════════════════════════════════════════════════════════════════
# ── 2. countplot — Count how many rows per category ───────────────────────────
# Does NOT need a y column — it counts occurrences automatically
# Bar height = number of rows in that category
# ══════════════════════════════════════════════════════════════════════════════
# Only the category column is given; the bar height is the row count.
sns.countplot(data=df, x="dept").set_title("Number of Employees per Department")
plt.show()          # Output: 3 equal bars (3 employees each dept in our data)

# ── hue= — color split within each category ──────────────────────────────────
sns.countplot(data=df, x="dept", hue="gender").set_title(
    "Employee Count by Dept and Gender"
)
plt.show()          # Output: each dept bar is split into M and F counts

# ── horizontal countplot ──────────────────────────────────────────────────────
# Putting the category on y= instead of x= turns the bars sideways.
sns.countplot(data=df, y="dept").set_title("Horizontal Count Plot")
plt.show()          # Output: same chart but bars go left to right

# ══════════════════════════════════════════════════════════════════════════════
# ── 3. boxplot — Shows spread and outliers of data ────────────────────────────
# Box = middle 50% of data (most common values)
# Line inside box = median (middle value)
# Whiskers = range of normal values
# Dots outside whiskers = outliers (unusual values)
# ══════════════════════════════════════════════════════════════════════════════
# One box per department, drawn from the salary column.
sns.boxplot(data=df, x="dept", y="salary").set_title(
    "Salary Distribution by Department"
)
plt.show()          # Output: 3 boxes — IT box is highest, shows salary spread per dept

# ── hue= — split boxes by gender ─────────────────────────────────────────────
sns.boxplot(data=df, x="dept", y="salary", hue="gender").set_title(
    "Salary Box Plot by Dept and Gender"
)
plt.show()          # Output: each dept has 2 boxes — M and F side by side

# ── palette= — set color theme ───────────────────────────────────────────────
# A palette colors the levels of a hue variable. Passing palette= with no hue=
# is deprecated in seaborn 0.13 (it raises a FutureWarning), so map hue to the
# x column and hide the legend, which would only repeat the x-axis labels.
# NOTE: legend= on boxplot needs seaborn 0.13 or newer.
sns.boxplot(x="dept", y="salary", data=df, hue="dept", palette="Set2", legend=False)
plt.title("Box Plot with Color Palette")
plt.show()          # Output: each dept box is a different color from Set2 palette

# ══════════════════════════════════════════════════════════════════════════════
# ── 4. violinplot — Like boxplot but shows full shape of data ─────────────────
# Wider part = more data values in that range
# Narrow part = fewer data values
# Combines boxplot (inside) + KDE curve (outside shape)
# ══════════════════════════════════════════════════════════════════════════════
# One violin per department, drawn from the salary column.
sns.violinplot(data=df, x="dept", y="salary").set_title(
    "Salary Violin Plot by Department"
)
plt.show()          # Output: violin shapes per dept — wider = more people at that salary

# ── hue= + split=True — compare two groups side by side in same violin ────────
sns.violinplot(
    data=df, x="dept", y="salary", hue="gender", split=True
).set_title("Split Violin — M vs F per Dept")
plt.show()          # Output: each violin is half M (left) and half F (right)

# ── inner= — what to show inside the violin ───────────────────────────────────
# Each inner= style gets its own figure. Drawing all three before a single
# plt.show() would stack them on the same axes, so the styles could not be
# told apart.
sns.violinplot(x="dept", y="salary", data=df, inner="box")    # box inside violin
plt.title("Violin Plot — inner='box'")
plt.show()          # Output: violins with a small box plot drawn inside

sns.violinplot(x="dept", y="salary", data=df, inner="point")  # dots inside violin
plt.title("Violin Plot — inner='point'")
plt.show()          # Output: violins with one dot per data value inside

sns.violinplot(x="dept", y="salary", data=df, inner="stick")  # lines inside violin
plt.title("Violin Plot — inner='stick'")
plt.show()          # Output: violins with one line per data value inside

# ══════════════════════════════════════════════════════════════════════════════
# ── 5. stripplot — Shows every individual data point ─────────────────────────
# Plots a dot for each row in the data
# Useful to see the actual data — not just averages or ranges
# ══════════════════════════════════════════════════════════════════════════════
# One dot per row of df, grouped into a column per department.
sns.stripplot(data=df, x="dept", y="salary").set_title("Every Salary as a Dot")
plt.show()          # Output: dots stacked vertically per dept — each dot = one employee

# ── jitter=True — spread dots horizontally so they don't overlap ──────────────
sns.stripplot(
    data=df, x="dept", y="salary", jitter=True, color="purple", alpha=0.7
).set_title("Strip Plot with Jitter")
plt.show()          # Output: dots spread sideways — easier to see individual values

# ── combine with boxplot — show box + raw data together ──────────────────────
# Both calls draw on the same axes, so the dots land on top of the boxes.
# Passing palette= with no hue= is deprecated in seaborn 0.13 (it raises a
# FutureWarning), so map hue to the x column and hide the redundant legend.
# NOTE: legend= on boxplot needs seaborn 0.13 or newer.
sns.boxplot(x="dept", y="salary", data=df, hue="dept", palette="pastel", legend=False)
sns.stripplot(x="dept", y="salary", data=df, color="black", size=5, jitter=True)
plt.title("Box Plot + Raw Data Points")
plt.show()          # Output: box showing range + black dots showing each actual value

# ══════════════════════════════════════════════════════════════════════════════
# ── 6. swarmplot — Like stripplot but dots never overlap ─────────────────────
# Automatically arranges dots side by side so every dot is visible
# Better than stripplot when you want to see all values clearly
# ══════════════════════════════════════════════════════════════════════════════
# One dot per row of df, placed so that no two dots cover each other.
sns.swarmplot(data=df, x="dept", y="salary").set_title(
    "Swarm Plot — No Overlapping Dots"
)
plt.show()          # Output: dots arranged like a swarm — all visible, none hidden

# ── hue= — color dots by gender ──────────────────────────────────────────────
sns.swarmplot(data=df, x="dept", y="salary", hue="gender").set_title(
    "Swarm Plot colored by Gender"
)
plt.show()          # Output: dots colored M/F, arranged so none overlap
# ══════════════════════════════════════════════════════════════════════════════
# ── 7. pointplot — Shows average with confidence interval as a line ───────────
# Like barplot but shows averages as dots connected by a line
# Useful to track how averages change across categories
# confidence interval = error bar showing how reliable the average is
# ══════════════════════════════════════════════════════════════════════════════
# One point per department at the average salary, joined by a line.
sns.pointplot(data=df, x="dept", y="salary").set_title(
    "Average Salary with Confidence Interval"
)
plt.show()          # Output: dots (averages) connected by line, vertical bars show confidence range

# ── hue= — separate line per gender ──────────────────────────────────────────
sns.pointplot(data=df, x="dept", y="salary", hue="gender").set_title(
    "Point Plot — Salary trend by Gender"
)
plt.show()          # Output: two lines (M and F) showing how salary averages compare across depts

# ══════════════════════════════════════════════════════════════════════════════
# ── When to use which plot ────────────────────────────────────────────────────
# ┌─────────────┬──────────────────────────────────────────┬─────────────────────────────┐
# │ Plot        │ Use when                                 │ Shows                       │
# ├─────────────┼──────────────────────────────────────────┼─────────────────────────────┤
# │ barplot     │ compare averages across groups           │ mean value per category     │
# │ countplot   │ count rows per group (no y needed)       │ number of rows per category │
# │ boxplot     │ see spread + outliers per group          │ median, range, outliers     │
# │ violinplot  │ see full shape of data per group         │ distribution shape + box    │
# │ stripplot   │ show every individual data point         │ one dot per row             │
# │ swarmplot   │ show every point clearly, no overlaps    │ one dot per row, spread out │
# │ pointplot   │ track average trend across categories    │ mean + confidence interval  │
# └─────────────┴──────────────────────────────────────────┴─────────────────────────────┘
# ══════════════════════════════════════════════════════════════════════════════

# ══════════════════════════════════════════════════════════════════════════════
# ── Quick Reference ───────────────────────────────────────────────────────────
# ══════════════════════════════════════════════════════════════════════════════

# sns.barplot(x=, y=, data=)          → average per category (bar height = mean)
# sns.countplot(x=, data=)            → count rows per category (no y needed)
# sns.boxplot(x=, y=, data=)          → box showing spread, median, outliers
# sns.violinplot(x=, y=, data=)       → full shape of data per category
# sns.stripplot(x=, y=, data=)        → every data point as a dot
# sns.swarmplot(x=, y=, data=)        → every dot visible, no overlaps
# sns.pointplot(x=, y=, data=)        → averages connected by a line
#
# hue="col"                           → color-split by another category column
# palette="Set2"                      → color theme (Set1, Set2, pastel, muted)
# plt.show()                          → display the plot

# sns.catplot(x=, y=, data=, kind=)   → universal categorical plot (use this)

# ══════════════════════════════════════════════════════════════════════════════
# ── 8. catplot — Universal Categorical Plot (replaces factorplot) ─────────────
# One function that can draw any categorical plot type using kind=
# Extra power: col= and row= create a grid of plots split by a category
#
# sns.factorplot() was the OLD name — renamed to catplot in seaborn 0.9
# ┌──────────────┬─────────────────────────────┐
# │ kind=        │ Same as                     │
# ├──────────────┼─────────────────────────────┤
# │ "bar"        │ barplot                     │
# │ "count"      │ countplot                   │
# │ "box"        │ boxplot                     │
# │ "violin"     │ violinplot                  │
# │ "strip"      │ stripplot                   │
# │ "swarm"      │ swarmplot                   │
# │ "point"      │ pointplot                   │
# └──────────────┴─────────────────────────────┘
# ══════════════════════════════════════════════════════════════════════════════

# ── kind= — switch between plot types ────────────────────────────────────────
# Each call draws a single-panel grid; only kind= changes the plot type.
sns.catplot(data=df, x="dept", y="salary", kind="bar").set(
    title="catplot kind=bar  (same as barplot)"
)
plt.show()          # Output: average salary bars per dept — same as sns.barplot()

sns.catplot(data=df, x="dept", y="salary", kind="box").set(
    title="catplot kind=box  (same as boxplot)"
)
plt.show()          # Output: box plots per dept — same as sns.boxplot()

sns.catplot(data=df, x="dept", y="salary", kind="violin").set(
    title="catplot kind=violin  (same as violinplot)"
)
plt.show()          # Output: violin shapes per dept — same as sns.violinplot()

# kind="count" takes no y column, just like countplot.
sns.catplot(data=df, x="dept", kind="count").set(
    title="catplot kind=count  (same as countplot)"
)
plt.show()          # Output: count of employees per dept — same as sns.countplot()

# ── hue= — color split within each category ──────────────────────────────────
sns.catplot(data=df, x="dept", y="salary", kind="bar", hue="gender").set(
    title="catplot with hue — split by gender"
)
plt.show()          # Output: each dept has 2 bars — one for M, one for F

# ── col= — create a separate plot per category automatically ──────────────────
# col= splits the data and draws one plot per unique value in that column
# This is the biggest advantage of catplot over individual plot functions
# The grid has one panel per gender, so the heading goes on the whole figure
# (suptitle) rather than on a single panel.
sns.catplot(data=df, x="dept", y="salary", kind="box", col="gender")
plt.gcf().suptitle("Separate Box Plot per Gender", y=1.02)
plt.show()          # Output: two side-by-side box plots — one for M, one for F

# ── row= — stack plots vertically by category ─────────────────────────────────
sns.catplot(data=df, x="dept", y="salary", kind="bar", row="gender")
plt.gcf().suptitle("Stacked Bar Plots per Gender", y=1.02)
plt.show()          # Output: two bar plots stacked vertically — one row per gender