# Seaborn — Categorical Plots
# Categorical plot = one axis has categories (like Dept: HR, IT, Finance)
# other axis has numbers (like salary, score)
# Used to compare groups and see patterns across categories
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
# ── Sample data used throughout ───────────────────────────────────────────────
# One tuple per employee, in column order: (dept, salary, score, gender).
# Three employees per department, so every group has the same size.
df = pd.DataFrame(
    [
        ("HR", 45000, 60, "M"),
        ("HR", 48000, 65, "F"),
        ("HR", 52000, 70, "M"),
        ("IT", 80000, 85, "F"),
        ("IT", 85000, 88, "M"),
        ("IT", 90000, 92, "F"),
        ("Finance", 65000, 72, "M"),
        ("Finance", 70000, 75, "F"),
        ("Finance", 72000, 78, "M"),
    ],
    columns=["dept", "salary", "score", "gender"],
)
# ══════════════════════════════════════════════════════════════════════════════
# ── 1. barplot — Compare one summary value per category ───────────────────────
# Bar height = one number (the mean by default) computed from y in each category
# ══════════════════════════════════════════════════════════════════════════════
# ── color — single color for all bars ────────────────────────────────────────
sns.barplot(x="dept", y="salary", data=df, color="steelblue")
plt.title("Salary by Dept (single color)")
plt.show()
# Output: 3 bars — IT tallest (avg 85k), HR shortest (avg ~48k)
# Output: all bars in same steel blue color
# ── hue= — split each bar further by another category ────────────────────────
# hue adds a color split — one bar becomes two bars (one per gender)
sns.barplot(x="dept", y="salary", data=df, hue="gender")
plt.title("Average Salary by Dept and Gender")
# No plt.legend() call here: seaborn already draws a legend titled "gender"
# whenever hue= is set, and plt.legend() would replace it and lose that title.
plt.show() # Output: each dept has 2 bars — one for M, one for F
# ── estimator= — controls what value the bar height represents ────────────────
# By default barplot shows the MEAN (average) — estimator lets you change that
#
# ┌──────────────────────────┬──────────────────────────────────────────┐
# │ estimator= │ Bar height shows │
# ├──────────────────────────┼──────────────────────────────────────────┤
# │ "mean" (default) │ average of values in that group │
# │ "median" │ middle value of that group │
# │ "sum" │ total sum of values in that group │
# │ "min" │ smallest value in that group │
# │ "max" │ largest value in that group │
# └──────────────────────────┴──────────────────────────────────────────┘
# Draw the same dept/salary chart once per estimator; only the aggregation
# applied to each department's salaries changes between figures.
estimator_demos = (
    # (estimator, figure title)                           what the bars show
    ("mean", "Bar height = Average Salary (default)"),    # average salary per dept
    ("median", "Bar height = Median Salary"),             # middle salary — less affected by outliers
    ("sum", "Bar height = Total Salary (Sum)"),           # total salary of all employees per dept
    ("max", "Bar height = Highest Salary in each Dept"),  # highest single salary per dept
)
for stat_name, chart_title in estimator_demos:
    sns.barplot(data=df, x="dept", y="salary", estimator=stat_name)
    plt.title(chart_title)
    plt.show()  # Output: one figure per estimator, in the order listed above
# ══════════════════════════════════════════════════════════════════════════════
# ── 2. countplot — Count how many rows per category ───────────────────────────
# Does NOT need a y column — it counts occurrences automatically
# Bar height = number of rows in that category
# ══════════════════════════════════════════════════════════════════════════════
# Vertical bars: one bar per dept, height = how many rows carry that dept.
ax = sns.countplot(data=df, x="dept")
ax.set_title("Number of Employees per Department")
plt.show()  # Output: 3 bars of equal height (the sample data has 3 employees per dept)
# ── hue= — color split within each category ──────────────────────────────────
# Each dept is counted separately for every gender value.
ax = sns.countplot(data=df, x="dept", hue="gender")
ax.set_title("Employee Count by Dept and Gender")
plt.show()  # Output: per dept, one bar for the M count and one for the F count
# ── horizontal countplot ──────────────────────────────────────────────────────
# Putting the category on y= (instead of x=) turns the bars sideways.
ax = sns.countplot(data=df, y="dept")
ax.set_title("Horizontal Count Plot")
plt.show()  # Output: same counts, but bars run left to right
# ══════════════════════════════════════════════════════════════════════════════
# ── 3. boxplot — Shows spread and outliers of data ────────────────────────────
# Box = middle 50% of data (from the 25th to the 75th percentile)
# Line inside box = median (middle value)
# Whiskers = range of typical values (by default up to 1.5 × IQR beyond the box)
# Dots outside whiskers = outliers (unusual values)
# ══════════════════════════════════════════════════════════════════════════════
# One box per dept summarising that department's salaries.
ax = sns.boxplot(data=df, x="dept", y="salary")
ax.set_title("Salary Distribution by Department")
plt.show()  # Output: 3 boxes — the IT box sits highest; each box shows that dept's salary spread
# ── hue= — split boxes by gender ─────────────────────────────────────────────
ax = sns.boxplot(data=df, x="dept", y="salary", hue="gender")
ax.set_title("Salary Box Plot by Dept and Gender")
plt.show()  # Output: 2 boxes per dept — M and F next to each other
# ── palette= — set color theme ───────────────────────────────────────────────
# NOTE(review): seaborn 0.13+ reportedly emits a FutureWarning for palette=
# without hue= and suggests hue="dept", legend=False instead — confirm against
# the seaborn version in use before changing the call.
ax = sns.boxplot(data=df, x="dept", y="salary", palette="Set2")
ax.set_title("Box Plot with Color Palette")
plt.show()  # Output: every dept box gets its own color from the Set2 palette
# ══════════════════════════════════════════════════════════════════════════════
# ── 4. violinplot — Like boxplot but shows full shape of data ─────────────────
# Wider part = more data values in that range
# Narrow part = fewer data values
# Combines boxplot (inside) + KDE curve (outside shape)
# ══════════════════════════════════════════════════════════════════════════════
# One violin per dept; the outline follows the density of that dept's salaries.
ax = sns.violinplot(data=df, x="dept", y="salary")
ax.set_title("Salary Violin Plot by Department")
plt.show()  # Output: one violin per dept — the wider the shape, the more people at that salary
# ── hue= + split=True — compare two groups side by side in same violin ────────
# split=True draws each hue level as one half of a shared violin.
ax = sns.violinplot(data=df, x="dept", y="salary", hue="gender", split=True)
ax.set_title("Split Violin — M vs F per Dept")
plt.show()  # Output: every violin is half M (left side) and half F (right side)
# ── inner= — what to show inside the violin ───────────────────────────────────
# Each inner= style gets its own figure and title. Drawing all three on one
# axes before a single plt.show() would overlay the violins, so the styles
# could not be told apart.
inner_demos = (
    ("box", "box inside violin"),
    ("point", "dots inside violin"),
    ("stick", "lines inside violin"),
)
for inner_style, inner_desc in inner_demos:
    sns.violinplot(x="dept", y="salary", data=df, inner=inner_style)
    plt.title(f'Violin Plot — inner="{inner_style}" ({inner_desc})')
    plt.show() # Output: violin with the chosen inner marker style
# ══════════════════════════════════════════════════════════════════════════════
# ── 5. stripplot — Shows every individual data point ─────────────────────────
# Plots a dot for each row in the data
# Useful to see the actual data — not just averages or ranges
# ══════════════════════════════════════════════════════════════════════════════
# stripplot jitters points by default (jitter=True), so jitter is switched off
# here to show the raw, un-jittered layout that the next example improves on.
sns.stripplot(x="dept", y="salary", data=df, jitter=False)
plt.title("Every Salary as a Dot")
plt.show() # Output: dots stacked vertically per dept — each dot = one employee
# ── jitter=True — spread dots horizontally so they don't overlap ──────────────
# jitter=True is the default; it is spelled out here to contrast with the plot above.
sns.stripplot(x="dept", y="salary", data=df, jitter=True, color="purple", alpha=0.7)
plt.title("Strip Plot with Jitter")
plt.show() # Output: dots spread sideways — easier to see individual values
# ── combine with boxplot — show box + raw data together ──────────────────────
# Both layers are drawn on the same axes: the boxes first, then the raw points on top.
ax = sns.boxplot(data=df, x="dept", y="salary", palette="pastel")
sns.stripplot(data=df, x="dept", y="salary", color="black", size=5, jitter=True, ax=ax)
ax.set_title("Box Plot + Raw Data Points")
plt.show()  # Output: pastel boxes for the spread, with a black dot for every actual salary
# ══════════════════════════════════════════════════════════════════════════════
# ── 6. swarmplot — Like stripplot but dots never overlap ─────────────────────
# Automatically arranges dots side by side so every dot is visible
# Better than stripplot when you want to see all values clearly
# ══════════════════════════════════════════════════════════════════════════════
# One dot per employee, shifted sideways only as far as needed to stay visible.
ax = sns.swarmplot(data=df, x="dept", y="salary")
ax.set_title("Swarm Plot — No Overlapping Dots")
plt.show()  # Output: dots laid out like a swarm — every one visible, none hidden
# ── hue= — color dots by gender ──────────────────────────────────────────────
ax = sns.swarmplot(data=df, x="dept", y="salary", hue="gender")
ax.set_title("Swarm Plot colored by Gender")
plt.show()  # Output: same swarm, with dots colored by M/F and still not overlapping
# ══════════════════════════════════════════════════════════════════════════════
# ── 7. pointplot — Shows average with confidence interval as a line ───────────
# Like barplot but shows averages as dots connected by a line
# Useful to track how averages change across categories
# confidence interval = error bar showing how reliable the average is
# ══════════════════════════════════════════════════════════════════════════════
# One point per dept at the average salary, joined by a line across depts.
ax = sns.pointplot(data=df, x="dept", y="salary")
ax.set_title("Average Salary with Confidence Interval")
plt.show()  # Output: average dots joined by a line; vertical bars mark the confidence range
# ── hue= — separate line per gender ──────────────────────────────────────────
ax = sns.pointplot(data=df, x="dept", y="salary", hue="gender")
ax.set_title("Point Plot — Salary trend by Gender")
plt.show()  # Output: one line for M and one for F, comparing average salary across depts
# ══════════════════════════════════════════════════════════════════════════════
# ── When to use which plot ────────────────────────────────────────────────────
# ┌─────────────┬──────────────────────────────────────────┬─────────────────────────────┐
# │ Plot │ Use when │ Shows │
# ├─────────────┼──────────────────────────────────────────┼─────────────────────────────┤
# │ barplot │ compare averages across groups │ mean value per category │
# │ countplot │ count rows per group (no y needed) │ number of rows per category │
# │ boxplot │ see spread + outliers per group │ median, range, outliers │
# │ violinplot │ see full shape of data per group │ distribution shape + box │
# │ stripplot │ show every individual data point │ one dot per row │
# │ swarmplot │ show every point clearly, no overlaps │ one dot per row, spread out │
# │ pointplot │ track average trend across categories │ mean + confidence interval │
# └─────────────┴──────────────────────────────────────────┴─────────────────────────────┘
# ══════════════════════════════════════════════════════════════════════════════
# ══════════════════════════════════════════════════════════════════════════════
# ── Quick Reference ───────────────────────────────────────────────────────────
# ══════════════════════════════════════════════════════════════════════════════
# sns.barplot(x=, y=, data=) → average per category (bar height = mean)
# sns.countplot(x=, data=) → count rows per category (no y needed)
# sns.boxplot(x=, y=, data=) → box showing spread, median, outliers
# sns.violinplot(x=, y=, data=) → full shape of data per category
# sns.stripplot(x=, y=, data=) → every data point as a dot
# sns.swarmplot(x=, y=, data=) → every dot visible, no overlaps
# sns.pointplot(x=, y=, data=) → averages connected by a line
#
# hue="col" → color-split by another category column
# palette="Set2" → color theme (Set1, Set2, pastel, muted)
# plt.show() → display the plot
# ── 8. catplot — Universal Categorical Plot (replaces factorplot) ─────────────
# One function that can draw any categorical plot type using kind=
# Extra power: col= and row= create a grid of plots split by a category
#
# sns.factorplot() was the OLD name — renamed to catplot in seaborn 0.9
# ┌──────────────┬─────────────────────────────┐
# │ kind= │ Same as │
# ├──────────────┼─────────────────────────────┤
# │ "bar" │ barplot │
# │ "count" │ countplot │
# │ "box" │ boxplot │
# │ "violin" │ violinplot │
# │ "strip" │ stripplot │
# │ "swarm" │ swarmplot │
# │ "point" │ pointplot │
# └──────────────┴─────────────────────────────┘
# ══════════════════════════════════════════════════════════════════════════════
# ── kind= — switch between plot types ────────────────────────────────────────
# The x/y arguments stay the same; only kind= decides which plot is drawn.
xy_kind_demos = (
    ("bar", "catplot kind=bar (same as barplot)"),           # average salary bars per dept
    ("box", "catplot kind=box (same as boxplot)"),           # box plots per dept
    ("violin", "catplot kind=violin (same as violinplot)"),  # violin shapes per dept
)
for plot_kind, chart_title in xy_kind_demos:
    sns.catplot(data=df, x="dept", y="salary", kind=plot_kind)
    plt.title(chart_title)
    plt.show()  # Output: same picture as the matching sns.<kind>plot() call
# kind="count" takes no y column, exactly like countplot.
sns.catplot(data=df, x="dept", kind="count")
plt.title("catplot kind=count (same as countplot)")
plt.show()  # Output: number of employees per dept — same as sns.countplot()
# ── hue= — color split within each category ──────────────────────────────────
sns.catplot(data=df, x="dept", y="salary", kind="bar", hue="gender")
plt.title("catplot with hue — split by gender")
plt.show()  # Output: 2 bars per dept — one for M, one for F
# ── col= — create a separate plot per category automatically ──────────────────
# col= partitions the rows by the given column and draws one panel per unique
# value, side by side. The single-plot functions above have no such argument.
sns.catplot(data=df, x="dept", y="salary", col="gender", kind="box")
# suptitle labels the whole figure; y=1.02 lifts it above the per-panel titles.
plt.suptitle("Separate Box Plot per Gender", y=1.02)
plt.show()  # Output: two box-plot panels next to each other — one for M, one for F
# ── row= — stack plots vertically by category ─────────────────────────────────
# row= works like col= but places the panels one below the other.
sns.catplot(data=df, x="dept", y="salary", row="gender", kind="bar")
plt.suptitle("Stacked Bar Plots per Gender", y=1.02)
plt.show()  # Output: two bar-plot panels stacked vertically — one row per gender
No comments:
Post a Comment
Please comment below to give feedback or ask questions.