# Keep Learning: Python Seaborn

# Seaborn — Matrix Plots
# Matrix plot = a grid where rows and columns are categories/variables
#               and each cell is colored based on a value
# Used to see patterns, correlations, and relationships at a glance
# Two main matrix plots: heatmap and clustermap
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
# ══════════════════════════════════════════════════════════════════════════════
# ── 1. heatmap — Color Grid showing values ────────────────────────────────────
# Each cell = a number shown as a color
# Darker/brighter color = higher or lower value (depends on colormap)
# Great for: showing correlation between columns, confusion matrix, pivot tables
# ══════════════════════════════════════════════════════════════════════════════

# ── Simple heatmap from a 2D list ─────────────────────────────────────────────
# One dict entry per heatmap row: the key is the row label (y-axis) and the
# list holds that row's values; `columns` names the x-axis labels.
data = pd.DataFrame.from_dict(
    {
        "Row A": [10, 20, 30],
        "Row B": [40, 50, 60],
        "Row C": [70, 80, 90],
    },
    orient="index",
    columns=["Col 1", "Col 2", "Col 3"],
)
# data=pd.DataFrame({
#     "col1" : [10,40,70],
#     "col2" : [20,50,80],
#     "col3" : [30,60,90]
# })
# data.index=["Row A","Row B","Row C"]
# Plain heatmap with seaborn's defaults: no numbers in the cells, default colormap.
# sns.heatmap returns the Axes it drew on, so the title is set on that Axes.
ax = sns.heatmap(data)
ax.set_title("Basic Heatmap")
plt.show()          # Output: 3×3 color grid — with the default colormap, lighter cells = higher
                    # values (bottom row lightest); light-to-dark maps such as "Blues" flip this

# ── annot=True — write each cell's value on top of its color ──────────────────
ax = sns.heatmap(data, annot=True)
ax.set_title("Heatmap with Numbers")
plt.show()          # Output: same color grid, but every cell also displays its number

# ── fmt= — format string applied to the numbers written in the cells ──────────
# fmt only matters together with annot=True; "d" prints whole integers.
ax = sns.heatmap(data, annot=True, fmt="d")
ax.set_title("Heatmap with Integer Labels")
plt.show()          # Output: numbers shown as 10, 20, 30 (not 10.0, 20.0...)

# ── cmap= — choose the color theme (colormap) ─────────────────────────────────
# The same annotated heatmap is drawn once per theme; only cmap changes.
color_themes = (
    "Blues",     # light blue → dark blue      (low = light, high = dark)
    "YlOrRd",    # yellow → orange → red       (low = yellow, high = red)
    "coolwarm",  # blue → neutral middle → red (diverging; good for correlation)
    "Greens",    # light green → dark green    (low = light, high = dark)
)
for theme in color_themes:
    ax = sns.heatmap(data, annot=True, cmap=theme)
    ax.set_title(f"Heatmap with color theme {theme}")
    plt.show()      # Output: one figure per theme, colored as described next to its name

# ── linewidths= / linecolor= — draw separator lines between cells ─────────────
ax = sns.heatmap(
    data,
    annot=True,
    linewidths=0.5,      # thickness of the lines that divide the cells
    linecolor="white",   # color of those dividing lines
)
ax.set_title("Heatmap with Cell Borders")
plt.show()          # Output: thin white lines separate the cells — easier to read

# ── vmin / vmax — fix the color scale range ───────────────────────────────────
# vmin = value anchored to the low end of the colormap
# vmax = value anchored to the high end of the colormap
# Which end looks light or dark depends on the colormap: seaborn's default
# runs dark (low) → light (high), while e.g. "Blues" runs light → dark.
ax = sns.heatmap(data, annot=True, vmin=0, vmax=100)
ax.set_title("Heatmap with Fixed Color Scale (0 to 100)")
plt.show()          # Output: colors span 0 → 100 instead of the data's own 10 → 90 range,
                    # so 90 sits near (not at) the high end of the colormap
# ══════════════════════════════════════════════════════════════════════════════
# ── 2. Correlation Heatmap — most common real-world use ───────────────────────
# Correlation = how much two columns move together
#   +1.0  → both increase together (perfect positive)
#    0.0  → no relationship
#   -1.0  → one increases while other decreases (perfect negative)
# df.corr() calculates the pairwise correlation between columns
# (since pandas 2.0, non-numeric columns must be dropped or skipped with numeric_only=True)
# ══════════════════════════════════════════════════════════════════════════════
# Small sample where age, salary and experience rise together while score falls.
df = pd.DataFrame(
    dict(
        age=[22, 25, 30, 35, 40, 45, 50],
        salary=[30000, 35000, 50000, 60000, 72000, 80000, 90000],
        experience=[1, 2, 5, 8, 12, 18, 25],
        score=[85, 80, 75, 70, 65, 60, 55],
    )
)

# Square table (one row and one column per df column) holding the Pearson
# correlation of every column pair; "pearson" is pandas' default method.
corr = df.corr(method="pearson")

ax = sns.heatmap(
    corr,
    vmin=-1, vmax=1,    # pin the color scale to the full correlation range -1 … +1
    cmap="coolwarm",    # blue = negative, red = positive correlation
    annot=True,         # print the correlation value inside every cell
    fmt=".2f",          # two decimal places, e.g. 0.98
)
ax.set_title("Correlation Heatmap")
plt.show()          # Output: grid showing how strongly each pair of columns is related
                    # age vs salary ≈ +1.00 (strong positive), age vs score ≈ -1.00 (strong negative)

# ── mask= — hide the upper triangle (avoid duplicate info) ────────────────────
# A correlation table is symmetric: the top-right half repeats the bottom-left.
# Cells where the mask is True are not drawn. np.triu keeps the main diagonal,
# so the self-correlations on the diagonal are hidden along with the upper half.
mask = np.triu(np.ones_like(corr, dtype=bool))

ax = sns.heatmap(
    corr,
    mask=mask,
    vmin=-1, vmax=1,
    cmap="coolwarm",
    annot=True,
    fmt=".2f",
)
ax.set_title("Correlation Heatmap — Lower Triangle Only")
plt.show()          # Output: only the bottom-left half is drawn — cleaner, no repeated values
# ══════════════════════════════════════════════════════════════════════════════
# ── 3. clustermap — Heatmap with automatic grouping (clustering) ──────────────
# Same as heatmap BUT it reorders rows and columns automatically
# so that similar rows/columns are placed next to each other
# Dendrograms (tree diagrams) on top and left show which rows/cols are similar
# ══════════════════════════════════════════════════════════════════════════════
# ── Simple clustermap ────────────────────────────────────────────────────────
# Exam scores, written one row per student; columns are the subjects.
data2 = pd.DataFrame(
    [
        [90, 88, 45, 40],   # Alice
        [85, 80, 50, 45],   # Bob
        [40, 42, 85, 88],   # Carol
        [45, 50, 90, 92],   # Dave
        [70, 68, 55, 60],   # Eve
    ],
    index=["Alice", "Bob", "Carol", "Dave", "Eve"],
    columns=["Math", "Science", "History", "Art"],
)

# clustermap draws on a figure of its own, which becomes the current figure;
# the title is attached to that figure. y=1.02 (figure coordinates) lifts the
# title above the column dendrogram at the top.
sns.clustermap(data2)
cluster_fig = plt.gcf()
cluster_fig.suptitle("Clustermap — similar students and subjects grouped together", y=1.02)
plt.show()          # Output: heatmap with rows/cols reordered — students strong in Math/Science
                    # end up next to each other, as do students strong in History/Art

# ── annot=True — show values in cells ────────────────────────────────────────
sns.clustermap(data2, annot=True, fmt="d", cmap="YlOrRd")
cluster_fig = plt.gcf()
cluster_fig.suptitle("Clustermap with Values", y=1.02)
plt.show()          # Output: colored grid with the scores printed, similar rows/cols clustered
# ── standard_scale= — rescale the data before clustering ─────────────────────
# standard_scale=1 → every column is rescaled to run from 0 to 1
# (standard_scale=0 would do the same per row)
# Helpful when columns live on very different ranges (e.g. salary vs age)
sns.clustermap(data2, standard_scale=1, cmap="Blues")
cluster_fig = plt.gcf()   # the figure clustermap just created
cluster_fig.suptitle("Clustermap with Normalized Columns (0 to 1)", y=1.02)
plt.show()          # Output: each column scaled 0–1, so columns are compared on equal footing
# ── z_score= — normalize by row or column using z-score ──────────────────────
# z_score=0 → normalize each row    z_score=1 → normalize each column
# Shows which values are above/below average within each row or column
# center=0 is forwarded to heatmap and pins the colormap's midpoint to z = 0
# (the column average); without it the midpoint is halfway between the
# smallest and largest z-score, so a colour's sign could be misleading.
sns.clustermap(data2, z_score=1, cmap="coolwarm", center=0)
plt.suptitle("Clustermap with Z-score (above/below average per column)", y=1.02)
plt.show()          # Output: blue = below average, red = above average within each subject
# ══════════════════════════════════════════════════════════════════════════════
# ── heatmap vs clustermap ─────────────────────────────────────────────────────
# ┌─────────────┬──────────────────────────────────┬────────────────────────────────┐
# │             │ heatmap                          │ clustermap                     │
# ├─────────────┼──────────────────────────────────┼────────────────────────────────┤
# │ Row order   │ stays as-is                      │ reordered to group similar rows│
# │ Col order   │ stays as-is                      │ reordered to group similar cols│
# │ Dendrogram  │ no                               │ yes (tree on top and left)     │
# │ Best for    │ fixed grids like confusion matrix│ finding hidden patterns/groups │
# └─────────────┴──────────────────────────────────┴────────────────────────────────┘
# ══════════════════════════════════════════════════════════════════════════════

# ══════════════════════════════════════════════════════════════════════════════
# ── Quick Reference ───────────────────────────────────────────────────────────
# ══════════════════════════════════════════════════════════════════════════════

# sns.heatmap(data)                   → color grid from a 2D table
# sns.heatmap(data, annot=True)       → show values inside each cell
# sns.heatmap(data, annot=True, fmt="d")    → integer format inside cells (fmt needs annot=True)
# sns.heatmap(data, annot=True, fmt=".2f")  → 2 decimal format inside cells
# sns.heatmap(data, cmap="coolwarm")  → set color theme
# sns.heatmap(data, vmin=0, vmax=1)   → fix color scale range
# sns.heatmap(data, linewidths=0.5)   → borders between cells
# sns.heatmap(data, mask=mask)        → hide certain cells (e.g. upper triangle)
# df.corr()                           → correlation table between all numeric columns
#
# sns.clustermap(data)                → heatmap with auto-grouping of similar rows/cols
# sns.clustermap(data, standard_scale=1) → normalize columns 0 to 1
# sns.clustermap(data, z_score=1)     → show above/below average per column
#
# plt.show()                          → display the plot