# Keep Learning: Python Seaborn - Distribution Plots

# Seaborn — a Python library built on top of Matplotlib

# Makes statistical plots easier and prettier with less code

# import convention: import seaborn as sns

# Distribution Plot — shows how data is spread / how often values occur

# Useful to understand: shape, center, spread, and outliers of data

import seaborn as sns

import matplotlib.pyplot as plt

import numpy as np

# ── Sample data used throughout ───────────────────────────────────────────────

# Small hand-written sample: 15 ages, some repeated, used by the histogram examples.
ages = [22, 25, 25, 27, 28, 30, 30, 30, 32, 35, 35, 38, 40, 42, 45]

# Seed the global NumPy random generator on its own line so the sample is reproducible.
# (np.random.seed() returns None, so chaining it with `or` only worked as a side effect.)
np.random.seed(42)
scores = np.random.normal(loc=70, scale=10, size=200)
# loc=70 → mean is 70,  scale=10 → std deviation 10,  size=200 → 200 values

# ══════════════════════════════════════════════════════════════════════════════

# ── 1. histplot — Histogram ───────────────────────────────────────────────────

# Shows how many times each value (or range of values) appears

# x-axis: value ranges (bins),  y-axis: count of values in that range

# ══════════════════════════════════════════════════════════════════════════════

# Histogram of the ages, split into 5 intervals (bins=5).
# histplot returns the Axes it drew on, so the labels are set on that object directly.
age_axes = sns.histplot(ages, bins=5, color="steelblue")
age_axes.set_title("Age Distribution")
age_axes.set_xlabel("Age")
age_axes.set_ylabel("Count")
plt.show()

# ── bins — controls the number of intervals ───────────────────────────────────
# Fewer bins → wider bars, less detail.  More bins → narrower bars, more detail.
# One figure is shown per setting: first 3 wide bars, then 10 narrow bars.
for bin_count in (3, 10):
    sns.histplot(ages, bins=bin_count)
    plt.show()

# ── kde=True — adds a smooth curve over the histogram ────────────────────────
# KDE = Kernel Density Estimate — a smooth line tracing the shape of the distribution
kde_hist_axes = sns.histplot(ages, bins=5, kde=True, color="teal")
kde_hist_axes.set_title("Histogram with KDE Curve")
plt.show()                  # Output: bars + a smooth curved line on top

# ── stat= — changes what y-axis shows ────────────────────────────────────────
# Each stat is drawn in its own subplot. Overlaying all four on a single axes would
# stack bars measured in different units on one shared y-axis, hiding the difference.
#   count       → number of values in each bin (default)
#   frequency   → count divided by the bin width
#   density     → scaled so the total area of the bars = 1 (comparable with a KDE curve)
#   probability → scaled so the bar heights sum to 1 (each bar = fraction of total)
agess   = [10,20,30,10,20,30,40]
fig, axes = plt.subplots(2, 2, figsize=(10, 8))
for ax, stat in zip(axes.flat, ["count", "frequency", "density", "probability"]):
    sns.histplot(agess, stat=stat, ax=ax)
    ax.set_title(f'stat="{stat}"')
fig.suptitle("changes what y-axis shows")
fig.tight_layout()          # keep subplot titles and axis labels from overlapping
plt.show()                  # Output: 2×2 grid — same bars, different y-axis scale per stat

# ══════════════════════════════════════════════════════════════════════════════
# ── 2. kdeplot — Smooth Density Curve ────────────────────────────────────────
# KDE = Kernel Density Estimate
# Instead of bars, shows a smooth curve — great for seeing the shape of data
# y-axis shows density (not count) — area under the curve = 1
# ══════════════════════════════════════════════════════════════════════════════

# Fix the seed so the same 200 "exam scores" are produced on every run.
np.random.seed(42)
# np.random.normal(loc, scale, size) — random numbers clustered around a center
#   loc   = 70  → center / average: most values land near 70
#   scale = 10  → spread: how far values stray from the center
#                 ~68% fall within 60–80 (70 ± 10), ~95% within 50–90 (70 ± 20)
#   size  = 200 → how many values to generate
# Normally distributed data is used because it gives a clean bell-shaped curve to look at;
# a KDE plot itself works for any continuous data, not only bell-shaped data.
# Picture it as exam scores for 200 students — most near 70, a few very high or very low.
scores = np.random.normal(70, 10, 200)
score_axes = sns.kdeplot(scores, color="blue")
score_axes.set(
    title="Score Distribution (KDE)",
    xlabel="Score",
    ylabel="Density",
)
plt.show()

# ── fill=True — fills area under the curve ───────────────────────────────────
# alpha controls transparency of the filled area (0 = invisible, 1 = solid).
fill_options = {"fill": True, "color": "skyblue", "alpha": 0.6}
filled_axes = sns.kdeplot(scores, **fill_options)
filled_axes.set_title("KDE with Fill")
plt.show()  # Output: filled blue area under the curve

# ── bw_adjust — controls smoothness of the curve ─────────────────────────────
# bw_adjust < 1 → follows the data more closely (more detail / jagged)
# bw_adjust > 1 → smoother and wider
# Both curves are drawn on the same axes so they can be compared directly.
for bandwidth, curve_label in ((0.5, "Less smooth (0.5)"), (2.0, "More smooth (2.0)")):
    sns.kdeplot(scores, bw_adjust=bandwidth, label=curve_label)
plt.legend()
plt.title("KDE Smoothness Comparison")
plt.show()                  # Output: two curves — one tighter, one wider/smoother

# ── Multiple KDE curves on same plot ─────────────────────────────────────────
# Two samples of 100 values each; group_a is drawn first, then group_b.
group_a = np.random.normal(loc=60, scale=8, size=100)     # Group A: mean=60
group_b = np.random.normal(loc=80, scale=10, size=100)    # Group B: mean=80

# Semi-transparent fills (alpha=0.4) keep the overlapping region visible.
for samples, group_label in ((group_a, "Group A"), (group_b, "Group B")):
    sns.kdeplot(samples, label=group_label, fill=True, alpha=0.4)
plt.legend()
plt.title("Two Groups Compared")
plt.show()                  # Output: two overlapping filled curves — easy to compare groups

# ── 3. distplot — deprecated; what to use instead ────────────────────────────
#   distplot(data)             → sns.histplot(data, kde=True)   ✔
#   distplot(data, hist=False) → sns.kdeplot(data)              ✔
#   distplot(data, kde=False)  → sns.histplot(data)             ✔
np.random.seed(42)
scores = np.random.normal(70, 10, 200)

# Replacement for the default distplot look: bars with a smooth curve on top.
hist_kde_axes = sns.histplot(scores, kde=True, color="steelblue")
hist_kde_axes.set_title("histplot + kde=True")
plt.show()

# Replacement for distplot's curve-only mode: a filled smooth density curve.
kde_only_axes = sns.kdeplot(scores, fill=True, color="teal")
kde_only_axes.set_title("kdeplot  (replaces distplot kde-only mode)")
plt.show()

# ══════════════════════════════════════════════════════════════════════════════
# ── 4. displot — Flexible Distribution Plot (combines hist + kde) ─────────────
# displot = distribution plot — one function that can draw hist, kde, or ecdf
# kind= switches between types
# ══════════════════════════════════════════════════════════════════════════════
np.random.seed(42)
scores = np.random.normal(70, 10, 200)

# One figure per kind:
#   "hist" → histogram bars (same as histplot)
#   "kde"  → smooth density curve (same as kdeplot)
#   "ecdf" → ECDF: what % of the data is below each value;
#            an S-shaped step curve going from 0% to 100%
for plot_kind in ("hist", "kde", "ecdf"):
    sns.displot(scores, kind=plot_kind)
    plt.show()

# ── hist + kde together ───────────────────────────────────────────────────────
hist_kde_options = {"kind": "hist", "kde": True, "color": "mediumseagreen"}
sns.displot(scores, **hist_kde_options)
plt.title("Histogram + KDE using displot")
plt.show()                          # Output: bars with smooth line on top

# ══════════════════════════════════════════════════════════════════════════════
# ── 5. ecdfplot — Cumulative Distribution ────────────────────────────────────
# ECDF = Empirical Cumulative Distribution Function
# For each value on x-axis, shows: what fraction of data is ≤ that value
# Useful to answer: "what % of students scored below 75?"
# ══════════════════════════════════════════════════════════════════════════════
np.random.seed(42)
scores = np.random.normal(70, 10, 200)

ecdf_axes = sns.ecdfplot(scores, color="darkorange")
ecdf_axes.set(
    title="Cumulative Score Distribution",
    xlabel="Score",
    ylabel="Proportion (0 to 1)",
)
# Dashed guide at 50%: the score where the curve crosses it is the median.
ecdf_axes.axhline(0.5, color="gray", linestyle="--")
plt.show()                          # Output: S-curve crossing the dashed line at the median score
# ══════════════════════════════════════════════════════════════════════════════
# ── 6. rugplot — Data Point Markers ──────────────────────────────────────────
# Draws a small tick mark on the x-axis for every single data point
# Shows exactly where each value sits — useful combined with kde or hist
# ══════════════════════════════════════════════════════════════════════════════

small_data = [22, 25, 28, 30, 30, 33, 35, 38, 40]

# Draw the smooth curve first, then add the rug ticks to the same axes.
rug_axes = sns.kdeplot(small_data, color="blue")
sns.rugplot(small_data, color="red", height=0.05, ax=rug_axes)  # height = tick mark size
rug_axes.set_title("KDE + Rug Plot")
rug_axes.set_xlabel("Value")
plt.show()   # Output: smooth curve with tiny red ticks along the x-axis marking raw values

# ══════════════════════════════════════════════════════════════════════════════
# ── 7. jointplot — Relationship + Distribution of Two Variables ───────────────
# Shows center plot (relationship between x and y)
# + side plots (distribution of x alone and y alone)
# Useful to see: how two variables relate AND how each is spread
# ══════════════════════════════════════════════════════════════════════════════

import pandas as pd

np.random.seed(42)
age    = np.random.randint(22, 55, 100)     # 100 random integer ages from 22 to 54 (upper bound 55 is excluded)
# Salary rises with age, plus random noise so the points scatter around the trend.
# With salary = age * 1500 exactly, every point lies on one straight line: the 2D density
# used by kind="kde" cannot be estimated from perfectly correlated data, and the other
# kinds would show nothing but a single diagonal line.
salary = age * 1500 + np.random.normal(0, 5000, 100)

df = pd.DataFrame({"age": age, "salary": salary})

# ── kind="scatter" — dots in center (default) ────────────────────────────────
sns.jointplot(x="age", y="salary", data=df, kind="scatter")
plt.suptitle("scatter — dots showing each person", y=1.02)   # y=1.02 lifts the title above the top marginal plot
plt.show()          # Output: scatter dots in center, histogram of age on top, salary on right

# ── kind="kde" — smooth density contours ─────────────────────────────────────
# Contour lines show where most data points are concentrated (like a topographic map)
sns.jointplot(x="age", y="salary", data=df, kind="kde")
plt.suptitle("kde — density contours (where most points are)", y=1.02)
plt.show()          # Output: contour rings in center, smooth kde curves on sides

# ── kind="hist" — 2D histogram grid ──────────────────────────────────────────
# Divides the space into a grid of squares — darker square = more points in that area
sns.jointplot(x="age", y="salary", data=df, kind="hist")
plt.suptitle("hist — grid squares, darker = more values", y=1.02)
plt.show()          # Output: colored grid squares in center, histograms on sides

# ── kind="hex" — hexagonal bins ──────────────────────────────────────────────
# Like hist but uses hexagons — better when many points overlap
# Darker hexagon = more data points in that area
sns.jointplot(x="age", y="salary", data=df, kind="hex")
plt.suptitle("hex — hexagons, darker = more values (good for large data)", y=1.02)
plt.show()          # Output: hexagonal grid in center, histograms on sides

# ── kind="reg" — scatter + regression line ───────────────────────────────────
# regression line = a straight line showing the overall trend in the data
sns.jointplot(x="age", y="salary", data=df, kind="reg")
plt.suptitle("reg — scatter with trend line", y=1.02)
plt.show()          # Output: dots with a best-fit line, histograms with kde curves on sides



# ══════════════════════════════════════════════════════════════════════════════
# ── 8. pairplot — Distribution + Relationship for All Column Pairs ────────────
# Automatically plots every combination of columns in a DataFrame
# Diagonal  → distribution of each column (how data is spread)
# Off-diagonal → relationship between each pair of columns (scatter)
#
# Example with 3 columns (age, salary, score):
#          age       salary    score
#  age   [hist]    [scatter] [scatter]
#  salary[scatter]   [hist]  [scatter]
#  score [scatter] [scatter]   [hist]
# ══════════════════════════════════════════════════════════════════════════════


np.random.seed(42)
# Ages are generated once and reused so that salary really is derived from the age column
# (drawing a second, independent randint for salary would make the two columns unrelated).
pair_ages = np.random.randint(22, 55, 50)                     # 50 random ages (22–54)
df2 = pd.DataFrame({
    "age":    pair_ages,
    "salary": pair_ages * 1500 + np.random.normal(0, 5000, 50),  # salary based on age, plus noise
    "score":  np.random.randint(50, 100, 50),                 # random exam scores (50–99)
    "dept":   np.random.choice(["HR", "IT", "Finance"], 50),  # random department
})

# ── Basic pairplot ────────────────────────────────────────────────────────────
sns.pairplot(df2[["age", "salary", "score"]])   # pass only numeric columns
plt.suptitle("Basic Pairplot", y=1.02)          # y=1.02 lifts the title above the grid
plt.show()          # Output: 3×3 grid — diagonal has histograms, others have scatter dots

# ── diag_kind="kde" — smooth curve on diagonal instead of histogram ───────────
sns.pairplot(df2[["age", "salary", "score"]], diag_kind="kde")
plt.suptitle("Pairplot with KDE on diagonal", y=1.02)
plt.show()          # Output: 3×3 grid — diagonal has smooth curves, others have scatter dots

# ── kind="reg" — scatter + trend line on off-diagonal ────────────────────────
sns.pairplot(df2[["age", "salary", "score"]], kind="reg")
plt.suptitle("Pairplot with regression lines", y=1.02)
plt.show()          # Output: each scatter plot has a best-fit trend line drawn through it

# ── hue= — color-code points by a category column ────────────────────────────
# hue splits the data by a category and colors each group differently
# The non-numeric "dept" column is used only for coloring, not as a plotted variable
sns.pairplot(df2, hue="dept")                   # color by department
plt.suptitle("Pairplot colored by Department", y=1.02)
plt.show()          # Output: dots colored by HR/IT/Finance; diagonal shows one kde curve per group


# ══════════════════════════════════════════════════════════════════════════════
# ── Quick Reference ───────────────────────────────────────────────────────────
# ══════════════════════════════════════════════════════════════════════════════

# sns.histplot(data)                  → histogram bars (count per range)
# sns.histplot(data, kde=True)        → histogram + smooth curve on top
# sns.histplot(data, bins=n)          → control number of intervals
# sns.histplot(data, stat="density")  → y-axis as density instead of count
#
# sns.kdeplot(data)                   → smooth density curve only
# sns.kdeplot(data, fill=True)        → fill area under curve
# sns.kdeplot(data, bw_adjust=0.5)    → adjust smoothness
#
# sns.distplot(data)                  → DEPRECATED ❌ — use histplot/kdeplot instead
# sns.histplot(data, kde=True)        → modern replacement for distplot
#
# sns.displot(data, kind="hist")      → histogram via displot
# sns.displot(data, kind="kde")       → kde via displot
# sns.displot(data, kind="ecdf")      → cumulative curve via displot
#
# sns.ecdfplot(data)                  → cumulative distribution (0 to 1)
# sns.rugplot(data)                   → tick marks for each data point
#
# sns.jointplot(x=, y=, data=)        → relationship + distribution of 2 variables
# sns.jointplot(..., kind="scatter")  → dots in center (default)
# sns.jointplot(..., kind="kde")      → smooth density contours
# sns.jointplot(..., kind="hist")     → 2D histogram grid
# sns.jointplot(..., kind="hex")      → hexagonal bins (good for many overlapping points)
# sns.jointplot(..., kind="reg")      → scatter + regression line