# Seaborn — a Python library built on top of Matplotlib
# Makes statistical plots easier and prettier with less code
# import convention: import seaborn as sns
# Distribution Plot — shows how data is spread / how often values occur
# Useful to understand: shape, center, spread, and outliers of data
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
# ── Sample data used throughout ───────────────────────────────────────────────
ages = [22, 25, 25, 27, 28, 30, 30, 30, 32, 35, 35, 38, 40, 42, 45]
# Seed as its own statement: np.random.seed() returns None, so chaining it with
# `or` only worked by accident of truthiness and hid the seeding step.
np.random.seed(42)
# loc=70 → mean is 70, scale=10 → std deviation 10, size=200 → 200 values
scores = np.random.normal(loc=70, scale=10, size=200)
# ══════════════════════════════════════════════════════════════════════════════
# ── 1. histplot — Histogram ───────────────────────────────────────────────────
# Groups values into ranges (bins) and draws one bar per bin
# x-axis: the bins, y-axis: how many values landed in each bin
# ══════════════════════════════════════════════════════════════════════════════
# histplot returns the Axes it drew on, so labels can be set on it directly
age_ax = sns.histplot(ages, bins=5, color="steelblue")  # bins=5 → 5 intervals
age_ax.set(title="Age Distribution", xlabel="Age", ylabel="Count")
plt.show()
# ── bins — controls the number of intervals ───────────────────────────────────
# 3 bins  → few wide bars, coarse picture
# 10 bins → many narrow bars, finer breakdown
for n_bins in (3, 10):
    sns.histplot(ages, bins=n_bins)
    plt.show()
# ── kde=True — adds a smooth curve over the histogram ────────────────────────
# KDE = Kernel Density Estimate — a smooth line tracing the shape of the data
kde_hist_ax = sns.histplot(ages, bins=5, kde=True, color="teal")
kde_hist_ax.set_title("Histogram with KDE Curve")
plt.show()  # Output: bars + a smooth curved line on top
# ── stat= — changes what y-axis shows ────────────────────────────────────────
# Each stat is drawn in its own panel: stacking all four on one axes mixes
# incompatible y-axis units, so the differences would be impossible to read.
agess = [10, 20, 30, 10, 20, 30, 40]
stat_notes = {
    "count": "number of values per bin (default)",
    "frequency": "count divided by bin width",
    "density": "bars scaled so total area = 1",
    "probability": "bar heights sum to 1",
}
fig, axes = plt.subplots(2, 2, figsize=(10, 8))
for ax, (stat_name, note) in zip(axes.flat, stat_notes.items()):
    sns.histplot(agess, stat=stat_name, ax=ax)
    ax.set_title(f'stat="{stat_name}" — {note}')
fig.suptitle("changes what y-axis shows")
fig.tight_layout()  # keep panel titles from colliding with neighbouring axes
plt.show()  # Output: same bar shapes in every panel, only the y-axis scale differs
# ══════════════════════════════════════════════════════════════════════════════
# ── 2. kdeplot — Smooth Density Curve ────────────────────────────────────────
# KDE = Kernel Density Estimate
# Draws a smooth curve instead of bars — good for seeing the overall shape
# y-axis is density (not count) — the area under the curve equals 1
# ══════════════════════════════════════════════════════════════════════════════
# np.random.normal(loc, scale, size) — random numbers clustered around a center
#   loc=70   → center / average — most numbers land near 70
#   scale=10 → spread (standard deviation)
#              ~68% of values fall between 60–80 (70 ± 10)
#              ~95% of values fall between 50–90 (70 ± 20)
#   size=200 → generate 200 numbers
# Think of it as exam scores for 200 students: most near 70, few very high or low.
# A normal sample is just a convenient example — KDE works on any numeric data.
np.random.seed(42)  # fixed seed → same "random" scores on every run
scores = np.random.normal(loc=70, scale=10, size=200)
density_ax = sns.kdeplot(scores, color="blue")
density_ax.set(title="Score Distribution (KDE)", xlabel="Score", ylabel="Density")
plt.show()
# ── fill=True — fills area under the curve ───────────────────────────────────
# alpha sets transparency (0 = invisible, 1 = solid)
filled_ax = sns.kdeplot(scores, fill=True, color="skyblue", alpha=0.6)
filled_ax.set_title("KDE with Fill")
plt.show()  # Output: filled blue area under the curve
# ── bw_adjust — controls smoothness of the curve ─────────────────────────────
# bw_adjust < 1 → follows the data closely (bumpier)
# bw_adjust > 1 → smooths more aggressively (wider, flatter)
for bandwidth, curve_label in ((0.5, "Less smooth (0.5)"), (2.0, "More smooth (2.0)")):
    sns.kdeplot(scores, bw_adjust=bandwidth, label=curve_label)
plt.legend()
plt.title("KDE Smoothness Comparison")
plt.show()  # Output: two curves — one tighter, one wider/smoother
# ── Multiple KDE curves on same plot ─────────────────────────────────────────
group_a = np.random.normal(loc=60, scale=8, size=100)   # Group A: mean=60
group_b = np.random.normal(loc=80, scale=10, size=100)  # Group B: mean=80
for samples, group_label in ((group_a, "Group A"), (group_b, "Group B")):
    sns.kdeplot(samples, label=group_label, fill=True, alpha=0.4)
plt.legend()
plt.title("Two Groups Compared")
plt.show()  # Output: two overlapping filled curves — easy to compare groups
# ══════════════════════════════════════════════════════════════════════════════
# ── 3. distplot (deprecated) — modern replacements ───────────────────────────
# Old call                     → use this instead
# distplot(data)               → sns.histplot(data, kde=True) ✔
# distplot(data, hist=False)   → sns.kdeplot(data) ✔
# distplot(data, kde=False)    → sns.histplot(data) ✔
# ══════════════════════════════════════════════════════════════════════════════
np.random.seed(42)
scores = np.random.normal(loc=70, scale=10, size=200)
hist_kde_ax = sns.histplot(scores, kde=True, color="steelblue")
hist_kde_ax.set_title("histplot + kde=True")
plt.show()  # Output: bars with smooth curve on top — same look as old distplot
kde_only_ax = sns.kdeplot(scores, fill=True, color="teal")
kde_only_ax.set_title("kdeplot (replaces distplot kde-only mode)")
plt.show()  # Output: filled smooth density curve
# ══════════════════════════════════════════════════════════════════════════════
# ── 4. displot — Flexible Distribution Plot (combines hist + kde) ─────────────
# displot = distribution plot — one function that can draw hist, kde, or ecdf
# kind= picks which one
# ══════════════════════════════════════════════════════════════════════════════
np.random.seed(42)
scores = np.random.normal(loc=70, scale=10, size=200)
# kind="hist" → histogram bars (same picture as histplot)
# kind="kde"  → smooth density curve (same picture as kdeplot)
# kind="ecdf" → step curve rising from 0% to 100%: the share of data below each value
for plot_kind in ("hist", "kde", "ecdf"):
    sns.displot(scores, kind=plot_kind)
    plt.show()
# ── hist + kde together ───────────────────────────────────────────────────────
sns.displot(scores, kind="hist", kde=True, color="mediumseagreen")
plt.title("Histogram + KDE using displot")
plt.show()  # Output: bars with smooth line on top
# ══════════════════════════════════════════════════════════════════════════════
# ── 5. ecdfplot — Cumulative Distribution ────────────────────────────────────
# ECDF = Empirical Cumulative Distribution Function
# At each x value the curve's height is the fraction of data that is ≤ x
# Answers questions like: "what % of students scored below 75?"
# ══════════════════════════════════════════════════════════════════════════════
np.random.seed(42)
scores = np.random.normal(loc=70, scale=10, size=200)
ecdf_ax = sns.ecdfplot(scores, color="darkorange")
ecdf_ax.set(
    title="Cumulative Score Distribution",
    xlabel="Score",
    ylabel="Proportion (0 to 1)",
)
ecdf_ax.axhline(0.5, color="gray", linestyle="--")  # reference line at 50%
plt.show()  # Output: S-curve; where it crosses 0.5 = median score
# ══════════════════════════════════════════════════════════════════════════════
# ── 6. rugplot — Data Point Markers ──────────────────────────────────────────
# One small tick along the x-axis per data point
# Reveals exactly where the raw values sit — usually layered under a kde or hist
# ══════════════════════════════════════════════════════════════════════════════
small_data = [22, 25, 28, 30, 30, 33, 35, 38, 40]
rug_ax = sns.kdeplot(small_data, color="blue")
sns.rugplot(small_data, color="red", height=0.05)  # height = tick length (fraction of axes)
rug_ax.set(title="KDE + Rug Plot", xlabel="Value")
plt.show()  # Output: smooth curve with tiny red ticks below it showing raw values
# ══════════════════════════════════════════════════════════════════════════════
# ── 7. jointplot — Relationship + Distribution of Two Variables ───────────────
# Shows center plot (relationship between x and y)
# + side plots (distribution of x alone and y alone)
# Useful to see: how two variables relate AND how each is spread
# ══════════════════════════════════════════════════════════════════════════════
import pandas as pd
np.random.seed(42)
age = np.random.randint(22, 55, 100)  # 100 random integer ages from 22 to 54
# Salary rises with age, plus random noise. Without the noise every point lies
# on one straight line: the 2D KDE is then singular (seaborn cannot draw it) and
# the hist/hex/reg views collapse into a single diagonal streak.
salary = age * 1500 + np.random.normal(0, 5000, size=age.size)
df = pd.DataFrame({"age": age, "salary": salary})
# kind= selects what is drawn in the center panel; each entry maps kind → title
joint_titles = {
    # dots in center (default); histograms of age on top, salary on right
    "scatter": "scatter — dots showing each person",
    # contour lines around where points concentrate (like a topographic map);
    # smooth kde curves on the sides
    "kde": "kde — density contours (where most points are)",
    # space split into a grid of squares — darker square = more points there
    "hist": "hist — grid squares, darker = more values",
    # like hist but with hexagons — better when many points overlap
    "hex": "hex — hexagons, darker = more values (good for large data)",
    # scatter plus a straight best-fit line showing the overall trend;
    # side panels show histograms with kde curves
    "reg": "reg — scatter with trend line",
}
for joint_kind, joint_title in joint_titles.items():
    sns.jointplot(x="age", y="salary", data=df, kind=joint_kind)
    plt.suptitle(joint_title, y=1.02)  # y > 1 lifts the title above the top panel
    plt.show()
# ══════════════════════════════════════════════════════════════════════════════
# ── 8. pairplot — Distribution + Relationship for All Column Pairs ────────────
# Automatically plots every combination of numeric columns in a DataFrame
# Diagonal     → distribution of each column (how data is spread)
# Off-diagonal → relationship between each pair of columns (scatter)
#
# Example with 3 columns (age, salary, score):
#          age        salary     score
# age      [hist]     [scatter]  [scatter]
# salary   [scatter]  [hist]     [scatter]
# score    [scatter]  [scatter]  [hist]
# ══════════════════════════════════════════════════════════════════════════════
np.random.seed(42)
# Draw ages once and reuse them, so salary really is derived from the age column
# (drawing a second independent random array would make the two unrelated).
pair_age = np.random.randint(22, 55, 50)
df2 = pd.DataFrame({
    "age": pair_age,                                             # 50 random ages
    "salary": pair_age * 1500 + np.random.normal(0, 5000, 50),   # age-based + noise
    "score": np.random.randint(50, 100, 50),                     # random exam scores
    "dept": np.random.choice(["HR", "IT", "Finance"], 50),       # random department
})
numeric_df = df2[["age", "salary", "score"]]  # numeric columns only
# ── Basic pairplot ────────────────────────────────────────────────────────────
sns.pairplot(numeric_df)
plt.suptitle("Basic Pairplot", y=1.02)
plt.show()  # Output: 3×3 grid — diagonal has histograms, others have scatter dots
# ── diag_kind="kde" — smooth curve on diagonal instead of histogram ───────────
sns.pairplot(numeric_df, diag_kind="kde")
plt.suptitle("Pairplot with KDE on diagonal", y=1.02)
plt.show()  # Output: 3×3 grid — diagonal has smooth curves, others have scatter dots
# ── kind="reg" — scatter + trend line on off-diagonal ────────────────────────
sns.pairplot(numeric_df, kind="reg")
plt.suptitle("Pairplot with regression lines", y=1.02)
plt.show()  # Output: each scatter plot has a best-fit trend line drawn through it
# ── hue= — color-code points by a category column ────────────────────────────
# hue splits the data by a category and colors each group differently
sns.pairplot(df2, hue="dept")  # color by department
plt.suptitle("Pairplot colored by Department", y=1.02)
plt.show()  # Output: dots colored by HR/IT/Finance — easy to compare groups
# ══════════════════════════════════════════════════════════════════════════════
# ── Quick Reference ───────────────────────────────────────────────────────────
# ══════════════════════════════════════════════════════════════════════════════
# sns.histplot(data) → histogram bars (count per range)
# sns.histplot(data, kde=True) → histogram + smooth curve on top
# sns.histplot(data, bins=n) → control number of intervals
# sns.histplot(data, stat="density") → y-axis as density instead of count
#
# sns.kdeplot(data) → smooth density curve only
# sns.kdeplot(data, fill=True) → fill area under curve
# sns.kdeplot(data, bw_adjust=0.5) → adjust smoothness
#
# sns.distplot(data) → DEPRECATED ❌ — use histplot/kdeplot instead
# sns.histplot(data, kde=True) → modern replacement for distplot
#
# sns.displot(data, kind="hist") → histogram via displot
# sns.displot(data, kind="kde") → kde via displot
# sns.displot(data, kind="ecdf") → cumulative curve via displot
#
# sns.ecdfplot(data) → cumulative distribution (0 to 1)
# sns.rugplot(data) → tick marks for each data point
#
# sns.jointplot(x=, y=, data=) → relationship + distribution of 2 variables
# sns.jointplot(..., kind="scatter") → dots in center (default)
# sns.jointplot(..., kind="kde") → smooth density contours
# sns.jointplot(..., kind="hist") → 2D histogram grid
# sns.jointplot(..., kind="hex") → hexagonal bins (good for many overlapping points)
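# sns.jointplot(..., kind="reg") → scatter + regression line
#
# sns.pairplot(df) → grid of plots for every pair of numeric columns
# sns.pairplot(df, diag_kind="kde") → smooth curves on the diagonal
# sns.pairplot(df, kind="reg") → regression line on each off-diagonal scatter
# sns.pairplot(df, hue="col") → color points by a category column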
# No comments:
# Post a Comment
# Please comment below to give feedback or ask questions.