# Python Pandas DataFrames

# A DataFrame is a collection of Series — each column is a Series sharing the same index.
# DataFrame is a 2D labeled table with rows and columns — like a spreadsheet or SQL table.
# From dictionary — most common way
# From list of dicts — each dict = one row
# From NumPy array — with column names
# Custom index — label rows
# Attributes — shape, dtypes, columns, index
# Viewing — head, tail, info, describe
# Selecting columns — single & multiple
# Selecting rows — iloc, loc
# Specific cell — iloc[row, col], loc[row, col]
# Boolean filtering — single & multiple conditions
# Add/remove columns — derived columns, drop
# Sorting — sort_values ascending/descending
# Missing values — isnull, dropna, fillna
# GroupBy — groupby, agg
import pandas as pd
import numpy as np

# ── 1. Creating a DataFrame from Dictionary ───────────────────────────────────
# Keys = column names, Values = list of column data

# Column-oriented construction: each dict entry maps a column name
# to the full list of that column's values.
data = {
    "Name": ["Alice", "Bob", "Carol"],
    "Age": [30, 25, 28],
    "Score": [90, 85, 92],
}
df = pd.DataFrame(data)
print(df)
#     Name  Age  Score
# 0  Alice   30     90
# 1    Bob   25     85
# 2  Carol   28     92

# ── 2. Creating from List of Dictionaries ─────────────────────────────────────
# Each dict = one row

# Row-oriented construction: each dict is one row; pandas unions the
# dict keys to form the column set.
rows = [
    {"Name": "Alice", "Age": 30, "Score": 90},
    {"Name": "Bob", "Age": 25, "Score": 85},
    {"Name": "Carol", "Age": 28, "Score": 92},
]
df = pd.DataFrame(rows)
print(df)
#     Name  Age  Score
# 0  Alice   30     90
# 1    Bob   25     85
# 2  Carol   28     92

# ── 3. Creating from NumPy Array ──────────────────────────────────────────────

# Same 3x3 matrix of 1..9, built from a range instead of nested lists.
arr = np.arange(1, 10).reshape(3, 3)
df = pd.DataFrame(arr, columns=["A", "B", "C"])  # attach column labels
print(df)
#    A  B  C
# 0  1  2  3
# 1  4  5  6
# 2  7  8  9

# ── 4. Custom Index ───────────────────────────────────────────────────────────

# index= replaces the default 0..n-1 integer row labels with custom ones.
row_labels = ["r1", "r2", "r3"]
df = pd.DataFrame(
    {"Name": ["Alice", "Bob", "Carol"], "Score": [90, 85, 92]},
    index=row_labels,
)
print(df)
#      Name  Score
# r1  Alice     90
# r2    Bob     85
# r3  Carol     92

# ── 5. DataFrame Attributes ───────────────────────────────────────────────────

df = pd.DataFrame({
    "Name": ["Alice", "Bob", "Carol"],
    "Age": [30, 25, 28],
    "Score": [90, 85, 92]
})

print(df.shape)             # (3, 3) → rows, cols
print(df.ndim)              # 2 → dimensions
print(df.size)              # 9 → total elements
print(df.dtypes)            # Name object, Age int64, Score int64
print(df.columns.tolist())  # ['Name', 'Age', 'Score']
print(df.index.tolist())    # [0, 1, 2]

# ── 6. Viewing Data ───────────────────────────────────────────────────────────

print(df.head(2))  # first 2 rows
print(df.tail(2))  # last 2 rows

# info() writes its report straight to stdout and returns None — wrapping it
# in print() would emit a stray "None" line, so call it directly.
df.info()
print(df.describe())  # count, mean, std, min, 25%, 50%, 75%, max

# ── 7. Selecting Columns ──────────────────────────────────────────────────────

print(df["Name"])             # single column → Series
print(df[["Name", "Score"]])  # list of columns → DataFrame

# ── 8. Selecting Rows ─────────────────────────────────────────────────────────

print(df.iloc[0])    # first row by position → Series
print(df.iloc[0:2])  # rows 0 and 1 by position (end EXCLUSIVE)
print(df.loc[0])     # row by label (labels here happen to be integers)
print(df.loc[0:1])   # rows 0 to 1 by label — loc slicing is INCLUSIVE

# ── 9. Selecting Specific Cell ────────────────────────────────────────────────

print(df.iloc[0, 1])     # 30 → row 0, col 1 (by position)
print(df.loc[0, "Age"])  # 30 → row 0, col "Age" (by label)

# ── 10. Boolean Filtering ─────────────────────────────────────────────────────

print(df[df["Age"] > 26])     # Alice, Carol
print(df[df["Score"] >= 90])  # Alice, Carol

# combine conditions with & / | and parentheses (not `and` / `or`)
print(df[(df["Age"] > 24) & (df["Score"] > 85)])  # Alice, Carol

# ── 11. Adding / Removing Columns ─────────────────────────────────────────────

df["Grade"] = ["A", "B", "A"]  # add a new column from a list
print(df)

df["Pass"] = df["Score"] >= 90  # derived boolean column
print(df["Pass"])               # 0 True, 1 False, 2 True

df.drop(columns=["Grade"], inplace=True)  # remove a column in place
print(df.columns.tolist())  # ['Name', 'Age', 'Score', 'Pass']

# ── 12. Sorting ───────────────────────────────────────────────────────────────

# sort_values returns a sorted COPY; df itself keeps its order
print(df.sort_values("Age"))                     # ascending: Bob, Carol, Alice
print(df.sort_values("Score", ascending=False))  # descending: Carol, Alice, Bob

# ── 13. Handling Missing Values ───────────────────────────────────────────────

df2 = pd.DataFrame({
    "Name": ["Alice", "Bob", "Carol"],
    "Age": [30, None, 28],    # None becomes NaN; Age is upcast to float64
    "Score": [90, 85, None]
})

print(df2.isnull())        # True where value is missing
print(df2.isnull().sum())  # Age 1, Score 1 → count of NaN per column

print(df2.dropna())        # drop rows with any NaN → only Alice survives
print(df2.dropna(axis=1))  # drop COLUMNS with any NaN → only Name survives
print(df2.dropna(axis=0))  # drop ROWS with any NaN (default, same as dropna())

print(df2.dropna(how="all"))  # drop rows where ALL values are NaN (none here)
print(df2.dropna(thresh=2))   # keep rows with ≥ 2 non-NaN values → all 3 kept

print(df2.fillna(0))                   # replace NaN with 0
print(df2.fillna(value="FILL VALUE"))  # replace NaN with a custom string label
print(df2.fillna(""))                  # empty string (useful for string columns)

# different fill per column: Age → 0, Score → column mean (87.5)
print(df2.fillna({"Age": 0, "Score": df2["Score"].mean()}))

# forward fill — copy value from the previous row.
# fillna(method="ffill") is deprecated (removed in pandas 3.0);
# the dedicated ffill()/bfill() methods are the supported spelling.
print(df2.ffill())
#     Name   Age  Score
# 0  Alice  30.0   90.0
# 1    Bob  30.0   85.0  ← Age filled from Alice
# 2  Carol  28.0   85.0  ← Score filled from Bob

# backward fill — copy value from the next row
print(df2.bfill())
#     Name   Age  Score
# 0  Alice  30.0   90.0
# 1    Bob  28.0   85.0  ← Age filled from Carol
# 2  Carol  28.0    NaN  ← no next row to fill from

# replace NaN with each numeric column's mean
print(df2.fillna(df2.mean(numeric_only=True)))
# Age mean = (30+28)/2 = 29.0, Score mean = (90+85)/2 = 87.5

# ── 14. Aggregate & GroupBy ───────────────────────────────────────────────────

# One list per column, assembled into the frame.
depts = ["HR", "IT", "HR", "IT", "HR"]
employees = ["Alice", "Bob", "Carol", "Dave", "Eve"]
salaries = [50000, 80000, 55000, 90000, 52000]
df3 = pd.DataFrame({"Dept": depts, "Name": employees, "Salary": salaries})

salary_by_dept = df3.groupby("Dept")["Salary"]  # reusable SeriesGroupBy

print(salary_by_dept.mean())                       # HR 52333.33, IT 85000.00
print(salary_by_dept.agg(["min", "max", "mean"]))  # several stats at once
print(df3.groupby("Dept").size())                  # rows per group: HR 3, IT 2
print(salary_by_dept.sum())                        # HR 157000, IT 170000
print(salary_by_dept.count())                      # non-null count per group

# Grouping by several keys produces a MultiIndex result.
df3["Level"] = ["Junior", "Senior", "Junior", "Senior", "Junior"]
print(df3.groupby(["Dept", "Level"])["Salary"].mean())
# HR/Junior 52333.33, IT/Senior 85000.00

# ── unique() & nunique() ─────────────────────────────────────────────────────
# unique() → returns array of unique values in a Series
# nunique() → returns COUNT of unique values (number only)

# Same data expressed as row tuples instead of column lists.
records = [
    ("HR", "Alice", 90),
    ("IT", "Bob", 85),
    ("HR", "Carol", 90),
    ("Finance", "Dave", 78),
    ("IT", "Eve", 85),
    ("HR", "Frank", 92),
]
df_u = pd.DataFrame(records, columns=["Dept", "Name", "Score"])

print(df_u["Dept"].unique())    # ['HR' 'IT' 'Finance'] → array of distinct values
print(df_u["Dept"].nunique())   # 3 → how many distinct values
print(df_u["Score"].unique())   # [90 85 78 92]
print(df_u["Score"].nunique())  # 4

print(df_u.nunique())           # distinct count for every column at once

# value_counts() additionally tells you HOW OFTEN each value occurs
print(df_u["Dept"].value_counts())  # HR 3, IT 2, Finance 1
print(df_u["Dept"].unique())        # just the values, no counts

# ── 15. Merging & Joining ─────────────────────────────────────────────────────
# Combine two DataFrames based on a common column (like SQL JOIN)

left = pd.DataFrame({
    "ID": [1, 2, 3, 4],
    "Name": ["Alice", "Bob", "Carol", "Dave"]
})

right = pd.DataFrame({
    "ID": [1, 2, 3, 5],
    "Salary": [90000, 85000, 92000, 70000]
})

# DataFrame.merge is the method form of pd.merge(left, right, ...).
# Default: inner join on the shared column name(s) — here "ID".
print(left.merge(right))  # IDs 1, 2, 3 match; 4 and 5 are dropped

# Merging on several columns: EVERY listed column must match for a row to survive.
left2 = pd.DataFrame({
    "ID": [1, 2, 3],
    "Dept": ["HR", "IT", "HR"],
    "Name": ["Alice", "Bob", "Carol"]
})
right2 = pd.DataFrame({
    "ID": [1, 2, 3],
    "Dept": ["HR", "IT", "Finance"],
    "Salary": [90000, 85000, 70000]
})

print(left2.merge(right2, on=["ID", "Dept"]))
# ID=3 is dropped: its Dept differs between the frames (HR vs Finance)

# inner join — only IDs present in both frames
print(left.merge(right, on="ID", how="inner"))

# left join — every left row kept; Dave (ID 4) gets NaN Salary
print(left.merge(right, on="ID", how="left"))

# right join — every right row kept; ID 5 has no Name
print(left.merge(right, on="ID", how="right"))

# outer join — union of IDs from both sides, NaN wherever unmatched
print(left.merge(right, on="ID", how="outer"))

# ── join() — join on index (not a column) ────────────────────────────────────
# df.join() is like pd.merge() but matches on the INDEX instead of a column
#
# | Method | Joins on | Default | Use when |
# |---------------------|-----------|-----------|----------------------|
# | left.join(right) | INDEX | left join | index is the key |
# | pd.merge(left,right)| COLUMN | inner join| column is the key |

# join() matches rows on the INDEX, unlike merge() which matches on columns.
# left.join(right) defaults to a LEFT join; pd.merge defaults to INNER.
left_j = pd.DataFrame({"Name": ["Alice", "Bob", "Carol"]}, index=[1, 2, 3])
right_j = pd.DataFrame({"Salary": [90000, 85000, 92000]}, index=[1, 2, 4])
# indices 1 and 2 exist in both; 3 only on the left, 4 only on the right

print(left_j.join(right_j))
# left join (default): all left indices kept; index 3 gets Salary NaN

print(left_j.join(right_j, how="inner"))
# inner join: only indices present in both frames (1 and 2)

print(left_j.join(right_j, how="outer"))
# outer join: all indices from both sides (1, 2, 3, 4) with NaN gaps

# ── 16. Concatenating ─────────────────────────────────────────────────────────
# Stack DataFrames vertically (rows) or horizontally (columns)

df_a = pd.DataFrame({"Name": ["Alice", "Bob"], "Score": [90, 85]})
df_b = pd.DataFrame({"Name": ["Carol", "Dave"], "Score": [92, 88]})

# axis=0 (default) stacks rows; ignore_index renumbers them 0..n-1
stacked_rows = pd.concat([df_a, df_b], ignore_index=True)
print(stacked_rows)  # Alice, Bob, Carol, Dave with a fresh index

# axis=1 places frames side by side, aligning on the row index
df_c = pd.DataFrame({"Grade": ["A", "B"]})
stacked_cols = pd.concat([df_a, df_c], axis=1)
print(stacked_cols)  # columns Name, Score, Grade

# ── 17. DataFrame Operations ──────────────────────────────────────────────────

df = pd.DataFrame({
    "Name": ["Alice", "Bob", "Carol"],
    "Age": [30, 25, 28],
    "Score": [90, 85, 92]
})

# apply() on a Series — run a function on every element
print(df["Score"].apply(lambda s: "Pass" if s >= 90 else "Fail"))
# Pass, Fail, Pass

# apply() on a DataFrame — the function receives one whole column at a time
print(df[["Age", "Score"]].apply(lambda c: c.max() - c.min()))  # Age 5, Score 7

# lower-case every name via the vectorized .str accessor
print(df["Name"].str.lower())  # alice, bob, carol

# rename() returns a relabeled copy; df itself is untouched
print(df.rename(columns={"Score": "Marks", "Age": "Years"}))

# astype() converts a column's dtype
df["Score"] = df["Score"].astype(float)
print(df["Score"].dtype)  # float64

# drop_duplicates() keeps the first occurrence of each repeated row
df_dup = pd.DataFrame({"Name": ["Alice", "Bob", "Alice"], "Score": [90, 85, 90]})
print(df_dup.drop_duplicates())  # Alice, Bob

# value_counts() → frequency of each value
print(df_dup["Name"].value_counts())  # Alice 2, Bob 1

# set_index() promotes a column to the row index; reset_index() undoes it
df2 = df.set_index("Name")
print(df2)               # Name is now the index, not a column
print(df2.reset_index()) # Name moved back to an ordinary column

# vectorized string operations via the .str accessor
df["Name_upper"] = df["Name"].str.upper()  # ALICE, BOB, CAROL
df["Name_len"] = df["Name"].str.len()      # 5, 3, 5
print(df[["Name", "Name_upper", "Name_len"]])

# ── 18. Data Input and Output ─────────────────────────────────────────────────

# ── CSV ───────────────────────────────────────────────────────────────────────
df = pd.DataFrame({
    "Name": ["Alice", "Bob", "Carol"],
    "Age": [30, 25, 28],
    "Score": [90, 85, 92]
})

df.to_csv("output.csv", index=False)  # index=False skips writing row numbers
df_csv = pd.read_csv("output.csv")    # load it back
print(df_csv)

# common read_csv options
# pd.read_csv("file.csv", sep=",")              # delimiter (default comma)
# pd.read_csv("file.csv", header=0)             # row to use as column names
# pd.read_csv("file.csv", usecols=["Name"])     # load specific columns only
# pd.read_csv("file.csv", nrows=100)            # load first 100 rows only
# pd.read_csv("file.csv", na_values=["NA","-"]) # treat these as NaN

# ── JSON ──────────────────────────────────────────────────────────────────────
df.to_json("output.json", orient="records")  # save as a list of row dicts
df_json = pd.read_json("output.json")        # load from JSON
print(df_json)

# ── Excel ─────────────────────────────────────────────────────────────────────
# requires: pip install openpyxl
# df.to_excel("output.xlsx", index=False, sheet_name="Sheet1")
# df_xl = pd.read_excel("output.xlsx", sheet_name="Sheet1")
# print(df_xl)

# ── Inspect loaded data ───────────────────────────────────────────────────────
print(df_csv.shape)    # (3, 3) → rows, cols
print(df_csv.dtypes)   # Name object, Age int64, Score int64
print(df_csv.head(2))  # first 2 rows
# info() prints its report itself and returns None — calling it inside
# print() would emit a stray "None" line, so invoke it directly.
df_csv.info()

# ── 19. MultiIndex ────────────────────────────────────────────────────────────
# MultiIndex = multiple levels of labels on rows or columns (hierarchical index)
# Useful for grouped/structured data like time series, regions, departments

# ── Creating MultiIndex DataFrame ─────────────────────────────────────────────

# Parallel label arrays: position i of every array describes row i.
arrays = [
["HR", "HR", "IT", "IT"], # level 0 — Dept
["Alice", "Bob", "Carol", "Dave"] # level 1 — Name
]
index = pd.MultiIndex.from_arrays(arrays, names=["Dept", "Name"])

df_mi = pd.DataFrame({"Salary": [50000, 55000, 80000, 90000],
"Score": [88, 92, 95, 87]}, index=index)
print(df_mi)
# Salary Score
# Dept Name
# HR Alice 50000 88
# Bob 55000 92
# IT Carol 80000 95
# Dave 90000 87

# ── from_tuples ───────────────────────────────────────────────────────────────
# One (Dept, Name) tuple per row — equivalent to from_arrays above.

tuples = [("HR", "Alice"), ("HR", "Bob"), ("IT", "Carol"), ("IT", "Dave")]
index2 = pd.MultiIndex.from_tuples(tuples, names=["Dept", "Name"])
print(index2)
# MultiIndex([('HR', 'Alice'), ('HR', 'Bob'), ('IT', 'Carol'), ('IT', 'Dave')],
# names=['Dept', 'Name'])

# ── from_product — all combinations ──────────────────────────────────────────
# Cartesian product: every Dept paired with every Quarter (2 × 2 = 4 entries).

index3 = pd.MultiIndex.from_product([["HR", "IT"], ["Q1", "Q2"]],
names=["Dept", "Quarter"])
print(index3)
# MultiIndex([('HR', 'Q1'), ('HR', 'Q2'), ('IT', 'Q1'), ('IT', 'Q2')],
# names=['Dept', 'Quarter'])

# ── Accessing MultiIndex data ─────────────────────────────────────────────────

print(df_mi.loc["HR"]) # all rows where Dept = HR (partial lookup on the OUTER level)
# Salary Score
# Name
# Alice 50000 88
# Bob 55000 92

print(df_mi.loc[("HR", "Alice")]) # specific row by both levels → Series
# Salary 50000
# Score 88

print(df_mi.loc["IT", "Salary"]) # IT dept, Salary column only
# Name
# Carol 80000
# Dave 90000

# ── xs — cross-section selector ───────────────────────────────────────────────
# xs() can select by value on ANY level, not just the outermost one.

print(df_mi.xs("Alice", level="Name")) # select across level by value
# Salary Score
# Dept
# HR 50000 88

# ── MultiIndex on columns ─────────────────────────────────────────────────────
# Columns can be hierarchical too: Year on the outer level, Quarter inner.

col_index = pd.MultiIndex.from_tuples([("2024", "Q1"), ("2024", "Q2"),
("2025", "Q1"), ("2025", "Q2")],
names=["Year", "Quarter"])
df_col = pd.DataFrame([[100, 110, 120, 130],
[200, 210, 220, 230]],
index=["Alice", "Bob"],
columns=col_index)
print(df_col)
# Year 2024 2025
# Quarter Q1 Q2 Q1 Q2
# Alice 100 110 120 130
# Bob 200 210 220 230

print(df_col["2024"]) # select year 2024 columns only (outer-level lookup)
# Quarter Q1 Q2
# Alice 100 110
# Bob 200 210

# ── GroupBy with MultiIndex ───────────────────────────────────────────────────
# level= groups by an index level instead of a column.

print(df_mi.groupby(level="Dept")["Salary"].mean())
# Dept
# HR 52500.0
# IT 85000.0

print(df_mi.groupby(level="Dept")["Score"].sum())
# Dept
# HR 180
# IT 182

# ── reset_index — flatten MultiIndex back to columns ─────────────────────────

print(df_mi.reset_index())
# Dept Name Salary Score
# 0 HR Alice 50000 88
# 1 HR Bob 55000 92
# 2 IT Carol 80000 95
# 3 IT Dave 90000 87

# ── stack / unstack — pivot between row and column levels ────────────────────
# NOTE(review): pandas ≥2.1 warns that stack() behavior will change
# (future_stack) — confirm the target pandas version before relying on it.

df_us = df_mi.unstack(level="Name") # moves Name level to columns
print(df_us)
# Salary Score
# Name Alice Bob Carol Dave Alice Bob Carol Dave
# Dept
# HR 50000 55000 NaN NaN 88 92 NaN NaN
# IT NaN NaN 80000 90000 NaN NaN 95 87

df_st = df_us.stack(level="Name") # moves Name back to rows
print(df_st) # restores original MultiIndex shape

# ── 20. zip() — Python Built-in ───────────────────────────────────────────────
# zip() pairs elements from multiple iterables together, index by index
# Returns an iterator of tuples — stops at the shortest iterable

# ── Basic zip ─────────────────────────────────────────────────────────────────

names = ["Alice", "Bob", "Carol"]
ages = [30, 25, 28]
scores = [90, 85, 92]

# zip() is a lazy iterator — list() materializes it (and exhausts it).
zipped = zip(names, ages)
print(list(zipped))  # [('Alice', 30), ('Bob', 25), ('Carol', 28)]

# zip works across any number of iterables, pairing items index by index.
for person in zip(names, ages, scores):
    name, age, score = person
    print(f"{name} | Age: {age} | Score: {score}")

# Pairing keys with values is a one-liner way to build a dict.
keys = ["Name", "Age", "Score"]
values = ["Alice", 30, 90]

d = {k: v for k, v in zip(keys, values)}
print(d)  # {'Name': 'Alice', 'Age': 30, 'Score': 90}

# A zip of row tuples feeds straight into a DataFrame.
df_zip = pd.DataFrame(zip(names, ages, scores), columns=["Name", "Age", "Score"])
print(df_zip)

# zip(*...) inverts the operation: list of pairs → tuple per position.
pairs = [("Alice", 90), ("Bob", 85), ("Carol", 92)]
names_out, scores_out = zip(*pairs)
print(names_out)   # ('Alice', 'Bob', 'Carol')
print(scores_out)  # (90, 85, 92)

# zip stops at the SHORTEST iterable...
a = [1, 2, 3, 4]
b = ["x", "y"]
print(list(zip(a, b)))  # [(1, 'x'), (2, 'y')] → stops at 2

# ...while zip_longest pads the shorter one with a fill value.
from itertools import zip_longest
print(list(zip_longest(a, b, fillvalue="-")))  # [(1,'x'), (2,'y'), (3,'-'), (4,'-')]

# ── 21. pivot_table ───────────────────────────────────────────────────────────
# Summarize data like an Excel pivot — group, aggregate, and reshape in one step
# Syntax: df.pivot_table(values, index, columns, aggfunc, fill_value)

df_pt = pd.DataFrame({
"Dept": ["HR", "IT", "HR", "IT", "HR", "IT"],
"Level": ["Junior", "Junior", "Senior", "Senior", "Junior", "Senior"],
"Gender": ["F", "M", "F", "M", "M", "F"],
"Salary": [50000, 80000, 70000, 90000, 52000, 88000]
})

# ── Basic pivot_table ─────────────────────────────────────────────────────────
# values → column to aggregate
# index → rows
# aggfunc → how to aggregate (default: mean)

print(df_pt.pivot_table(values="Salary", index="Dept"))
# Dept
# HR 57333.33 ← mean salary for HR
# IT 86000.00 ← mean salary for IT

# ── index + columns → 2D cross-table ─────────────────────────────────────────
# columns → spread unique values across columns

print(df_pt.pivot_table(values="Salary", index="Dept", columns="Level"))
# Level Junior Senior
# Dept
# HR 51000.0 70000.0
# IT 80000.0 89000.0

# ── aggfunc — change the aggregation function ─────────────────────────────────

print(df_pt.pivot_table(values="Salary", index="Dept", aggfunc="sum"))
# Dept
# HR 172000 ← total salary for HR
# IT 258000 ← total salary for IT

print(df_pt.pivot_table(values="Salary", index="Dept", aggfunc="count"))
# Dept
# HR 3
# IT 3

# a list of aggfuncs produces one column group per function
print(df_pt.pivot_table(values="Salary", index="Dept", aggfunc=["min","max","mean"]))
# min max mean
# Dept
# HR 50000 70000 57333.333333
# IT 80000 90000 86000.000000

# ── fill_value — replace NaN in result ───────────────────────────────────────

print(df_pt.pivot_table(values="Salary", index="Dept",
columns="Gender", aggfunc="mean", fill_value=0))
# Gender F M
# Dept
# HR 60000.0 52000.0
# IT 88000.0 85000.0

# ── margins — add row/column totals ──────────────────────────────────────────
# margins=True appends an "All" row and column holding the aggfunc
# computed over each column / row / the whole table.

print(df_pt.pivot_table(values="Salary", index="Dept",
columns="Level", aggfunc="mean", margins=True))
# Level Junior Senior All
# Dept
# HR 51000.000000 70000.000000 57333.333333
# IT 80000.000000 89000.000000 86000.000000
# All 60666.666667 82666.666667 71666.666667

# ── pivot_table vs groupby ────────────────────────────────────────────────────
# groupby → returns Series/DataFrame, flexible, multi-step
# pivot_table → returns 2D cross-table, Excel-like, single step

# groupby equivalent of pivot_table above:
print(df_pt.groupby(["Dept", "Level"])["Salary"].mean().unstack())
# Level Junior Senior
# Dept
# HR 51000.0 70000.0
# IT 80000.0 89000.0

# Please comment below to give feedback or ask questions.