# Python Pandas DataFrames

# A DataFrame is a collection of Series — each column is a Series sharing the same index.
# DataFrame is a 2D labeled table with rows and columns — like a spreadsheet or SQL table.
# From dictionary — most common way
# From list of dicts — each dict = one row
# From NumPy array — with column names
# Custom index — label rows
# Attributes — shape, dtypes, columns, index
# Viewing — head, tail, info, describe
# Selecting columns — single & multiple
# Selecting rows — iloc, loc
# Specific cell — iloc[row, col], loc[row, col]
# Boolean filtering — single & multiple conditions
# Add/remove columns — derived columns, drop
# Sorting — sort_values ascending/descending
# Missing values — isnull, dropna, fillna
# GroupBy — groupby, agg
import pandas as pd
import numpy as np

# ── 1. Creating a DataFrame from Dictionary ───────────────────────────────────
# Keys = column names, Values = list of column data

# Column-oriented construction: each dict entry maps a column name
# to the full list of that column's values.
data = {
    "Name": ["Alice", "Bob", "Carol"],
    "Age": [30, 25, 28],
    "Score": [90, 85, 92],
}
df = pd.DataFrame(data)
print(df)
#     Name  Age  Score
# 0  Alice   30     90
# 1    Bob   25     85
# 2  Carol   28     92

# ── 2. Creating from List of Dictionaries ─────────────────────────────────────
# Each dict = one row

# Row-oriented construction: each dict is one row; pandas unions the
# dict keys to form the column set.
rows = [
    {"Name": "Alice", "Age": 30, "Score": 90},
    {"Name": "Bob", "Age": 25, "Score": 85},
    {"Name": "Carol", "Age": 28, "Score": 92},
]
df = pd.DataFrame(rows)
print(df)
#     Name  Age  Score
# 0  Alice   30     90
# 1    Bob   25     85
# 2  Carol   28     92

# ── 3. Creating from NumPy Array ──────────────────────────────────────────────

# Same 3x3 matrix of 1..9, built from a range instead of nested lists.
arr = np.arange(1, 10).reshape(3, 3)
df = pd.DataFrame(arr, columns=["A", "B", "C"])  # attach column labels
print(df)
#    A  B  C
# 0  1  2  3
# 1  4  5  6
# 2  7  8  9

# ── 4. Custom Index ───────────────────────────────────────────────────────────

# index= replaces the default 0..n-1 integer row labels with custom ones.
row_labels = ["r1", "r2", "r3"]
df = pd.DataFrame(
    {"Name": ["Alice", "Bob", "Carol"], "Score": [90, 85, 92]},
    index=row_labels,
)
print(df)
#      Name  Score
# r1  Alice     90
# r2    Bob     85
# r3  Carol     92

# ── 5. DataFrame Attributes ───────────────────────────────────────────────────

df = pd.DataFrame({
    "Name": ["Alice", "Bob", "Carol"],
    "Age": [30, 25, 28],
    "Score": [90, 85, 92]
})

print(df.shape)             # (3, 3) → rows, cols
print(df.ndim)              # 2 → dimensions
print(df.size)              # 9 → total elements
print(df.dtypes)            # Name object, Age int64, Score int64
print(df.columns.tolist())  # ['Name', 'Age', 'Score']
print(df.index.tolist())    # [0, 1, 2]

# ── 6. Viewing Data ───────────────────────────────────────────────────────────

print(df.head(2))  # first 2 rows
print(df.tail(2))  # last 2 rows

# info() writes its report straight to stdout and returns None — wrapping it
# in print() would emit a stray "None" line, so call it directly.
df.info()
print(df.describe())  # count, mean, std, min, 25%, 50%, 75%, max

# ── 7. Selecting Columns ──────────────────────────────────────────────────────

print(df["Name"])             # single column → Series
print(df[["Name", "Score"]])  # list of columns → DataFrame

# ── 8. Selecting Rows ─────────────────────────────────────────────────────────

print(df.iloc[0])    # first row by position → Series
print(df.iloc[0:2])  # rows 0 and 1 by position (end EXCLUSIVE)
print(df.loc[0])     # row by label (labels here happen to be integers)
print(df.loc[0:1])   # rows 0 to 1 by label — loc slicing is INCLUSIVE

# ── 9. Selecting Specific Cell ────────────────────────────────────────────────

print(df.iloc[0, 1])     # 30 → row 0, col 1 (by position)
print(df.loc[0, "Age"])  # 30 → row 0, col "Age" (by label)

# ── 10. Boolean Filtering ─────────────────────────────────────────────────────

print(df[df["Age"] > 26])     # Alice, Carol
print(df[df["Score"] >= 90])  # Alice, Carol

# combine conditions with & / | and parentheses (not `and` / `or`)
print(df[(df["Age"] > 24) & (df["Score"] > 85)])  # Alice, Carol

# ── 11. Adding / Removing Columns ─────────────────────────────────────────────

df["Grade"] = ["A", "B", "A"]  # add a new column from a list
print(df)

df["Pass"] = df["Score"] >= 90  # derived boolean column
print(df["Pass"])               # 0 True, 1 False, 2 True

df.drop(columns=["Grade"], inplace=True)  # remove a column in place
print(df.columns.tolist())  # ['Name', 'Age', 'Score', 'Pass']

# ── 12. Sorting ───────────────────────────────────────────────────────────────

# sort_values returns a sorted COPY; df itself keeps its order
print(df.sort_values("Age"))                     # ascending: Bob, Carol, Alice
print(df.sort_values("Score", ascending=False))  # descending: Carol, Alice, Bob

# ── 13. Handling Missing Values ───────────────────────────────────────────────

df2 = pd.DataFrame({
    "Name": ["Alice", "Bob", "Carol"],
    "Age": [30, None, 28],    # None becomes NaN; Age is upcast to float64
    "Score": [90, 85, None]
})

print(df2.isnull())        # True where value is missing
print(df2.isnull().sum())  # Age 1, Score 1 → count of NaN per column

print(df2.dropna())        # drop rows with any NaN → only Alice survives
print(df2.dropna(axis=1))  # drop COLUMNS with any NaN → only Name survives
print(df2.dropna(axis=0))  # drop ROWS with any NaN (default, same as dropna())

print(df2.dropna(how="all"))  # drop rows where ALL values are NaN (none here)
print(df2.dropna(thresh=2))   # keep rows with ≥ 2 non-NaN values → all 3 kept

print(df2.fillna(0))                   # replace NaN with 0
print(df2.fillna(value="FILL VALUE"))  # replace NaN with a custom string label
print(df2.fillna(""))                  # empty string (useful for string columns)

# different fill per column: Age → 0, Score → column mean (87.5)
print(df2.fillna({"Age": 0, "Score": df2["Score"].mean()}))

# forward fill — copy value from the previous row.
# fillna(method="ffill") is deprecated (removed in pandas 3.0);
# the dedicated ffill()/bfill() methods are the supported spelling.
print(df2.ffill())
#     Name   Age  Score
# 0  Alice  30.0   90.0
# 1    Bob  30.0   85.0  ← Age filled from Alice
# 2  Carol  28.0   85.0  ← Score filled from Bob

# backward fill — copy value from the next row
print(df2.bfill())
#     Name   Age  Score
# 0  Alice  30.0   90.0
# 1    Bob  28.0   85.0  ← Age filled from Carol
# 2  Carol  28.0    NaN  ← no next row to fill from

# replace NaN with each numeric column's mean
print(df2.fillna(df2.mean(numeric_only=True)))
# Age mean = (30+28)/2 = 29.0, Score mean = (90+85)/2 = 87.5

# ── 14. Aggregate & GroupBy ───────────────────────────────────────────────────

# One list per column, assembled into the frame.
depts = ["HR", "IT", "HR", "IT", "HR"]
employees = ["Alice", "Bob", "Carol", "Dave", "Eve"]
salaries = [50000, 80000, 55000, 90000, 52000]
df3 = pd.DataFrame({"Dept": depts, "Name": employees, "Salary": salaries})

salary_by_dept = df3.groupby("Dept")["Salary"]  # reusable SeriesGroupBy

print(salary_by_dept.mean())                       # HR 52333.33, IT 85000.00
print(salary_by_dept.agg(["min", "max", "mean"]))  # several stats at once
print(df3.groupby("Dept").size())                  # rows per group: HR 3, IT 2
print(salary_by_dept.sum())                        # HR 157000, IT 170000
print(salary_by_dept.count())                      # non-null count per group

# Grouping by several keys produces a MultiIndex result.
df3["Level"] = ["Junior", "Senior", "Junior", "Senior", "Junior"]
print(df3.groupby(["Dept", "Level"])["Salary"].mean())
# HR/Junior 52333.33, IT/Senior 85000.00

# ── unique() & nunique() ─────────────────────────────────────────────────────
# unique() → returns array of unique values in a Series
# nunique() → returns COUNT of unique values (number only)

# Same data expressed as row tuples instead of column lists.
records = [
    ("HR", "Alice", 90),
    ("IT", "Bob", 85),
    ("HR", "Carol", 90),
    ("Finance", "Dave", 78),
    ("IT", "Eve", 85),
    ("HR", "Frank", 92),
]
df_u = pd.DataFrame(records, columns=["Dept", "Name", "Score"])

print(df_u["Dept"].unique())    # ['HR' 'IT' 'Finance'] → array of distinct values
print(df_u["Dept"].nunique())   # 3 → how many distinct values
print(df_u["Score"].unique())   # [90 85 78 92]
print(df_u["Score"].nunique())  # 4

print(df_u.nunique())           # distinct count for every column at once

# value_counts() additionally tells you HOW OFTEN each value occurs
print(df_u["Dept"].value_counts())  # HR 3, IT 2, Finance 1
print(df_u["Dept"].unique())        # just the values, no counts

# ── 15. Merging & Joining ─────────────────────────────────────────────────────
# Combine two DataFrames based on a common column (like SQL JOIN)

left = pd.DataFrame({
    "ID": [1, 2, 3, 4],
    "Name": ["Alice", "Bob", "Carol", "Dave"]
})

right = pd.DataFrame({
    "ID": [1, 2, 3, 5],
    "Salary": [90000, 85000, 92000, 70000]
})

# DataFrame.merge is the method form of pd.merge(left, right, ...).
# Default: inner join on the shared column name(s) — here "ID".
print(left.merge(right))  # IDs 1, 2, 3 match; 4 and 5 are dropped

# Merging on several columns: EVERY listed column must match for a row to survive.
left2 = pd.DataFrame({
    "ID": [1, 2, 3],
    "Dept": ["HR", "IT", "HR"],
    "Name": ["Alice", "Bob", "Carol"]
})
right2 = pd.DataFrame({
    "ID": [1, 2, 3],
    "Dept": ["HR", "IT", "Finance"],
    "Salary": [90000, 85000, 70000]
})

print(left2.merge(right2, on=["ID", "Dept"]))
# ID=3 is dropped: its Dept differs between the frames (HR vs Finance)

# inner join — only IDs present in both frames
print(left.merge(right, on="ID", how="inner"))

# left join — every left row kept; Dave (ID 4) gets NaN Salary
print(left.merge(right, on="ID", how="left"))

# right join — every right row kept; ID 5 has no Name
print(left.merge(right, on="ID", how="right"))

# outer join — union of IDs from both sides, NaN wherever unmatched
print(left.merge(right, on="ID", how="outer"))

# ── join() — join on index (not a column) ────────────────────────────────────
# df.join() is like pd.merge() but matches on the INDEX instead of a column
#
# | Method | Joins on | Default | Use when |
# |---------------------|-----------|-----------|----------------------|
# | left.join(right) | INDEX | left join | index is the key |
# | pd.merge(left,right)| COLUMN | inner join| column is the key |

# join() matches rows on the INDEX, unlike merge() which matches on columns.
# left.join(right) defaults to a LEFT join; pd.merge defaults to INNER.
left_j = pd.DataFrame({"Name": ["Alice", "Bob", "Carol"]}, index=[1, 2, 3])
right_j = pd.DataFrame({"Salary": [90000, 85000, 92000]}, index=[1, 2, 4])
# indices 1 and 2 exist in both; 3 only on the left, 4 only on the right

print(left_j.join(right_j))
# left join (default): all left indices kept; index 3 gets Salary NaN

print(left_j.join(right_j, how="inner"))
# inner join: only indices present in both frames (1 and 2)

print(left_j.join(right_j, how="outer"))
# outer join: all indices from both sides (1, 2, 3, 4) with NaN gaps

# ── 16. Concatenating ─────────────────────────────────────────────────────────
# Stack DataFrames vertically (rows) or horizontally (columns)

df_a = pd.DataFrame({"Name": ["Alice", "Bob"], "Score": [90, 85]})
df_b = pd.DataFrame({"Name": ["Carol", "Dave"], "Score": [92, 88]})

# axis=0 (default) stacks rows; ignore_index renumbers them 0..n-1
stacked_rows = pd.concat([df_a, df_b], ignore_index=True)
print(stacked_rows)  # Alice, Bob, Carol, Dave with a fresh index

# axis=1 places frames side by side, aligning on the row index
df_c = pd.DataFrame({"Grade": ["A", "B"]})
stacked_cols = pd.concat([df_a, df_c], axis=1)
print(stacked_cols)  # columns Name, Score, Grade

# ── 17. DataFrame Operations ──────────────────────────────────────────────────

df = pd.DataFrame({
    "Name": ["Alice", "Bob", "Carol"],
    "Age": [30, 25, 28],
    "Score": [90, 85, 92]
})

# apply() on a Series — run a function on every element
print(df["Score"].apply(lambda s: "Pass" if s >= 90 else "Fail"))
# Pass, Fail, Pass

# apply() on a DataFrame — the function receives one whole column at a time
print(df[["Age", "Score"]].apply(lambda c: c.max() - c.min()))  # Age 5, Score 7

# lower-case every name via the vectorized .str accessor
print(df["Name"].str.lower())  # alice, bob, carol

# rename() returns a relabeled copy; df itself is untouched
print(df.rename(columns={"Score": "Marks", "Age": "Years"}))

# astype() converts a column's dtype
df["Score"] = df["Score"].astype(float)
print(df["Score"].dtype)  # float64

# drop_duplicates() keeps the first occurrence of each repeated row
df_dup = pd.DataFrame({"Name": ["Alice", "Bob", "Alice"], "Score": [90, 85, 90]})
print(df_dup.drop_duplicates())  # Alice, Bob

# value_counts() → frequency of each value
print(df_dup["Name"].value_counts())  # Alice 2, Bob 1

# set_index() promotes a column to the row index; reset_index() undoes it
df2 = df.set_index("Name")
print(df2)               # Name is now the index, not a column
print(df2.reset_index()) # Name moved back to an ordinary column

# vectorized string operations via the .str accessor
df["Name_upper"] = df["Name"].str.upper()  # ALICE, BOB, CAROL
df["Name_len"] = df["Name"].str.len()      # 5, 3, 5
print(df[["Name", "Name_upper", "Name_len"]])

# ── 18. Data Input and Output ─────────────────────────────────────────────────

# ── CSV ───────────────────────────────────────────────────────────────────────
df = pd.DataFrame({
    "Name": ["Alice", "Bob", "Carol"],
    "Age": [30, 25, 28],
    "Score": [90, 85, 92]
})

df.to_csv("output.csv", index=False)  # index=False skips writing row numbers
df_csv = pd.read_csv("output.csv")    # load it back
print(df_csv)

# common read_csv options
# pd.read_csv("file.csv", sep=",")              # delimiter (default comma)
# pd.read_csv("file.csv", header=0)             # row to use as column names
# pd.read_csv("file.csv", usecols=["Name"])     # load specific columns only
# pd.read_csv("file.csv", nrows=100)            # load first 100 rows only
# pd.read_csv("file.csv", na_values=["NA","-"]) # treat these as NaN

# ── JSON ──────────────────────────────────────────────────────────────────────
df.to_json("output.json", orient="records")  # save as a list of row dicts
df_json = pd.read_json("output.json")        # load from JSON
print(df_json)

# ── Excel ─────────────────────────────────────────────────────────────────────
# requires: pip install openpyxl
# df.to_excel("output.xlsx", index=False, sheet_name="Sheet1")
# df_xl = pd.read_excel("output.xlsx", sheet_name="Sheet1")
# print(df_xl)

# ── Inspect loaded data ───────────────────────────────────────────────────────
print(df_csv.shape)    # (3, 3) → rows, cols
print(df_csv.dtypes)   # Name object, Age int64, Score int64
print(df_csv.head(2))  # first 2 rows
# info() prints its report itself and returns None — calling it inside
# print() would emit a stray "None" line, so invoke it directly.
df_csv.info()

# ── 19. MultiIndex ────────────────────────────────────────────────────────────
# MultiIndex = multiple levels of labels on rows or columns (hierarchical index)
# Useful for grouped/structured data like time series, regions, departments

# ── Creating MultiIndex DataFrame ─────────────────────────────────────────────

# Parallel label arrays: position i of every array describes row i.
arrays = [
["HR", "HR", "IT", "IT"], # level 0 — Dept
["Alice", "Bob", "Carol", "Dave"] # level 1 — Name
]
index = pd.MultiIndex.from_arrays(arrays, names=["Dept", "Name"])

df_mi = pd.DataFrame({"Salary": [50000, 55000, 80000, 90000],
"Score": [88, 92, 95, 87]}, index=index)
print(df_mi)
# Salary Score
# Dept Name
# HR Alice 50000 88
# Bob 55000 92
# IT Carol 80000 95
# Dave 90000 87

# ── from_tuples ───────────────────────────────────────────────────────────────
# One (Dept, Name) tuple per row — equivalent to from_arrays above.

tuples = [("HR", "Alice"), ("HR", "Bob"), ("IT", "Carol"), ("IT", "Dave")]
index2 = pd.MultiIndex.from_tuples(tuples, names=["Dept", "Name"])
print(index2)
# MultiIndex([('HR', 'Alice'), ('HR', 'Bob'), ('IT', 'Carol'), ('IT', 'Dave')],
# names=['Dept', 'Name'])

# ── from_product — all combinations ──────────────────────────────────────────
# Cartesian product: every Dept paired with every Quarter (2 × 2 = 4 entries).

index3 = pd.MultiIndex.from_product([["HR", "IT"], ["Q1", "Q2"]],
names=["Dept", "Quarter"])
print(index3)
# MultiIndex([('HR', 'Q1'), ('HR', 'Q2'), ('IT', 'Q1'), ('IT', 'Q2')],
# names=['Dept', 'Quarter'])

# ── Accessing MultiIndex data ─────────────────────────────────────────────────

print(df_mi.loc["HR"]) # all rows where Dept = HR (partial lookup on the OUTER level)
# Salary Score
# Name
# Alice 50000 88
# Bob 55000 92

print(df_mi.loc[("HR", "Alice")]) # specific row by both levels → Series
# Salary 50000
# Score 88

print(df_mi.loc["IT", "Salary"]) # IT dept, Salary column only
# Name
# Carol 80000
# Dave 90000

# ── xs — cross-section selector ───────────────────────────────────────────────
# xs() can select by value on ANY level, not just the outermost one.

print(df_mi.xs("Alice", level="Name")) # select across level by value
# Salary Score
# Dept
# HR 50000 88

# ── MultiIndex on columns ─────────────────────────────────────────────────────
# Columns can be hierarchical too: Year on the outer level, Quarter inner.

col_index = pd.MultiIndex.from_tuples([("2024", "Q1"), ("2024", "Q2"),
("2025", "Q1"), ("2025", "Q2")],
names=["Year", "Quarter"])
df_col = pd.DataFrame([[100, 110, 120, 130],
[200, 210, 220, 230]],
index=["Alice", "Bob"],
columns=col_index)
print(df_col)
# Year 2024 2025
# Quarter Q1 Q2 Q1 Q2
# Alice 100 110 120 130
# Bob 200 210 220 230

print(df_col["2024"]) # select year 2024 columns only (outer-level lookup)
# Quarter Q1 Q2
# Alice 100 110
# Bob 200 210

# ── GroupBy with MultiIndex ───────────────────────────────────────────────────
# level= groups by an index level instead of a column.

print(df_mi.groupby(level="Dept")["Salary"].mean())
# Dept
# HR 52500.0
# IT 85000.0

print(df_mi.groupby(level="Dept")["Score"].sum())
# Dept
# HR 180
# IT 182

# ── reset_index — flatten MultiIndex back to columns ─────────────────────────

print(df_mi.reset_index())
# Dept Name Salary Score
# 0 HR Alice 50000 88
# 1 HR Bob 55000 92
# 2 IT Carol 80000 95
# 3 IT Dave 90000 87

# ── stack / unstack — pivot between row and column levels ────────────────────
# NOTE(review): pandas ≥2.1 warns that stack() behavior will change
# (future_stack) — confirm the target pandas version before relying on it.

df_us = df_mi.unstack(level="Name") # moves Name level to columns
print(df_us)
# Salary Score
# Name Alice Bob Carol Dave Alice Bob Carol Dave
# Dept
# HR 50000 55000 NaN NaN 88 92 NaN NaN
# IT NaN NaN 80000 90000 NaN NaN 95 87

df_st = df_us.stack(level="Name") # moves Name back to rows
print(df_st) # restores original MultiIndex shape

# ── 20. zip() — Python Built-in ───────────────────────────────────────────────
# zip() pairs elements from multiple iterables together, index by index
# Returns an iterator of tuples — stops at the shortest iterable

# ── Basic zip ─────────────────────────────────────────────────────────────────

names = ["Alice", "Bob", "Carol"]
ages = [30, 25, 28]
scores = [90, 85, 92]

# zip() is a lazy iterator — list() materializes it (and exhausts it).
zipped = zip(names, ages)
print(list(zipped))  # [('Alice', 30), ('Bob', 25), ('Carol', 28)]

# zip works across any number of iterables, pairing items index by index.
for person in zip(names, ages, scores):
    name, age, score = person
    print(f"{name} | Age: {age} | Score: {score}")

# Pairing keys with values is a one-liner way to build a dict.
keys = ["Name", "Age", "Score"]
values = ["Alice", 30, 90]

d = {k: v for k, v in zip(keys, values)}
print(d)  # {'Name': 'Alice', 'Age': 30, 'Score': 90}

# A zip of row tuples feeds straight into a DataFrame.
df_zip = pd.DataFrame(zip(names, ages, scores), columns=["Name", "Age", "Score"])
print(df_zip)

# zip(*...) inverts the operation: list of pairs → tuple per position.
pairs = [("Alice", 90), ("Bob", 85), ("Carol", 92)]
names_out, scores_out = zip(*pairs)
print(names_out)   # ('Alice', 'Bob', 'Carol')
print(scores_out)  # (90, 85, 92)

# zip stops at the SHORTEST iterable...
a = [1, 2, 3, 4]
b = ["x", "y"]
print(list(zip(a, b)))  # [(1, 'x'), (2, 'y')] → stops at 2

# ...while zip_longest pads the shorter one with a fill value.
from itertools import zip_longest
print(list(zip_longest(a, b, fillvalue="-")))  # [(1,'x'), (2,'y'), (3,'-'), (4,'-')]

# ── 21. pivot_table ───────────────────────────────────────────────────────────
# Summarize data like an Excel pivot — group, aggregate, and reshape in one step
# Syntax: df.pivot_table(values, index, columns, aggfunc, fill_value)

df_pt = pd.DataFrame({
"Dept": ["HR", "IT", "HR", "IT", "HR", "IT"],
"Level": ["Junior", "Junior", "Senior", "Senior", "Junior", "Senior"],
"Gender": ["F", "M", "F", "M", "M", "F"],
"Salary": [50000, 80000, 70000, 90000, 52000, 88000]
})

# ── Basic pivot_table ─────────────────────────────────────────────────────────
# values → column to aggregate
# index → rows
# aggfunc → how to aggregate (default: mean)

print(df_pt.pivot_table(values="Salary", index="Dept"))
# Dept
# HR 57333.33 ← mean salary for HR
# IT 86000.00 ← mean salary for IT

# ── index + columns → 2D cross-table ─────────────────────────────────────────
# columns → spread unique values across columns

print(df_pt.pivot_table(values="Salary", index="Dept", columns="Level"))
# Level Junior Senior
# Dept
# HR 51000.0 70000.0
# IT 80000.0 89000.0

# ── aggfunc — change the aggregation function ─────────────────────────────────

print(df_pt.pivot_table(values="Salary", index="Dept", aggfunc="sum"))
# Dept
# HR 172000 ← total salary for HR
# IT 258000 ← total salary for IT

print(df_pt.pivot_table(values="Salary", index="Dept", aggfunc="count"))
# Dept
# HR 3
# IT 3

# a list of aggfuncs produces one column group per function
print(df_pt.pivot_table(values="Salary", index="Dept", aggfunc=["min","max","mean"]))
# min max mean
# Dept
# HR 50000 70000 57333.333333
# IT 80000 90000 86000.000000

# ── fill_value — replace NaN in result ───────────────────────────────────────

print(df_pt.pivot_table(values="Salary", index="Dept",
columns="Gender", aggfunc="mean", fill_value=0))
# Gender F M
# Dept
# HR 60000.0 52000.0
# IT 88000.0 85000.0

# ── margins — add row/column totals ──────────────────────────────────────────
# margins=True appends an "All" row and column holding the aggfunc
# computed over each column / row / the whole table.

print(df_pt.pivot_table(values="Salary", index="Dept",
columns="Level", aggfunc="mean", margins=True))
# Level Junior Senior All
# Dept
# HR 51000.000000 70000.000000 57333.333333
# IT 80000.000000 89000.000000 86000.000000
# All 60666.666667 82666.666667 71666.666667

# ── pivot_table vs groupby ────────────────────────────────────────────────────
# groupby → returns Series/DataFrame, flexible, multi-step
# pivot_table → returns 2D cross-table, Excel-like, single step

# groupby equivalent of pivot_table above:
print(df_pt.groupby(["Dept", "Level"])["Salary"].mean().unstack())
# Level Junior Senior
# Dept
# HR 51000.0 70000.0
# IT 80000.0 89000.0

# Please comment below to give feedback or ask questions.