Pages

Python Pandas DataFrames

 # A DataFrame is a collection of Series — each column is a Series sharing the same index.

# A DataFrame is a 2D labeled table with rows and columns — like a spreadsheet or SQL table.
# From dictionary — most common way
# From list of dicts — each dict = one row
# From NumPy array — with column names
# Custom index — label rows
# Attributes — shape, dtypes, columns, index
# Viewing — head, tail, info, describe
# Selecting columns — single & multiple
# Selecting rows — iloc, loc
# Specific cell — iloc[row, col], loc[row, col]
# Boolean filtering — single & multiple conditions
# Add/remove columns — derived columns, drop
# Sorting — sort_values ascending/descending
# Missing values — isnull, dropna, fillna
# GroupBy — groupby, agg
import pandas as pd
import numpy as np

# ── 1. Creating a DataFrame from Dictionary ───────────────────────────────────
# Each key becomes a column label; each list supplies that column's values.

df = pd.DataFrame(
    data=dict(
        Name=["Alice", "Bob", "Carol"],
        Age=[30, 25, 28],
        Score=[90, 85, 92],
    )
)
print(df)
# Expected output:
#     Name  Age  Score
# 0  Alice   30     90
# 1    Bob   25     85
# 2  Carol   28     92

# ── 2. Creating from List of Dictionaries ─────────────────────────────────────
# Every dictionary in the list describes one row; its keys name the columns.

df = pd.DataFrame(
    data=[
        dict(Name="Alice", Age=30, Score=90),
        dict(Name="Bob", Age=25, Score=85),
        dict(Name="Carol", Age=28, Score=92),
    ]
)
print(df)
# Expected output:
#     Name  Age  Score
# 0  Alice   30     90
# 1    Bob   25     85
# 2  Carol   28     92

# ── 3. Creating from NumPy Array ──────────────────────────────────────────────
# A 2-D array supplies the cell values; the column labels are passed separately.

arr = np.array(
    [
        [1, 2, 3],
        [4, 5, 6],
        [7, 8, 9],
    ]
)
df = pd.DataFrame(data=arr, columns=list("ABC"))
print(df)
# Expected output:
#    A  B  C
# 0  1  2  3
# 1  4  5  6
# 2  7  8  9

# ── 4. Custom Index ───────────────────────────────────────────────────────────
# The `index` argument replaces the default 0..n-1 row labels with custom ones.

df = pd.DataFrame(
    data=dict(
        Name=["Alice", "Bob", "Carol"],
        Score=[90, 85, 92],
    ),
    index=["r1", "r2", "r3"],
)
print(df)
# Expected output:
#      Name  Score
# r1  Alice     90
# r2    Bob     85
# r3  Carol     92

# ── 5. DataFrame Attributes ───────────────────────────────────────────────────
# Rebuild the three-column frame; the sections that follow keep using this `df`.

df = pd.DataFrame(
    data=dict(
        Name=["Alice", "Bob", "Carol"],
        Age=[30, 25, 28],
        Score=[90, 85, 92],
    )
)

print(df.shape)  # (3, 3) → (rows, cols)
print(df.ndim)  # 2 → number of dimensions
print(df.size)  # 9 → total number of cells (rows × cols)
print(df.dtypes)  # Name object, Age int64, Score int64
print(list(df.columns))  # ['Name', 'Age', 'Score']
print(list(df.index))  # [0, 1, 2]

# ── 6. Viewing Data ───────────────────────────────────────────────────────────
# Quick ways to inspect a frame: first/last rows, structure, summary statistics.

print(df.head(2))  # first 2 rows
#     Name  Age  Score
# 0  Alice   30     90
# 1    Bob   25     85

print(df.tail(2))  # last 2 rows
#     Name  Age  Score
# 1    Bob   25     85
# 2  Carol   28     92

# info() writes its report (column names, non-null counts, dtypes) directly to
# stdout and returns None, so it is called on its own; wrapping it in print()
# would emit an extra "None" line after the report.
df.info()
print(df.describe())  # count, mean, std, min, 25%, 50%, 75%, max

# ── 7. Selecting Columns ──────────────────────────────────────────────────────
# A single label returns a Series; a list of labels returns a DataFrame.

name_series = df["Name"]
print(name_series)  # Series → Alice, Bob, Carol

name_and_score = df[["Name", "Score"]]
print(name_and_score)  # DataFrame with 2 columns

# ── 8. Selecting Rows ─────────────────────────────────────────────────────────
# iloc addresses rows by integer position; loc addresses them by index label.

print(df.iloc[0])  # first row by position
# Name     Alice
# Age         30
# Score       90

print(df.iloc[:2])  # positions 0 and 1 (end of a positional slice is excluded)
print(df.loc[0])  # row whose index label is 0
print(df.loc[:1])  # labels 0 through 1 (end of a label slice is included)

# ── 9. Selecting Specific Cell ────────────────────────────────────────────────
# Pass a (row, column) pair: integer positions for iloc, labels for loc.

age_by_position = df.iloc[0, 1]
print(age_by_position)  # 30 → row 0, col 1 (by position)

age_by_label = df.loc[0, "Age"]
print(age_by_label)  # 30 → row 0, col "Age" (by label)

# ── 10. Boolean Filtering ─────────────────────────────────────────────────────
# A comparison on a column yields a boolean mask; indexing with it keeps the
# rows where the mask is True.

older_than_26 = df["Age"] > 26
print(df[older_than_26])
#     Name  Age  Score
# 0  Alice   30     90
# 2  Carol   28     92

scored_at_least_90 = df["Score"] >= 90
print(df[scored_at_least_90])
#     Name  Age  Score
# 0  Alice   30     90
# 2  Carol   28     92

# Combine masks with & and parenthesise each comparison.
over_24_and_above_85 = (df["Age"] > 24) & (df["Score"] > 85)
print(df[over_24_and_above_85])  # multiple conditions
#     Name  Age  Score
# 0  Alice   30     90
# 2  Carol   28     92

# ── 11. Adding / Removing Columns ─────────────────────────────────────────────
# Assigning to a new label appends a column; drop(columns=...) removes one.

df["Grade"] = ["A", "B", "A"]  # add new column from a list (one value per row)
print(df)
#     Name  Age  Score Grade
# 0  Alice   30     90     A
# 1    Bob   25     85     B
# 2  Carol   28     92     A

passed = df["Score"] >= 90
df["Pass"] = passed  # derived column computed from an existing one
print(df["Pass"])  # 0 True, 1 False, 2 True

df = df.drop(columns=["Grade"])  # remove column; rebind df to the result
print(df.columns.tolist())  # ['Name', 'Age', 'Score', 'Pass']

# ── 12. Sorting ───────────────────────────────────────────────────────────────
# sort_values returns a new, reordered frame and keeps the original row labels.
# At this point df still carries the "Pass" column added in section 11, so it
# appears in the printed output.

print(df.sort_values(by="Age"))  # sort by Age ascending (the default order)
#     Name  Age  Score   Pass
# 1    Bob   25     85  False
# 2  Carol   28     92   True
# 0  Alice   30     90   True

print(df.sort_values(by="Score", ascending=False))  # sort by Score descending
#     Name  Age  Score   Pass
# 2  Carol   28     92   True
# 0  Alice   30     90   True
# 1    Bob   25     85  False

# ── 13. Handling Missing Values ───────────────────────────────────────────────
# None entries in the numeric columns below are the missing values to handle.

df2 = pd.DataFrame(
    data=dict(
        Name=["Alice", "Bob", "Carol"],
        Age=[30, None, 28],
        Score=[90, 85, None],
    )
)

missing_mask = df2.isnull()
print(missing_mask)  # True where value is missing
print(missing_mask.sum())  # Age 1, Score 1 → count of NaN per column

print(df2.dropna())  # remove rows with any NaN
print(df2.fillna(value=0))  # replace NaN with 0

column_means = df2.mean(numeric_only=True)  # per-column mean of numeric columns
print(df2.fillna(value=column_means))  # replace NaN with column mean

# ── 14. Aggregate & GroupBy ───────────────────────────────────────────────────
# Group the rows by department, then summarise each group's salaries.

df3 = pd.DataFrame(
    data=dict(
        Dept=["HR", "IT", "HR", "IT", "HR"],
        Name=["Alice", "Bob", "Carol", "Dave", "Eve"],
        Salary=[50000, 80000, 55000, 90000, 52000],
    )
)

salary_by_dept = df3.groupby("Dept")["Salary"]

print(salary_by_dept.mean())  # one mean per department
# Dept
# HR    52333.333333
# IT    85000.000000

print(salary_by_dept.agg(["min", "max", "mean"]))  # several statistics at once
#         min    max          mean
# Dept
# HR    50000  55000  52333.333333
# IT    80000  90000  85000.000000

No comments:

Post a Comment

Please comment below to give feedback or ask questions.