# Python Pandas DataFrames cheatsheet

# Core imports used throughout this cheatsheet.
# (Original line had a stray leading space before `import pandas`,
# which raised an IndentationError and made the file unrunnable.)
import pandas as pd
import numpy as np

# ═══════════════════════════════════════════════════════════════════════════════
# PANDAS DATAFRAME CHEATSHEET
# ═══════════════════════════════════════════════════════════════════════════════

# ── CREATING ──────────────────────────────────────────────────────────────────
pd.DataFrame({"A": [1, 2], "B": [3, 4]})                      # from dict → most common
pd.DataFrame([{"A": 1, "B": 3}, {"A": 2, "B": 4}])            # from list of dicts
pd.DataFrame(np.array([[1, 2], [3, 4]]), columns=["A", "B"])  # from NumPy array
pd.DataFrame({"A": [1, 2]}, index=["r1", "r2"])               # custom row index

# ── ATTRIBUTES ────────────────────────────────────────────────────────────────
# df.shape → (rows, cols)
# df.ndim → 2
# df.size → total elements
# df.dtypes → dtype per column
# df.columns → column labels
# df.index → row labels

# ── VIEWING ───────────────────────────────────────────────────────────────────
# df.head(n) → first n rows
# df.tail(n) → last n rows
# df.info() → dtypes + non-null counts
# df.describe() → count, mean, std, min, max

# ── SELECTING ─────────────────────────────────────────────────────────────────
# df["col"] → single column (Series)
# df[["col1","col2"]] → multiple columns (DataFrame)
# df.iloc[0] → row by position
# df.iloc[0:2] → rows by position slice
# df.loc[0] → row by label
# df.loc[0:1] → rows by label (inclusive)
# df.iloc[0, 1] → cell by position
# df.loc[0, "col"] → cell by label

# ── FILTERING ─────────────────────────────────────────────────────────────────
# df[df["Age"] > 26] → single condition
# df[(df["Age"] > 26) & (df["Score"] >= 90)] → multiple conditions (& = AND, | = OR)

# ── ADD / REMOVE COLUMNS ──────────────────────────────────────────────────────
# df["new"] = [...] → add column
# df["pass"] = df["Score"] >= 90 → derived column
# df.drop(columns=["col"], inplace=True) → remove column

# ── SORTING ───────────────────────────────────────────────────────────────────
# df.sort_values("col") → ascending
# df.sort_values("col", ascending=False) → descending

# ── MISSING VALUES ────────────────────────────────────────────────────────────
# df.isnull() → True where NaN
# df.isnull().sum() → count of NaN per column
# df.dropna() → drop rows with any NaN
# df.dropna(axis=1) → drop columns with any NaN
# df.dropna(how="all") → drop rows where ALL values are NaN
# df.dropna(thresh=2) → keep rows with at least 2 non-NaN values
# df.fillna(0) → fill NaN with 0
# df.fillna("") → fill NaN with empty string
# df.fillna("N/A") → fill NaN with custom label
# df.fillna({"col": val}) → fill per column
# df.ffill() → forward fill from previous row (fillna(method="ffill") is deprecated)
# df.bfill() → backward fill from next row (fillna(method="bfill") is deprecated)
# df.fillna(df.mean()) → fill with column mean

# ── GROUPBY ───────────────────────────────────────────────────────────────────
# df.groupby("col")["val"].mean() → mean per group
# df.groupby("col")["val"].agg(["min","max"]) → multiple aggregations
# df.groupby("col").size() → row count per group
# df.groupby(["col1","col2"])["val"].mean() → group by multiple columns

# ── UNIQUE ────────────────────────────────────────────────────────────────────
# df["col"].unique() → array of unique values
# df["col"].nunique() → count of unique values
# df.nunique() → unique count per column
# df["col"].value_counts() → frequency of each unique value

# ── MERGING & JOINING ─────────────────────────────────────────────────────────
# pd.merge(left, right) → inner join on common column (default)
# pd.merge(left, right, on="ID", how="inner") → inner : only matching rows
# pd.merge(left, right, on="ID", how="left") → left : all from left + matches
# pd.merge(left, right, on="ID", how="right") → right : all from right + matches
# pd.merge(left, right, on="ID", how="outer") → outer : all rows from both
# pd.merge(left, right, on=["col1","col2"]) → merge on multiple columns
# left.join(right) → join on INDEX (default: left join)

# ── CONCATENATING ─────────────────────────────────────────────────────────────
# pd.concat([df1, df2], ignore_index=True) → stack rows (axis=0)
# pd.concat([df1, df2], axis=1) → stack columns (axis=1)

# ── OPERATIONS ────────────────────────────────────────────────────────────────
# df["col"].apply(func) → apply function to each value
# df[cols].apply(func, axis=0) → apply function to each column
# df["col"].map(func) → transform each value in Series
# df.rename(columns={"old":"new"}) → rename columns
# df["col"].astype(float) → change dtype
# df.drop_duplicates() → remove duplicate rows
# df.set_index("col") → make column the index
# df.reset_index() → move index back to column
# df["col"].str.upper() → string methods via .str

# ── PIVOT TABLE ───────────────────────────────────────────────────────────────
# df.pivot_table(values="Salary", index="Dept") → mean per group
# df.pivot_table(values, index, columns="Level") → 2D cross-table
# df.pivot_table(values, index, aggfunc="sum") → sum
# df.pivot_table(values, index, aggfunc=["min","max","mean"]) → multiple aggs
# df.pivot_table(values, index, columns, fill_value=0) → fill NaN
# df.pivot_table(values, index, columns, margins=True) → add totals row/col

# ── MULTIINDEX ────────────────────────────────────────────────────────────────
# pd.MultiIndex.from_arrays([l1, l2], names=["A","B"]) → from lists
# pd.MultiIndex.from_tuples([(a,b),...], names=["A","B"])→ from tuples
# pd.MultiIndex.from_product([l1, l2], names=["A","B"]) → all combinations
# df.loc["HR"] → select level 0
# df.loc[("HR","Alice")] → select both levels
# df.xs("Alice", level="Name") → cross-section by value
# df.unstack("col") → move row level → columns
# df.stack("col") → move column level → rows

# ── DATA I/O ──────────────────────────────────────────────────────────────────
# df.to_csv("file.csv", index=False) → save CSV
# pd.read_csv("file.csv") → load CSV
# pd.read_csv("file.csv", usecols=["A","B"]) → load specific columns
# pd.read_csv("file.csv", nrows=100) → load first N rows
# df.to_json("file.json", orient="records") → save JSON
# pd.read_json("file.json") → load JSON
# df.to_excel("file.xlsx", index=False) → save Excel (needs openpyxl)
# pd.read_excel("file.xlsx") → load Excel

# ── ZIP (built-in) ────────────────────────────────────────────────────────────
# zip(l1, l2) → pairs elements → iterator of tuples
# list(zip(l1, l2)) → [(a1,b1), (a2,b2), ...]
# dict(zip(keys, values)) → {k:v, ...}
# pd.DataFrame(zip(l1,l2,l3), columns=[...]) → DataFrame from zipped lists
# zip(*pairs) → unzip list of tuples
# itertools.zip_longest(l1, l2, fillvalue="-") → zip with fill for unequal lengths (from itertools)

# End of Pandas DataFrames cheatsheet.