# Python Pandas DataFrames cheatsheet

# Core imports used throughout this cheatsheet.
# (Original line had a stray leading space before `import pandas`,
# which raised an IndentationError and made the file unrunnable.)
import pandas as pd
import numpy as np

# ═══════════════════════════════════════════════════════════════════════════════
# PANDAS DATAFRAME CHEATSHEET
# ═══════════════════════════════════════════════════════════════════════════════

# ── CREATING ──────────────────────────────────────────────────────────────────
pd.DataFrame({"A": [1, 2], "B": [3, 4]})                      # from dict → most common
pd.DataFrame([{"A": 1, "B": 3}, {"A": 2, "B": 4}])            # from list of dicts
pd.DataFrame(np.array([[1, 2], [3, 4]]), columns=["A", "B"])  # from NumPy array
pd.DataFrame({"A": [1, 2]}, index=["r1", "r2"])               # custom row index

# ── ATTRIBUTES ────────────────────────────────────────────────────────────────
# df.shape → (rows, cols)
# df.ndim → 2
# df.size → total elements
# df.dtypes → dtype per column
# df.columns → column labels
# df.index → row labels

# ── VIEWING ───────────────────────────────────────────────────────────────────
# df.head(n) → first n rows
# df.tail(n) → last n rows
# df.info() → dtypes + non-null counts
# df.describe() → count, mean, std, min, max

# ── SELECTING ─────────────────────────────────────────────────────────────────
# df["col"] → single column (Series)
# df[["col1","col2"]] → multiple columns (DataFrame)
# df.iloc[0] → row by position
# df.iloc[0:2] → rows by position slice
# df.loc[0] → row by label
# df.loc[0:1] → rows by label (inclusive)
# df.iloc[0, 1] → cell by position
# df.loc[0, "col"] → cell by label

# ── FILTERING ─────────────────────────────────────────────────────────────────
# df[df["Age"] > 26] → single condition
# df[(df["Age"] > 26) & (df["Score"] >= 90)] → multiple conditions (& = AND, | = OR)

# ── ADD / REMOVE COLUMNS ──────────────────────────────────────────────────────
# df["new"] = [...] → add column
# df["pass"] = df["Score"] >= 90 → derived column
# df.drop(columns=["col"], inplace=True) → remove column

# ── SORTING ───────────────────────────────────────────────────────────────────
# df.sort_values("col") → ascending
# df.sort_values("col", ascending=False) → descending

# ── MISSING VALUES ────────────────────────────────────────────────────────────
# df.isnull() → True where NaN
# df.isnull().sum() → count of NaN per column
# df.dropna() → drop rows with any NaN
# df.dropna(axis=1) → drop columns with any NaN
# df.dropna(how="all") → drop rows where ALL values are NaN
# df.dropna(thresh=2) → keep rows with at least 2 non-NaN values
# df.fillna(0) → fill NaN with 0
# df.fillna("") → fill NaN with empty string
# df.fillna("N/A") → fill NaN with custom label
# df.fillna({"col": val}) → fill per column
# df.ffill() → forward fill from previous row (fillna(method="ffill") is deprecated)
# df.bfill() → backward fill from next row (fillna(method="bfill") is deprecated)
# df.fillna(df.mean()) → fill with column mean

# ── GROUPBY ───────────────────────────────────────────────────────────────────
# df.groupby("col")["val"].mean() → mean per group
# df.groupby("col")["val"].agg(["min","max"]) → multiple aggregations
# df.groupby("col").size() → row count per group
# df.groupby(["col1","col2"])["val"].mean() → group by multiple columns

# ── UNIQUE ────────────────────────────────────────────────────────────────────
# df["col"].unique() → array of unique values
# df["col"].nunique() → count of unique values
# df.nunique() → unique count per column
# df["col"].value_counts() → frequency of each unique value

# ── MERGING & JOINING ─────────────────────────────────────────────────────────
# pd.merge(left, right) → inner join on common column (default)
# pd.merge(left, right, on="ID", how="inner") → inner : only matching rows
# pd.merge(left, right, on="ID", how="left") → left : all from left + matches
# pd.merge(left, right, on="ID", how="right") → right : all from right + matches
# pd.merge(left, right, on="ID", how="outer") → outer : all rows from both
# pd.merge(left, right, on=["col1","col2"]) → merge on multiple columns
# left.join(right) → join on INDEX (default: left join)

# ── CONCATENATING ─────────────────────────────────────────────────────────────
# pd.concat([df1, df2], ignore_index=True) → stack rows (axis=0)
# pd.concat([df1, df2], axis=1) → stack columns (axis=1)

# ── OPERATIONS ────────────────────────────────────────────────────────────────
# df["col"].apply(func) → apply function to each value
# df[cols].apply(func, axis=0) → apply function to each column
# df["col"].map(func) → transform each value in Series
# df.rename(columns={"old":"new"}) → rename columns
# df["col"].astype(float) → change dtype
# df.drop_duplicates() → remove duplicate rows
# df.set_index("col") → make column the index
# df.reset_index() → move index back to column
# df["col"].str.upper() → string methods via .str

# ── PIVOT TABLE ───────────────────────────────────────────────────────────────
# df.pivot_table(values="Salary", index="Dept") → mean per group
# df.pivot_table(values, index, columns="Level") → 2D cross-table
# df.pivot_table(values, index, aggfunc="sum") → sum
# df.pivot_table(values, index, aggfunc=["min","max","mean"]) → multiple aggs
# df.pivot_table(values, index, columns, fill_value=0) → fill NaN
# df.pivot_table(values, index, columns, margins=True) → add totals row/col

# ── MULTIINDEX ────────────────────────────────────────────────────────────────
# pd.MultiIndex.from_arrays([l1, l2], names=["A","B"]) → from lists
# pd.MultiIndex.from_tuples([(a,b),...], names=["A","B"])→ from tuples
# pd.MultiIndex.from_product([l1, l2], names=["A","B"]) → all combinations
# df.loc["HR"] → select level 0
# df.loc[("HR","Alice")] → select both levels
# df.xs("Alice", level="Name") → cross-section by value
# df.unstack("col") → move row level → columns
# df.stack("col") → move column level → rows

# ── DATA I/O ──────────────────────────────────────────────────────────────────
# df.to_csv("file.csv", index=False) → save CSV
# pd.read_csv("file.csv") → load CSV
# pd.read_csv("file.csv", usecols=["A","B"]) → load specific columns
# pd.read_csv("file.csv", nrows=100) → load first N rows
# df.to_json("file.json", orient="records") → save JSON
# pd.read_json("file.json") → load JSON
# df.to_excel("file.xlsx", index=False) → save Excel (needs openpyxl)
# pd.read_excel("file.xlsx") → load Excel

# ── ZIP (built-in) ────────────────────────────────────────────────────────────
# zip(l1, l2) → pairs elements → iterator of tuples
# list(zip(l1, l2)) → [(a1,b1), (a2,b2), ...]
# dict(zip(keys, values)) → {k:v, ...}
# pd.DataFrame(zip(l1,l2,l3), columns=[...]) → DataFrame from zipped lists
# zip(*pairs) → unzip list of tuples
# itertools.zip_longest(l1, l2, fillvalue="-") → zip with fill for unequal lengths (from itertools)

# End of Pandas DataFrames cheatsheet.