Keep Learning: Pandas Data Input and Output

# Pandas Data Input and Output
# Reading and writing data between DataFrames and external sources (CSV, JSON, Excel, SQL)

import pandas as pd

# ── Sample DataFrame used throughout ─────────────────────────────────────────

df = pd.DataFrame({
    "Name":   ["Alice", "Bob", "Carol", "Dave"],
    "Age":    [30, 25, 28, 35],
    "Dept":   ["HR", "IT", "HR", "IT"],
    "Salary": [50000, 80000, 55000, 90000]
})

# ══════════════════════════════════════════════════════════════════════════════
# ── 1. CSV ────────────────────────────────────────────────────────────────────
# Most common format — comma separated values
# ══════════════════════════════════════════════════════════════════════════════

# ── Write to CSV ──────────────────────────────────────────────────────────────
df.to_csv("employees.csv", index=False)         # index=False skips row numbers
# creates employees.csv:
# Name,Age,Dept,Salary
# Alice,30,HR,50000
# Bob,25,IT,80000
# ...

df.to_csv("employees_idx.csv", index=True)      # index=True includes row numbers (default)
# 0,Alice,30,HR,50000
# 1,Bob,25,IT,80000

df.to_csv("employees.csv", sep="|", index=False)  # custom delimiter
# Name|Age|Dept|Salary

df[["Name","Salary"]].to_csv("names.csv", index=False)  # save specific columns only

# ── Read from CSV ─────────────────────────────────────────────────────────────
df.to_csv("employees.csv", index=False)         # reset to comma for examples below

df_csv = pd.read_csv("employees.csv")           # basic read
print(df_csv)
#     Name  Age Dept  Salary
# 0  Alice   30   HR   50000
# 1    Bob   25   IT   80000
# 2  Carol   28   HR   55000
# 3   Dave   35   IT   90000

# ── read_csv options ──────────────────────────────────────────────────────────
pd.read_csv("employees.csv", sep=",")           # delimiter (default comma)
pd.read_csv("employees.csv", header=0)          # row 0 as column names (default)
pd.read_csv("employees.csv", header=None)       # no header row — uses 0,1,2...
pd.read_csv("employees.csv", usecols=["Name","Salary"])  # load specific columns only
pd.read_csv("employees.csv", nrows=2)           # load first 2 rows only
pd.read_csv("employees.csv", skiprows=1)        # skip first row
pd.read_csv("employees.csv", index_col="Name")  # set Name as index
pd.read_csv("employees.csv", na_values=["NA","-","?"])  # treat as NaN
pd.read_csv("employees.csv", dtype={"Age": float})      # force dtype on column

df_custom = pd.read_csv("employees.csv", usecols=["Name","Salary"], nrows=2)
print(df_custom)
#     Name  Salary
# 0  Alice   50000
# 1    Bob   80000

# ── Inspect after loading ─────────────────────────────────────────────────────
print(df_csv.shape)                             # (4, 4)
print(df_csv.dtypes)                            # Name object, Age int64 ...
print(df_csv.head(2))                           # first 2 rows
print(df_csv.info())                            # column info + non-null count

# ══════════════════════════════════════════════════════════════════════════════
# ── 2. JSON ───────────────────────────────────────────────────────────────────
# JavaScript Object Notation — common in APIs and web data
# ══════════════════════════════════════════════════════════════════════════════

# ── Write to JSON ─────────────────────────────────────────────────────────────
df.to_json("employees.json", orient="records")  # list of row dicts (most readable)
# [{"Name":"Alice","Age":30,"Dept":"HR","Salary":50000},...]

df.to_json("employees.json", orient="records", indent=2)  # pretty-printed
df.to_json("cols.json", orient="columns")       # dict of column dicts
df.to_json("index.json", orient="index")        # dict keyed by row index

# ── Read from JSON ────────────────────────────────────────────────────────────
df_json = pd.read_json("employees.json")
print(df_json)
#     Name  Age Dept  Salary
# 0  Alice   30   HR   50000
# 1    Bob   25   IT   80000
# 2  Carol   28   HR   55000
# 3   Dave   35   IT   90000

# ── orient options for read ───────────────────────────────────────────────────
# pd.read_json("file.json", orient="records")  # list of row dicts
# pd.read_json("file.json", orient="columns")  # dict of column dicts
# pd.read_json("file.json", orient="index")    # dict keyed by index

# ── From JSON string directly ─────────────────────────────────────────────────
json_str = '[{"Name":"Alice","Age":30},{"Name":"Bob","Age":25}]'
df_from_str = pd.read_json(json_str)
print(df_from_str)
#     Name  Age
# 0  Alice   30
# 1    Bob   25

# ══════════════════════════════════════════════════════════════════════════════
# ── 3. Excel ──────────────────────────────────────────────────────────────────
# requires: pip install openpyxl
# ══════════════════════════════════════════════════════════════════════════════

# ── Write to Excel ────────────────────────────────────────────────────────────
# df.to_excel("employees.xlsx", index=False)                    # single sheet
# df.to_excel("employees.xlsx", index=False, sheet_name="Data") # named sheet

# ── Multiple sheets ───────────────────────────────────────────────────────────
# hr = df[df["Dept"] == "HR"]
# it = df[df["Dept"] == "IT"]
# with pd.ExcelWriter("departments.xlsx") as writer:
#     hr.to_excel(writer, sheet_name="HR", index=False)
#     it.to_excel(writer, sheet_name="IT", index=False)

# ── Read from Excel ───────────────────────────────────────────────────────────
# df_xl = pd.read_excel("employees.xlsx")
# df_xl = pd.read_excel("employees.xlsx", sheet_name="Data")
# df_xl = pd.read_excel("employees.xlsx", usecols=["Name","Salary"])
# df_xl = pd.read_excel("employees.xlsx", nrows=2)

# ── Read all sheets at once ───────────────────────────────────────────────────
# all_sheets = pd.read_excel("departments.xlsx", sheet_name=None)
# all_sheets["HR"]   → DataFrame for HR sheet
# all_sheets["IT"]   → DataFrame for IT sheet

# ══════════════════════════════════════════════════════════════════════════════
# ── 4. SQL ────────────────────────────────────────────────────────────────────
# requires: pip install sqlalchemy
# ══════════════════════════════════════════════════════════════════════════════

import sqlite3

conn = sqlite3.connect(":memory:")              # in-memory SQLite database

# ── Write to SQL ──────────────────────────────────────────────────────────────
df.to_sql("employees", conn, index=False, if_exists="replace")
# if_exists options:
#   "fail"    → raise error if table exists
#   "replace" → drop table and recreate
#   "append"  → add rows to existing table

# ── Read from SQL ─────────────────────────────────────────────────────────────
df_sql = pd.read_sql("SELECT * FROM employees", conn)
print(df_sql)
#     Name  Age Dept  Salary
# 0  Alice   30   HR   50000
# 1    Bob   25   IT   80000
# 2  Carol   28   HR   55000
# 3   Dave   35   IT   90000

df_filtered = pd.read_sql("SELECT Name, Salary FROM employees WHERE Dept='IT'", conn)
print(df_filtered)
#   Name  Salary
# 0  Bob   80000
# 1 Dave   90000

df_query = pd.read_sql_query("SELECT * FROM employees WHERE Age > 28", conn)
print(df_query)
#     Name  Age Dept  Salary
# 0  Alice   30   HR   50000
# 1   Dave   35   IT   90000

conn.close()

# ══════════════════════════════════════════════════════════════════════════════
# ── 5. Clipboard ──────────────────────────────────────────────────────────────
# Read/write from system clipboard (useful for quick copy-paste from Excel)
# ══════════════════════════════════════════════════════════════════════════════

# df.to_clipboard(index=False)   # copy to clipboard
# pd.read_clipboard()            # paste from clipboard into DataFrame

# ══════════════════════════════════════════════════════════════════════════════
# ── 6. Common I/O Options Summary ────────────────────────────────────────────
# ══════════════════════════════════════════════════════════════════════════════

# ┌─────────────┬──────────────────────────────┬──────────────────────────┐
# │  Format     │  Read                        │  Write                   │
# ├─────────────┼──────────────────────────────┼──────────────────────────┤
# │  CSV        │  pd.read_csv("f.csv")        │  df.to_csv("f.csv")      │
# │  JSON       │  pd.read_json("f.json")      │  df.to_json("f.json")    │
# │  Excel      │  pd.read_excel("f.xlsx")     │  df.to_excel("f.xlsx")   │
# │  SQL        │  pd.read_sql("SELECT...",con)│  df.to_sql("table", con) │
# │  Clipboard  │  pd.read_clipboard()         │  df.to_clipboard()       │
# └─────────────┴──────────────────────────────┴──────────────────────────┘

# ── Shared useful options ─────────────────────────────────────────────────────
# index=False       → don't write row numbers
# usecols=[...]     → load specific columns only
# nrows=N           → load first N rows
# dtype={"col":type}→ force data type on column
# na_values=[...]   → treat these as NaN
# header=None       → no header row in file
# index_col="col"   → use column as row index
Keep Learning

Pages

Pandas Data Input and Output

No comments:

Post a Comment

Follow In

Translate

Thank for your visit

About Me