# Python Scikit-Learn — Linear Regression Model

# Scikit-Learn — Python's most popular Machine Learning library

# Used to build models that learn patterns from data and make predictions
# Install: pip install scikit-learn
# Import convention: from sklearn import ...

# ══════════════════════════════════════════════════════════════════════════════
# ── What is Machine Learning (ML)? ───────────────────────────────────────────
# Machine Learning = teaching a computer to learn from data
# Instead of writing rules manually, you give data and the computer finds patterns
#
# Real examples:
# Email spam filter → learns from past emails which are spam or not
# House price → learns from past sales to predict price of a new house
# Loan approval → learns from past data to approve or reject a loan
# ══════════════════════════════════════════════════════════════════════════════

# ══════════════════════════════════════════════════════════════════════════════
# ── 1. Features and Labels ────────────────────────────────────────────────────
# Feature = input columns — the information you give the model to learn from
# Label = output column — the answer you want the model to predict
#
# Example: predict salary based on age and experience
# Features (X) → age, experience ← what we know
# Label (y) → salary ← what we want to predict
# ══════════════════════════════════════════════════════════════════════════════

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, r2_score

# ── Build a toy dataset: 10 employees with age, experience, and salary ─────────
_records = {
    "age":        [22, 25, 28, 30, 33, 35, 38, 40, 43, 45],
    "experience": [1, 3, 5, 7, 9, 11, 14, 16, 19, 21],
    "salary":     [30000, 35000, 42000, 50000, 55000,
                   62000, 70000, 75000, 82000, 90000],
}
df = pd.DataFrame(_records)

# Features (X) are the inputs the model learns from; the label (y) is the
# value we want it to predict. Uppercase X / lowercase y is the convention.
X = df[["age", "experience"]]
y = df["salary"]

print(X.shape)  # (10, 2) -> 10 rows, 2 feature columns
print(y.shape)  # (10,)   -> one label per row

# ── Split: hold out 20% of rows for testing ───────────────────────────────────
# The model never sees the test rows during training, so the test score
# estimates performance on genuinely new data. random_state=42 makes the
# shuffle reproducible: the same rows land in the same split every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

print(len(X_train))  # 8 -> rows used for training
print(len(X_test))   # 2 -> rows used for testing

# Which original row indices landed in each split (order is shuffled).
print(X_train.index.tolist())  # e.g. [5, 0, 7, 2, 9, 4, 3, 6]
print(X_test.index.tolist())   # e.g. [8, 1]

print(X_train)  # age + experience for the 8 training rows
print(X_test)   # age + experience for the 2 test rows

# ── Train: fit a straight-line model on the training rows only ────────────────
model = LinearRegression()
model.fit(X_train, y_train)

# ── Predict: apply the learned pattern to the unseen test rows ────────────────
predictions = model.predict(X_test)
print(predictions)  # array of predicted salaries, one per test row

# Side by side: the model's guesses vs. the true answers it never saw.
print("Predicted:", predictions)
print("Actual: ", list(y_test))

# ── Evaluate ───────────────────────────────────────────────────────────────────
# MAE  = average absolute gap between prediction and truth (lower is better).
# R²   = share of the salary variation the model explains (1.0 is perfect,
#        0 means no pattern learned, negative is worse than guessing the mean).
mae = mean_absolute_error(y_test, predictions)
r2 = r2_score(y_test, predictions)

print(f"MAE : {mae:.0f}")
print(f"R² : {r2:.2f}")

# ── How to read R² ────────────────────────────────────────────────────────────
# R² = 1.00 → perfect predictions
# R² = 0.90 → model explains 90% of the variation in salary — very good
# R² = 0.50 → model explains only 50% — okay
# R² = 0.00 → model learned nothing
# R² < 0.00 → worse than just predicting the average every time

# ══════════════════════════════════════════════════════════════════════════════
# ── Full ML Workflow Summary ──────────────────────────────────────────────────
# ┌──────────────────────────────────────────────────────────────────────────┐
# │ Step 1 Define X (features) and y (label) │
# │ Step 2 Split data → train_test_split(X, y, test_size=0.2) │
# │ Step 3 Choose model → model = LinearRegression() │
# │ Step 4 Train model → model.fit(X_train, y_train) │
# │ Step 5 Predict → predictions = model.predict(X_test) │
# │ Step 6 Evaluate → mean_absolute_error / r2_score │
# └──────────────────────────────────────────────────────────────────────────┘
# ══════════════════════════════════════════════════════════════════════════════

# ══════════════════════════════════════════════════════════════════════════════
# ── Quick Reference ───────────────────────────────────────────────────────────
# ══════════════════════════════════════════════════════════════════════════════

# X = df[["col1","col2"]] → features (input columns)
# y = df["col"] → label (output column to predict)
#
# from sklearn.model_selection import train_test_split
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
# test_size=0.2 → 20% test, 80% train
# random_state=42 → same split every run
#
# from sklearn.linear_model import LinearRegression
# model = LinearRegression() → choose model
# model.fit(X_train, y_train) → train the model
# predictions = model.predict(X_test)→ predict on test data
#
# from sklearn.metrics import mean_absolute_error, r2_score
# mean_absolute_error(y_test, pred) → average prediction error (lower = better)
# r2_score(y_test, pred) → fit quality 0 to 1 (higher = better)

No comments:

Post a Comment

Please comment below to give feedback or ask questions.