I am stuck in linear regression. When I inserted 2 to 3 values on my own in X and Y, it worked well. But when I trained the dataset from Kaggle, it gave w, b, and cost function values as NaN. Could somebody please guide me on what I am doing wrong? I have attached the dataset and provided the code.
from google.colab import files
uploaded = files.upload()
import pandas as pd
# Read the uploaded file
df = pd.read_csv('Study_vs_Score_data.csv')
# View top rows to confirm
print(df.head())
X = df['Attendance_Hours']
y = df['Final_Marks']
print(X.head())
print(y.head())
# Drop rows with NaN
df = df.dropna()
# Convert columns to numeric
df['Attendance_Hours'] = pd.to_numeric(df['Attendance_Hours'], errors='coerce')
df['Final_Marks'] = pd.to_numeric(df['Final_Marks'], errors='coerce')
# Drop any rows that still have NaN after conversion
df = df.dropna()
# Import required libraries
import numpy as np
import matplotlib.pyplot as plt
# Step 1: Prepare the data
# Let's create simple data for example
# X: input feature, y: target output
X = df['Attendance_Hours']
y = df['Final_Marks'] # Here, y = 2 * X
# Number of data points
m = len(X)
print(f"Number of rows are: {m}")
# Step 2: Initialize parameters (weight and bias)
w = 0 # initial slope
b = 0 # initial intercept
# Step 3: Set learning rate and number of iterations
learning_rate = 0.01
iterations = 1000
# Step 4: Perform Gradient Descent
for i in range(iterations):
# Calculate the predicted value
y_pred = w * X + b
# Calculate cost (mean squared error)
cost = (1/m) * sum((y_pred - y) ** 2)
# Calculate gradients
dw = (2/m) * sum((y_pred - y) * X)
db = (2/m) * sum(y_pred - y)
# Update parameters
w = w - learning_rate * dw
b = b - learning_rate * db
# Print cost every 100 iterations
if i % 100 == 0:
print(f"Iteration {i}, Learning_rate: {learning_rate}, Cost: {cost:.4f}, w: {w:.4f}, b: {b:.4f}")
# Step 5: Print final results
print(f"\nFinal weight (w): {w}")
print(f"Final bias (b): {b}")
# Step 6: Plot the data and regression line
plt.scatter(X, y, color='blue', label='Actual Data')
plt.plot(X, w*X + b, color='red', label='Regression Line')
plt.xlabel('X')
plt.ylabel('y')
plt.legend()
plt.show()
[Study_vs_Score_data.csv|attachment](upload://gdslqmRLWmDcXP2IdBTMo7CEd04.csv) (35.5 KB)
[Study_vs_Score_data.csv|attachment](upload://gdslqmRLWmDcXP2IdBTMo7CEd04.csv) (35.5 KB)