Linear regression Mohammed Alnoor

import pandas as pd
import numpy as np
import statsmodels.api as sm

# Load the dataset

# NOTE(review): 'your_dataset.csv' looks like a placeholder path — point it at
# the real CSV file before running.
data = pd.read_csv('your_dataset.csv')

# Define the independent and dependent variables

# X holds the candidate predictor columns (list selection keeps a DataFrame);
# y is the single response column.
# NOTE(review): the column names look like placeholders — replace them with
# the actual column names of the dataset.
X = data[['independent_var1', 'independent_var2', 'independent_var3']]
y = data['dependent_var']

# Stepwise regression using statsmodels

def stepwise_selection(X, y,
                       initial_list=None,
                       threshold_in=0.01,
                       threshold_out=0.05,
                       verbose=True):
    """Select features by forward-backward stepwise OLS regression.

    Each pass first tries to add the not-yet-included column whose
    coefficient has the smallest p-value (if it is below ``threshold_in``),
    then tries to drop the included column with the largest p-value (if it
    is above ``threshold_out``). The loop ends after a pass that neither
    adds nor drops a column.

    Relies on the module-level imports ``pd`` (pandas) and ``sm``
    (``statsmodels.api``).

    Args:
        X: pandas DataFrame holding the candidate feature columns.
        y: Response values, passed to ``sm.OLS`` as the dependent variable.
        initial_list: Optional iterable of column labels to start with. It
            is copied, never mutated. ``None`` starts with no features.
        threshold_in: A column is added when its p-value is strictly below
            this value.
        threshold_out: A column is dropped when its p-value is strictly
            above this value.
        verbose: When true, print every add/drop decision.

    Returns:
        list: Labels of the selected columns (initial ones first, then in
        the order they were added).

    Note:
        Keep ``threshold_in < threshold_out``. Otherwise a column can be
        added and dropped again on every pass and the loop may never end.
    """
    included = [] if initial_list is None else list(initial_list)
    while True:
        changed = False

        # Forward step: fit one model per excluded column and record the
        # p-value of that column's coefficient. Candidates keep X's column
        # order so that ties are broken reproducibly.
        excluded = [col for col in X.columns if col not in included]
        if excluded:
            # Explicit dtype avoids the dtype-less empty Series default.
            new_pval = pd.Series(index=excluded, dtype=float)
            for new_column in excluded:
                design = sm.add_constant(X[included + [new_column]])
                model = sm.OLS(y, design).fit()
                new_pval[new_column] = model.pvalues[new_column]
            best_pval = new_pval.min()
            if best_pval < threshold_in:
                best_feature = new_pval.idxmin()
                included.append(best_feature)
                changed = True
                if verbose:
                    print('Add {:30} with p-value {:.6}'.format(best_feature, best_pval))

        # Backward step: refit with everything included so far and drop the
        # least significant column. Skipped when nothing is included, since
        # there is nothing to drop.
        if included:
            model = sm.OLS(y, sm.add_constant(X[included])).fit()
            # Select by label so the intercept is excluded regardless of
            # whether add_constant actually prepended one.
            pvalues = model.pvalues[included]
            worst_pval = pvalues.max()
            if worst_pval > threshold_out:
                changed = True
                worst_feature = pvalues.idxmax()
                included.remove(worst_feature)
                if verbose:
                    print('Drop {:30} with p-value {:.6}'.format(worst_feature, worst_pval))

        if not changed:
            break
    return included

# Run the selection with the default thresholds and print the chosen columns.
result = stepwise_selection(X, y)

print('resulting features:')
print(result)

Do you have a question?

Is this code from one of the course assignments?