Econometrics: Methods and Applications, Erasmus University Rotterdam
https://www.coursera.org/learn/erasmus-econometrics/home/welcome
(a) Use dataset TrainExer21 to regress log-wage on a constant and the gender dummy ‘Female’, and check the result presented in Lecture 2.1 that
log(Wage) = 4.73 - 0.25Female + e.
(b) Let e be the series of residuals of the regression in part (a). Perform two regressions:
(i) e on a constant and education;
(ii) e on a constant and the part-time job dummy.
(c) Comment on the outcomes of regressions (i) and (ii) of part (b).
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import statsmodels.api as sm
TrainExer21 = pd.read_csv('TrainExer21.txt', sep='\t', header=0, index_col=0)
TrainExer21.head()
| Observ. | Wage | LogWage | Female | Age | Educ | Parttime |
|---|---|---|---|---|---|---|
| 1 | 66 | 4.190 | 0 | 49 | 1 | 1 |
| 2 | 34 | 3.526 | 1 | 42 | 1 | 1 |
| 3 | 70 | 4.248 | 1 | 42 | 1 | 1 |
| 4 | 47 | 3.850 | 0 | 38 | 1 | 0 |
| 5 | 107 | 4.673 | 1 | 54 | 1 | 1 |
x = TrainExer21['Female'].to_numpy()
X = sm.add_constant(x)
y = TrainExer21['LogWage'].to_numpy()
model = sm.OLS(y, X)
results = model.fit()
print(results.summary())
                            OLS Regression Results
==============================================================================
Dep. Variable:                      y   R-squared:                       0.073
Model:                            OLS   Adj. R-squared:                  0.071
Method:                 Least Squares   F-statistic:                     39.00
Date:                Tue, 28 Feb 2023   Prob (F-statistic):           9.10e-10
Time:                        01:45:13   Log-Likelihood:                -289.65
No. Observations:                 500   AIC:                             583.3
Df Residuals:                     498   BIC:                             591.7
Df Model:                           1
Covariance Type:            nonrobust
==============================================================================
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          4.7336      0.024    194.453      0.000       4.686       4.781
x1            -0.2506      0.040     -6.245      0.000      -0.329      -0.172
==============================================================================
Omnibus:                        8.330   Durbin-Watson:                   1.384
Prob(Omnibus):                  0.016   Jarque-Bera (JB):                7.009
Skew:                           0.212   Prob(JB):                       0.0301
Kurtosis:                       2.603   Cond. No.                         2.42
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
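The estimates match the lecture result, log(Wage) = 4.73 - 0.25 Female + e. Because the dependent variable is in logs, the dummy coefficient translates into an approximate percentage wage difference via exp(b) - 1, a quick check using the coefficient from the summary above:

```python
import numpy as np

# A Female coefficient of -0.2506 in a log-wage equation implies that
# women earn exp(b) - 1, i.e. roughly 22% less than comparable men.
beta = -0.2506
pct_diff = (np.exp(beta) - 1) * 100
print(round(pct_diff, 1))  # -22.2
```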
plt.scatter(x, y)
plt.plot(x, results.predict(X), color='red')  # fitted line; plot against x, not X (which includes the constant column)
plt.xlabel('Female')
plt.ylabel('LogWage')
plt.title('LogWage vs. Female')
plt.show()
e = y - results.predict(X)  # residuals of regression (a); identical to results.resid
x_i = TrainExer21['Educ'].to_numpy()
X_i = sm.add_constant(x_i)
model_i = sm.OLS(e, X_i)
results_i = model_i.fit()
print(results_i.summary())
                            OLS Regression Results
==============================================================================
Dep. Variable:                      y   R-squared:                       0.284
Model:                            OLS   Adj. R-squared:                  0.282
Method:                 Least Squares   F-statistic:                     197.4
Date:                Tue, 28 Feb 2023   Prob (F-statistic):           5.23e-38
Time:                        01:45:13   Log-Likelihood:                -206.18
No. Observations:                 500   AIC:                             416.4
Df Residuals:                     498   BIC:                             424.8
Df Model:                           1
Covariance Type:            nonrobust
==============================================================================
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const         -0.4526      0.036    -12.524      0.000      -0.524      -0.382
x1             0.2178      0.016     14.050      0.000       0.187       0.248
==============================================================================
Omnibus:                        4.168   Durbin-Watson:                   1.930
Prob(Omnibus):                  0.124   Jarque-Bera (JB):                4.205
Skew:                           0.201   Prob(JB):                        0.122
Kurtosis:                       2.799   Cond. No.                         5.92
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
plt.scatter(x_i, e)
plt.plot(x_i, results_i.predict(X_i), color='red')  # fitted line; plot against x_i, not X_i
plt.xlabel('Educ')
plt.ylabel('Residual')  # the dependent variable here is the residual e, not LogWage
plt.title('Residuals vs. Educ')
plt.show()
x_ii = TrainExer21['Parttime'].to_numpy()
X_ii = sm.add_constant(x_ii)
model_ii = sm.OLS(e, X_ii)
results_ii = model_ii.fit()
print(results_ii.summary())
                            OLS Regression Results
==============================================================================
Dep. Variable:                      y   R-squared:                       0.011
Model:                            OLS   Adj. R-squared:                  0.009
Method:                 Least Squares   F-statistic:                     5.394
Date:                Tue, 28 Feb 2023   Prob (F-statistic):             0.0206
Time:                        01:45:13   Log-Likelihood:                -286.96
No. Observations:                 500   AIC:                             577.9
Df Residuals:                     498   BIC:                             586.3
Df Model:                           1
Covariance Type:            nonrobust
==============================================================================
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const         -0.0284      0.023     -1.246      0.213      -0.073       0.016
x1             0.0987      0.043      2.323      0.021       0.015       0.182
==============================================================================
Omnibus:                        7.495   Durbin-Watson:                   1.376
Prob(Omnibus):                  0.024   Jarque-Bera (JB):                6.645
Skew:                           0.218   Prob(JB):                       0.0361
Kurtosis:                       2.640   Cond. No.                         2.43
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
plt.scatter(x_ii, e)
plt.plot(x_ii, results_ii.predict(X_ii), color='red')  # fitted line; plot against x_ii, not X_ii
plt.xlabel('Parttime')
plt.ylabel('Residual')  # the dependent variable here is the residual e, not LogWage
plt.title('Residuals vs. Parttime')
plt.show()