import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
pd.options.display.max_columns = None

df = pd.read_pickle('colombia_real_estate.pkl').reset_index().drop('index', axis=1)
df.head()


df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 691 entries, 0 to 690
Data columns (total 21 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Tipo                 691 non-null    object 
 1   Propiedad            691 non-null    object 
 2   Habitaciones         691 non-null    int16  
 3   Baños                691 non-null    int8   
 4   Área construída      691 non-null    float32
 5   Área privada         691 non-null    float32
 6   Estrato              691 non-null    int8   
 7   Antigüedad           691 non-null    object 
 8   Administración       691 non-null    int64  
 9   Precio m²            691 non-null    int64  
 10  Dirección            691 non-null    object 
 11  Barrio               691 non-null    object 
 12  Ciudad               691 non-null    object 
 13  Precio (COP)         691 non-null    int64  
 14  Descripción general  691 non-null    object 
 15  No. Fotos            691 non-null    int64  
 16  Fecha Sacada         691 non-null    object 
 17  Enlace               691 non-null    object 
 18  Balcón               691 non-null    int32  
 19  Baño Auxiliar        691 non-null    int32  
 20  Cocina Integral      691 non-null    int32  
dtypes: float32(2), int16(1), int32(3), int64(4), int8(2), object(9)
memory usage: 86.5+ KB


df.isna().sum()

Tipo                   0
Propiedad              0
Habitaciones           0
Baños                  0
Área construída        0
Área privada           0
Estrato                0
Antigüedad             0
Administración         0
Precio m²              0
Dirección              0
Barrio                 0
Ciudad                 0
Precio (COP)           0
Descripción general    0
No. Fotos              0
Fecha Sacada           0
Enlace                 0
Balcón                 0
Baño Auxiliar          0
Cocina Integral        0
dtype: int64


df.shape

(691, 21)


# Drop columns that aren't needed (save Enlace for late)
enlaces = df['Enlace']
df = df.drop(['Barrio', 'Descripción general', 'Enlace', 'Fecha Sacada', 'Dirección'], axis=1)

# Make Estrato a categorical variable
df['Estrato'] = df['Estrato'].apply(lambda x : 'Estrato ' + str(x))


# Change data type of columns
df['Habitaciones'] = df['Habitaciones'].astype('int16')
df['Baños'] = df['Baños'].astype('int8')
df['Área construída'] = df['Área construída'].astype('float32')
df['Área privada'] = df['Área privada'].astype('float32')
df['Administración'] = df['Administración'].astype('int64')
df['Precio m²'] = df['Precio m²'].astype('int64')
df['Precio (COP)'] = df['Precio (COP)'].astype('int64')
df['No. Fotos'] = df['No. Fotos'].astype('int64')


# Get dummy values for the categorical columns
clean_df = df.copy()

for col in df.select_dtypes(include='object').columns:
    clean_df = pd.concat([clean_df, pd.get_dummies(df[col])], axis=1)
    clean_df = clean_df.drop(col, axis=1)


clean_df


from sklearn.model_selection import train_test_split

X = clean_df.drop(['Precio (COP)', 'Precio m²'], axis=1)
y = clean_df['Precio (COP)']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.25, random_state=1)


results = sm.OLS(y_train, sm.add_constant(X_train)).fit()
results.summary()


plt.scatter(clean_df['Área construída'], clean_df['Precio (COP)'])
plt.xlabel('Área construída')
plt.ylabel('Precio')

Text(0, 0.5, 'Precio')


# Remove the outlier
clean_df = clean_df[clean_df['Precio (COP)'] < 2e10]

# Re-create the X and y variables
X = clean_df.drop(['Precio (COP)', 'Precio m²'], axis=1)
y = clean_df['Precio (COP)']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.25, random_state=1)


plt.scatter(clean_df['Área construída'], clean_df['Precio (COP)'])
plt.xlabel('Área construída')
plt.ylabel('Precio')

Text(0, 0.5, 'Precio')


plt.scatter(clean_df['Área privada'], clean_df['Precio (COP)'])
plt.xlabel('Área privada')
plt.ylabel('Precio')

Text(0, 0.5, 'Precio')


plt.scatter(clean_df['No. Fotos'], clean_df['Precio (COP)'])
plt.xlabel('No. Fotos')
plt.ylabel('Precio')

Text(0, 0.5, 'Precio')


plt.scatter(clean_df['Baños'], clean_df['Precio (COP)'])
plt.xlabel('Baños')
plt.ylabel('Precio')

Text(0, 0.5, 'Precio')


plt.scatter(clean_df['Administración'], clean_df['Precio (COP)'])
plt.xlabel('Administración')
plt.ylabel('Precio')

Text(0, 0.5, 'Precio')


results = sm.OLS(y_train, sm.add_constant(X_train)).fit()
results.summary()


fig = sm.qqplot(results.resid, line="s")
plt.show()


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.25, random_state=1)

fig, ax = plt.subplots(figsize=(15,10))
sm.graphics.plot_regress_exog(results,'Área construída', fig=fig)
plt.show()

eval_env: 1


results = sm.OLS(y_train, sm.add_constant(X_train['Área construída'])).fit()
results.summary()


results = sm.OLS(y_train, sm.add_constant(X_train)).fit()

stud_res = results.outlier_test()

sns.scatterplot(y=stud_res['student_resid'], x=X_train['Área construída'], hue=(stud_res['student_resid']>3)|(stud_res['student_resid']<-3))
plt.axhline(y=0, color='black', linestyle='--')
plt.show()

C:\Users\HaydenH\AppData\Local\Programs\Python\Python310\lib\site-packages\numpy\linalg\linalg.py:2146: RuntimeWarning: overflow encountered in det
  r = _umath_linalg.det(a, signature=signature)
C:\Users\HaydenH\AppData\Local\Programs\Python\Python310\lib\site-packages\statsmodels\stats\outliers_influence.py:696: RuntimeWarning: invalid value encountered in sqrt
  return self.resid / sigma / np.sqrt(1 - hii)


results = sm.OLS(y_train, sm.add_constant(X_train[['Área construída', 'Baños', 'Habitaciones']])).fit()
results.summary()


sm.graphics.influence_plot(results)


df.loc[250]

Tipo               Inmobiliaria
Propiedad                  Casa
Habitaciones                 18
Baños                         5
Área construída           428.0
Área privada                0.0
Estrato               Estrato 4
Antigüedad         16 a 30 años
Administración                0
Precio m²               3037383
Ciudad                 Medellín
Precio (COP)         1300000000
No. Fotos                     8
Balcón                        1
Baño Auxiliar                 1
Cocina Integral               0
Name: 250, dtype: object


from statsmodels.stats.outliers_influence import variance_inflation_factor

X = clean_df.select_dtypes(['int64', 'float64', 'int32', 'float32', 'int8', 'int16']).drop(columns=['Precio (COP)'])

vif_df = pd.DataFrame()
vif_df['feature'] = X.columns
vif_df['VIF'] = [variance_inflation_factor(X.values, i) for i in range(len(X.columns))]

vif_df.sort_values(by='VIF', ascending=False)


clean_df['Precio (USD)'] = round(clean_df['Precio (COP)'] / 4500, 2)

C:\Users\HaydenH\AppData\Local\Temp\ipykernel_31004\883967646.py:1: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  clean_df['Precio (USD)'] = round(clean_df['Precio (COP)'] / 4500, 2)


clean_df


X = clean_df.drop(['Precio (COP)', 'Precio m²', 'Precio (USD)'], axis=1)
y = clean_df['Precio (USD)']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.25, random_state=1)

results = sm.OLS(y_train, sm.add_constant(X_train['Área construída'])).fit()
results.summary()


from sklearn.preprocessing import StandardScaler
X_scaler = StandardScaler()
y_scaler = StandardScaler()

X_train_std = pd.DataFrame(X_scaler.fit_transform(X_train), columns=X_train.columns)
y_train_std = y_scaler.fit_transform(pd.DataFrame(y_train))

results = sm.OLS(y_train_std, sm.add_constant(X_train_std)).fit()
results.summary()


results_as_html = results.summary().tables[1].as_html()
results_df = pd.read_html(results_as_html, header=0, index_col=0)[0]

results_df[
    (results_df['P>|t|'] < 0.05) &
    (results_df['[0.025'] * results_df['0.975]'] > 0)
].sort_values(by='coef', ascending=False)


%%time

models_fwd = pd.DataFrame(columns=["RSS", "MSE", "model", "AIC", "BIC", "AdjR2"])

predictors = []

for i in range(len(X_train.columns)):
    remaining_predictors = [p for p in X_train_std.columns if p not in predictors]
    results = []
    
    for p in remaining_predictors:
        pred = predictors + [p]
        model = sm.OLS(y_train_std, sm.add_constant(X_train_std[pred]))
        regr = model.fit()
        
        model_info = {
            'model': regr,
            'RSS': ((regr.predict(sm.add_constant(X_train_std[pred])) - pd.DataFrame(y_train_std)[0]) ** 2).sum(),
            'AIC': regr.aic,
            'BIC': regr.bic,
            'AdjR2': regr.rsquared_adj
        }
        results.append(model_info)
        
    models = pd.DataFrame(results)
    best_model = models.loc[models['RSS'].argmin()]
    
    models_fwd.loc[i, 'RSS'] = best_model['RSS']
    models_fwd.loc[i, 'AIC'] = best_model['AIC']
    models_fwd.loc[i, 'BIC'] = best_model['BIC']
    models_fwd.loc[i, 'model'] = best_model['model']
    models_fwd.loc[i, 'AdjR2'] = best_model['AdjR2']
    
    exog_names = best_model['model'].model.exog_names
    exog_names.remove('const')
    
    variable_added = list(set(exog_names).difference(set(predictors)))[0]
    predictors.append(variable_added)

CPU times: total: 8.94 s
Wall time: 7.43 s


plt.plot(np.arange(len(models_fwd)) +1, models_fwd['BIC'])
plt.title('BIC as Number of Predictor Variables Increase')
plt.ylabel('BIC')
plt.xlabel('Number of Predictors')
plt.show()


models_fwd.sort_values(by='BIC').iloc[0, 2].model.exog_names

['Área construída',
 'Estrato 6',
 'Estrato 5',
 'Estrato 4',
 'Menor a 1 año',
 'Área privada',
 'Medellín',
 'Bogotá',
 'Más de 30 años',
 'Baño Auxiliar',
 'Administración',
 'Baños',
 'Habitaciones']


models_fwd.sort_values(by='BIC').iloc[1, 2].model.exog_names

['Área construída',
 'Estrato 6',
 'Estrato 5',
 'Estrato 4',
 'Menor a 1 año',
 'Área privada',
 'Medellín',
 'Bogotá',
 'Más de 30 años',
 'Baño Auxiliar',
 'Administración']


variables_to_use = models_fwd.sort_values(by='BIC').iloc[1, 2].model.exog_names

results = sm.OLS(y_train_std, sm.add_constant(X_train_std[variables_to_use])).fit()
results.summary()


X_test_std = pd.DataFrame(X_scaler.transform(X_test), columns=X_test.columns)[variables_to_use]
y_test_std = y_scaler.transform(pd.DataFrame(y_test))

predicted = results.predict(sm.add_constant(X_test_std))


from sklearn.metrics import mean_squared_error

mean_squared_error(y_test_std, predicted)

0.5418669876320477


predicted = np.array(predicted.values.tolist()).reshape(-1, 1)
predicted = np.ravel(y_scaler.inverse_transform(predicted))

predictions = pd.DataFrame({'Predicted Price': predicted, 'Actual Price': y_test})
predictions['Error'] = predictions['Predicted Price'] - predictions['Actual Price']


predictions.sort_values(by='Error', ascending=False).head()


pd.options.display.max_colwidth = 100

enlaces[677]

'https://www.fincaraiz.com.co/inmueble/casa-en-venta/calima/cali/7387424'


enlaces[102]

'https://www.fincaraiz.com.co/inmueble/apartamento-en-venta/san-jose/santa-marta/6431027'


predictions.sort_values(by='Error', ascending=True).head()


enlaces[249]

'https://www.fincaraiz.com.co/inmueble/casa-en-venta/medellin/medellin/7213889'


y_train_std = pd.DataFrame(y_train_std)[0]
y_test_std = pd.DataFrame(y_test_std)[0]

X_test_std = pd.DataFrame(X_scaler.transform(X_test), columns=X_test.columns)
X_test_std.insert(0, 'const', 1)


from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV

params = {
    'n_estimators': [100, 200, 500],
    'criterion': ['squared_error', 'absolute_error'],
    'max_features': ['sqrt', 'log2', None]
}

grid = GridSearchCV(RandomForestRegressor(), params, cv=10)
grid.fit(sm.add_constant(X_train_std), y_train_std)

GridSearchCV(cv=10, estimator=RandomForestRegressor(),
             param_grid={'criterion': ['squared_error', 'absolute_error'],
                         'max_features': ['sqrt', 'log2', None],
                         'n_estimators': [100, 200, 500]})

GridSearchCV(cv=10, estimator=RandomForestRegressor(),
             param_grid={'criterion': ['squared_error', 'absolute_error'],
                         'max_features': ['sqrt', 'log2', None],
                         'n_estimators': [100, 200, 500]})

RandomForestRegressor()

RandomForestRegressor()


from sklearn.svm import SVR

params = {
    'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
    'C': [.1, .5, 1, 5, 10]
}

svr_grid = GridSearchCV(SVR(), params, cv=10)
svr_grid.fit(sm.add_constant(X_train_std), y_train_std)

GridSearchCV(cv=10, estimator=SVR(),
             param_grid={'C': [0.1, 0.5, 1, 5, 10],
                         'kernel': ['linear', 'poly', 'rbf', 'sigmoid']})

GridSearchCV(cv=10, estimator=SVR(),
             param_grid={'C': [0.1, 0.5, 1, 5, 10],
                         'kernel': ['linear', 'poly', 'rbf', 'sigmoid']})

SVR()

SVR()


# Linear Regression MSE
mean_squared_error(y_test_std, predicted)

0.5418669876320477


# Random Forest MSE
rf = RandomForestRegressor(**grid.best_params_)
rf.fit(sm.add_constant(X_train_std), y_train_std)
predicted = rf.predict(X_test_std)
mean_squared_error(y_test_std, predicted)

0.6395066158850118


# Support Vector Regressor
svr = SVR(**svr_grid.best_params_)
svr.fit(sm.add_constant(X_train_std), y_train_std)
predicted = svr.predict(X_test_std)
mean_squared_error(y_test_std, predicted)

0.6116958259105769

	Tipo	Propiedad	Habitaciones	Baños	Área construída	Área privada	Estrato	Antigüedad	Administración	Precio m²	Dirección	Barrio	Ciudad	Precio (COP)	Descripción general	No. Fotos	Fecha Sacada	Enlace	Baño Auxiliar	Cocina Integral
0	Usado	Apartamento	2	1	41.000000	0.000000	3	Menor a 1 año	100000	4878048	Cra 67 cll 53 # 09 sur	Bogotá	Bogotá	200000000	Vendo o permuto !!!!! Hermoso apt en conjunto reserva de madelena, barrio madelena acabados de ...	19	2023-01-27 11:44:51.166921	https://www.fincaraiz.com.co/inmueble/apartamento-en-venta/bogota/bogota-dc/10006740	0	0
1	Usado	Apartamento	2	1	32.400002	32.400002	3	9 a 15 años	58000	5092592	Transversal 73 11B 33 Int 7 Apt 101	Bogotá	Bogotá	165000000	Apartamento tipo apartaestudio, con patio, un baño, dos habitaciones, cocina semiintegral.	7	2023-01-27 11:44:52.096644	https://www.fincaraiz.com.co/inmueble/apartamento-en-venta/bogota/bogota-dc/10001169	0	0
2	Usado	Apartamento	3	1	52.000000	0.000000	3	9 a 15 años	110000	3557692	Diagonal 4 b # 39-90	Bogotá	Bogotá	185000000	Apartamento iluminado, 3 habitaciones, 1 baño, cocina integral, ascensor, 5to piso, parqueadero ...	7	2023-01-27 11:44:53.900864	https://www.fincaraiz.com.co/inmueble/apartamento-en-venta/bogota/bogota-dc/10003762	0	0
3	Usado	Apartamento	3	2	70.000000	70.000000	3	16 a 30 años	280000	4142857	Kra. 103 d # 83-82	Bogotá	Bogotá	290000000	apartamento de tres alcobas sala comedor cocina zona de ropas alcoba principal con baño y otro b...	11	2023-01-27 11:44:54.797900	https://www.fincaraiz.com.co/inmueble/apartamento-en-venta/bogota/bogota-dc/10003565	1	1
4	Usado	Apartamento	2	1	43.000000	0.000000	2	1 a 8 años	66000	2790697	CLL 104 sur 14-80	Bogotá	Bogotá	120000000	Apartamento con excelente vista a parque natural, ubicado en el piso séptimo, esquinero y consta...	9	2023-01-27 11:44:55.683734	https://www.fincaraiz.com.co/inmueble/apartamento-en-venta/bogota/bogota-dc/10005549	0	0

	Habitaciones	Baños	Área construída	Área privada	Administración	Precio m²	Precio (COP)	No. Fotos	Balcón	Baño Auxiliar	Cocina Integral	Duplex	Inmobiliaria	Loft	Negociable	Oportunidad	PentHouse	Usado	Apartamento	Casa	Estrato 0	Estrato 1	Estrato 2	Estrato 3	Estrato 4	Estrato 5	Estrato 6	1 a 8 años	16 a 30 años	9 a 15 años	Menor a 1 año	Mnan	Más de 30 años	Armenia	Bogotá	Cali	Fusagasugá	Medellín	Santa Marta	Santa marta	Villavicencio
0	2	1	41.000000	0.000000	100000	4878048	200000000	19	0	0	0	0	0	0	0	0	0	1	1	0	0	0	0	1	0	0	0	0	0	0	1	0	0	0	1	0	0	0	0	0	0
1	2	1	32.400002	32.400002	58000	5092592	165000000	7	0	0	0	0	0	0	0	0	0	1	1	0	0	0	0	1	0	0	0	0	0	1	0	0	0	0	1	0	0	0	0	0	0
2	3	1	52.000000	0.000000	110000	3557692	185000000	7	0	0	0	0	0	0	0	0	0	1	1	0	0	0	0	1	0	0	0	0	0	1	0	0	0	0	1	0	0	0	0	0	0
3	3	2	70.000000	70.000000	280000	4142857	290000000	11	0	1	1	0	0	0	0	0	0	1	1	0	0	0	0	1	0	0	0	0	1	0	0	0	0	0	1	0	0	0	0	0	0
4	2	1	43.000000	0.000000	66000	2790697	120000000	9	0	0	0	0	0	0	0	0	0	1	1	0	0	0	1	0	0	0	0	1	0	0	0	0	0	0	1	0	0	0	0	0	0
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
686	3	3	89.000000	0.000000	0	4213483	375000000	24	1	1	1	0	1	0	0	0	0	0	0	1	0	0	0	0	1	0	0	0	0	1	0	0	0	0	0	1	0	0	0	0	0
687	3	1	9.260000	9.260000	0	19438	180000000	8	0	0	0	0	0	0	0	0	0	1	0	1	0	0	0	1	0	0	0	0	1	0	0	0	0	0	0	1	0	0	0	0	0
688	3	1	92.000000	92.000000	0	1847826	170000000	8	1	0	1	0	1	0	0	0	0	0	0	1	0	0	1	0	0	0	0	1	0	0	0	0	0	0	0	1	0	0	0	0	0
689	4	3	94.000000	94.000000	385000	3925531	369000000	30	0	1	1	0	1	0	0	0	0	0	1	0	0	0	0	0	0	1	0	0	1	0	0	0	0	0	0	1	0	0	0	0	0
690	3	2	54.000000	54.000000	115000	2870370	155000000	11	0	1	0	0	1	0	0	0	0	0	1	0	0	0	0	1	0	0	0	0	1	0	0	0	0	0	0	1	0	0	0	0	0

Dep. Variable:	Precio (COP)	R-squared:	0.043
Model:	OLS	Adj. R-squared:	-0.023
Method:	Least Squares	F-statistic:	0.6528
Date:	Tue, 25 Apr 2023	Prob (F-statistic):	0.933
Time:	02:46:13	Log-Likelihood:	-13117.
No. Observations:	518	AIC:	2.630e+04
Df Residuals:	484	BIC:	2.645e+04
Df Model:	33
Covariance Type:	nonrobust

	coef	std err	t	P>\|t\|	[0.025	0.975]
const	-4.185e+09	3.46e+09	-1.210	0.227	-1.1e+10	2.61e+09
Habitaciones	-4.779e+08	1.04e+09	-0.461	0.645	-2.51e+09	1.56e+09
Baños	3.017e+09	1.5e+09	2.012	0.045	7.1e+07	5.96e+09
Área construída	-4.61e+06	1.59e+07	-0.291	0.771	-3.58e+07	2.65e+07
Área privada	-4.311e+06	1.14e+07	-0.377	0.707	-2.68e+07	1.82e+07
Administración	-6413.7365	4458.025	-1.439	0.151	-1.52e+04	2345.737
No. Fotos	6.829e+07	1.86e+08	0.366	0.714	-2.98e+08	4.35e+08
Balcón	4.364e+09	2.61e+09	1.670	0.096	-7.71e+08	9.5e+09
Baño Auxiliar	-3.421e+09	2.59e+09	-1.322	0.187	-8.51e+09	1.67e+09
Cocina Integral	3.212e+09	2.43e+09	1.324	0.186	-1.55e+09	7.98e+09
Duplex	-6.247e+09	8.32e+09	-0.751	0.453	-2.26e+10	1.01e+10
Inmobiliaria	1.096e+09	5.29e+09	0.207	0.836	-9.3e+09	1.15e+10
Loft	-9.944e+08	8.74e+09	-0.114	0.909	-1.82e+10	1.62e+10
Negociable	-3.508e-06	1.31e-05	-0.267	0.789	-2.93e-05	2.23e-05
Oportunidad	3.865e+09	1.31e+10	0.294	0.769	-2.2e+10	2.97e+10
PentHouse	-5.259e+09	2.17e+10	-0.243	0.808	-4.78e+10	3.73e+10
Usado	3.354e+09	5.46e+09	0.614	0.539	-7.38e+09	1.41e+10
Apartamento	-6.941e+08	2.07e+09	-0.336	0.737	-4.75e+09	3.36e+09
Casa	-3.491e+09	2.47e+09	-1.412	0.158	-8.35e+09	1.37e+09
Estrato 0	-4.371e+07	9.29e+09	-0.005	0.996	-1.83e+10	1.82e+10
Estrato 1	2.233e+09	6.03e+09	0.371	0.711	-9.61e+09	1.41e+10
Estrato 2	-1.638e+09	3.77e+09	-0.434	0.665	-9.05e+09	5.78e+09
Estrato 3	-1.203e+09	2.57e+09	-0.468	0.640	-6.26e+09	3.85e+09
Estrato 4	5.561e+08	2.88e+09	0.193	0.847	-5.1e+09	6.21e+09
Estrato 5	-2.052e+09	3.38e+09	-0.608	0.544	-8.68e+09	4.58e+09
Estrato 6	-2.039e+09	4.57e+09	-0.446	0.656	-1.1e+10	6.94e+09
1 a 8 años	-1.981e+09	2.42e+09	-0.818	0.414	-6.74e+09	2.78e+09
16 a 30 años	-6.876e+08	2.5e+09	-0.275	0.783	-5.6e+09	4.22e+09
9 a 15 años	3.352e+09	2.63e+09	1.273	0.204	-1.82e+09	8.53e+09
Menor a 1 año	-2.466e+09	5.51e+09	-0.447	0.655	-1.33e+10	8.36e+09
Mnan	-8.519e+08	2.7e+09	-0.316	0.752	-6.15e+09	4.45e+09
Más de 30 años	-1.55e+09	4.03e+09	-0.384	0.701	-9.48e+09	6.38e+09
Armenia	6.112e+08	3.89e+09	0.157	0.875	-7.02e+09	8.25e+09
Bogotá	4.325e+09	3.99e+09	1.083	0.279	-3.52e+09	1.22e+10
Cali	8.834e+08	3.51e+09	0.252	0.801	-6.01e+09	7.78e+09
Fusagasugá	-1.626e+09	6.71e+09	-0.242	0.809	-1.48e+10	1.16e+10
Medellín	7.63e+08	3.58e+09	0.213	0.831	-6.27e+09	7.79e+09
Santa Marta	-8.77e+08	3.41e+09	-0.257	0.797	-7.58e+09	5.82e+09
Santa marta	-5.372e+09	1.33e+10	-0.404	0.686	-3.15e+10	2.07e+10
Villavicencio	-2.893e+09	3.46e+09	-0.837	0.403	-9.69e+09	3.9e+09

Omnibus:	1247.824	Durbin-Watson:	1.980
Prob(Omnibus):	0.000	Jarque-Bera (JB):	4804563.386
Skew:	21.248	Prob(JB):	0.00
Kurtosis:	472.893	Cond. No.	3.36e+16

Predicting House Prices - Real Estate in Colombia¶

The Data¶

Data Cleaning¶

Regression Analysis¶

Get train and validation set¶

Initial Regression¶

Assumptions¶

Linear relationship¶

Heteroskedastic Data¶

Outliers¶

Leverage Points¶

Collinearity¶

Preprocessing¶

What factors are important in determining the value of a house in Colombia?¶

Forward subset selection¶

How well does the model work?¶

An example¶

Non-Parametric Models¶

Random Forest¶

Support Vector Regressor¶

Comparison¶

Conclusion¶

Omnibus:	387.531	Durbin-Watson:	2.008
Prob(Omnibus):	0.000	Jarque-Bera (JB):	11242.681
Skew:	2.916	Prob(JB):	0.00
Kurtosis:	25.088	Cond. No.	4.01e+16

Omnibus:	394.014	Durbin-Watson:	1.975
Prob(Omnibus):	0.000	Jarque-Bera (JB):	10975.611
Skew:	3.010	Prob(JB):	0.00
Kurtosis:	24.755	Cond. No.	289.

Omnibus:	411.682	Durbin-Watson:	1.940
Prob(Omnibus):	0.000	Jarque-Bera (JB):	12413.628
Skew:	3.190	Prob(JB):	0.00
Kurtosis:	26.142	Cond. No.	517.

	feature	VIF
1	Baños	13.908978
0	Habitaciones	7.588467
6	No. Fotos	6.541586
2	Área construída	5.739424
5	Precio m²	4.110849
3	Área privada	2.153872
8	Baño Auxiliar	2.090382
7	Balcón	2.027569
9	Cocina Integral	1.855900
4	Administración	1.729555

Dep. Variable:	Precio (USD)	R-squared:	0.512
Model:	OLS	Adj. R-squared:	0.511
Method:	Least Squares	F-statistic:	541.2
Date:	Tue, 25 Apr 2023	Prob (F-statistic):	2.34e-82
Time:	02:46:19	Log-Likelihood:	-6588.4
No. Observations:	517	AIC:	1.318e+04
Df Residuals:	515	BIC:	1.319e+04
Df Model:	1
Covariance Type:	nonrobust

	coef	std err	t	P>\|t\|	[0.025	0.975]
Área construída	0.5497	0.041	13.564	0.000	0.470	0.629
Estrato 6	0.1996	0.028	7.031	0.000	0.144	0.255
Baños	0.1529	0.045	3.377	0.001	0.064	0.242
Área privada	0.1224	0.030	4.130	0.000	0.064	0.181
Menor a 1 año	0.0866	0.025	3.527	0.000	0.038	0.135
Medellín	0.0863	0.025	3.497	0.001	0.038	0.135
Villavicencio	-0.0531	0.024	-2.239	0.026	-0.100	-0.006
Baño Auxiliar	-0.0750	0.029	-2.594	0.010	-0.132	-0.018
Más de 30 años	-0.0838	0.025	-3.403	0.001	-0.132	-0.035
Estrato 3	-0.0851	0.021	-4.150	0.000	-0.125	-0.045
Habitaciones	-0.0963	0.035	-2.764	0.006	-0.165	-0.028
Estrato 2	-0.1050	0.025	-4.179	0.000	-0.154	-0.056

Dep. Variable:	y	R-squared:	0.685
Model:	OLS	Adj. R-squared:	0.678
Method:	Least Squares	F-statistic:	99.66
Date:	Tue, 25 Apr 2023	Prob (F-statistic):	6.82e-119
Time:	02:46:27	Log-Likelihood:	-435.29
No. Observations:	517	AIC:	894.6
Df Residuals:	505	BIC:	945.6
Df Model:	11
Covariance Type:	nonrobust

Omnibus:	388.030	Durbin-Watson:	2.053
Prob(Omnibus):	0.000	Jarque-Bera (JB):	11156.288
Skew:	2.926	Prob(JB):	0.00
Kurtosis:	24.992	Cond. No.	2.56

	Predicted Price	Actual Price	Error
677	706642.981395	377777.78	328865.201395
671	335328.019435	200000.00	135328.019435
102	199433.732672	95555.56	103878.172672
603	163728.140858	60000.00	103728.140858
258	216364.436065	120000.00	96364.436065

	Predicted Price	Actual Price	Error
249	480144.617638	1222222.22	-742077.602362
305	596894.006345	1111111.11	-514217.103655
83	366645.633554	666666.67	-300021.036446
572	5985.152453	255555.56	-249570.407547
315	402700.062228	511111.11	-108411.047772

	coef	std err	t	P>\|t\|	[0.025	0.975]
const	8.32e+07	4.22e+07	1.973	0.049	3.47e+05	1.66e+08
Habitaciones	-3.435e+07	1.24e+07	-2.764	0.006	-5.88e+07	-9.93e+06
Baños	5.943e+07	1.76e+07	3.377	0.001	2.48e+07	9.4e+07
Área construída	2.447e+06	1.8e+05	13.564	0.000	2.09e+06	2.8e+06
Área privada	5.425e+05	1.31e+05	4.130	0.000	2.84e+05	8.01e+05
Administración	106.0197	56.731	1.869	0.062	-5.450	217.490
No. Fotos	-2.809e+06	2.29e+06	-1.224	0.221	-7.32e+06	1.7e+06
Balcón	-1.597e+07	3.14e+07	-0.509	0.611	-7.76e+07	4.57e+07
Baño Auxiliar	-8.054e+07	3.1e+07	-2.594	0.010	-1.42e+08	-1.95e+07
Cocina Integral	-1.3e+07	2.93e+07	-0.444	0.657	-7.05e+07	4.45e+07
Duplex	-5.879e+06	1.06e+08	-0.055	0.956	-2.15e+08	2.03e+08
Inmobiliaria	-4.82e+07	6.15e+07	-0.784	0.434	-1.69e+08	7.26e+07
Loft	-1.052e+07	1.01e+08	-0.104	0.917	-2.08e+08	1.87e+08
Negociable	2.458e+07	2.77e+08	0.089	0.929	-5.2e+08	5.69e+08
Oportunidad	-2.278e+07	1.63e+08	-0.140	0.889	-3.43e+08	2.98e+08
PentHouse	2.151e+08	1.65e+08	1.303	0.193	-1.09e+08	5.39e+08
Usado	-6.906e+07	6.81e+07	-1.014	0.311	-2.03e+08	6.47e+07
Apartamento	4.095e+07	2.39e+07	1.714	0.087	-5.99e+06	8.79e+07
Casa	4.225e+07	3.16e+07	1.337	0.182	-1.98e+07	1.04e+08
Estrato 0	-2.206e+07	1.36e+08	-0.162	0.871	-2.9e+08	2.46e+08
Estrato 1	-9.109e+07	7.63e+07	-1.194	0.233	-2.41e+08	5.88e+07
Estrato 2	-1.683e+08	4.81e+07	-3.498	0.001	-2.63e+08	-7.37e+07
Estrato 3	-8.054e+07	3.43e+07	-2.349	0.019	-1.48e+08	-1.32e+07
Estrato 4	1.287e+07	3.69e+07	0.349	0.727	-5.96e+07	8.54e+07
Estrato 5	7.565e+07	4.09e+07	1.847	0.065	-4.81e+06	1.56e+08
Estrato 6	3.567e+08	5.55e+07	6.425	0.000	2.48e+08	4.66e+08
1 a 8 años	-3.568e+06	2.96e+07	-0.121	0.904	-6.16e+07	5.45e+07
16 a 30 años	-1.441e+07	3.09e+07	-0.466	0.641	-7.52e+07	4.63e+07
9 a 15 años	1.23e+07	3.09e+07	0.399	0.690	-4.83e+07	7.29e+07
Menor a 1 año	2.755e+08	7.11e+07	3.873	0.000	1.36e+08	4.15e+08
Mnan	-7.795e+06	3.5e+07	-0.223	0.824	-7.65e+07	6.09e+07
Más de 30 años	-1.788e+08	4.77e+07	-3.745	0.000	-2.73e+08	-8.5e+07
Armenia	5.41e+07	4.67e+07	1.158	0.247	-3.77e+07	1.46e+08
Bogotá	1.19e+08	4.89e+07	2.431	0.015	2.28e+07	2.15e+08
Cali	-9.185e+06	4.27e+07	-0.215	0.830	-9.31e+07	7.47e+07
Fusagasugá	-4.401e+07	8.16e+07	-0.540	0.590	-2.04e+08	1.16e+08
Medellín	1.685e+08	4.31e+07	3.911	0.000	8.39e+07	2.53e+08
Santa Marta	-9.156e+06	4.21e+07	-0.217	0.828	-9.19e+07	7.36e+07
Santa marta	-1.641e+08	1.62e+08	-1.016	0.310	-4.82e+08	1.53e+08

	coef	std err	t	P>\|t\|	[0.025	0.975]
const	3.989e+07	2.55e+07	1.565	0.118	-1.02e+07	9e+07
Área construída	3.186e+06	1.37e+05	23.263	0.000	2.92e+06	3.46e+06

	coef	std err	t	P>\|t\|	[0.025	0.975]
const	7.065e+07	4.21e+07	1.678	0.094	-1.21e+07	1.53e+08
Área construída	2.811e+06	1.88e+05	14.984	0.000	2.44e+06	3.18e+06
Baños	1.089e+08	1.7e+07	6.408	0.000	7.55e+07	1.42e+08
Habitaciones	-8.061e+07	1.27e+07	-6.326	0.000	-1.06e+08	-5.56e+07

	coef	std err	t	P>\|t\|	[0.025	0.975]
const	8865.4148	5663.087	1.565	0.118	-2260.178	2e+04
Área construída	708.0216	30.435	23.263	0.000	648.229	767.814