I had a go at Kaggle's house-price competition [house-prices-advanced-regression-techniques], and as a first pass just fed every feature straight into training. (~ Definitely not because I can't do feature engineering ~)
The results were so-so:
import tensorflow as tf
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
def mean_norm(df_input):  #@save
    """Column-wise z-score normalization: (x - mean) / std."""
    return df_input.apply(lambda x: (x - x.mean()) / x.std(), axis=0)

def de_mean_norm(result, df_input: pd.DataFrame):  #@save
    """Invert mean_norm, using the mean/std of df_input."""
    return result * df_input.std() + df_input.mean()
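As a quick sanity check, these two helpers are a standard z-score transform and its inverse, so a round trip should reproduce the input (toy data, just for illustration):

toy = pd.DataFrame({"a": [1.0, 2.0, 3.0], "b": [10.0, 20.0, 30.0]})
toy_norm = mean_norm(toy)               # each column now has mean 0, std 1
toy_back = de_mean_norm(toy_norm, toy)  # undo the transform
print(np.allclose(toy_back, toy))       # True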
# Load the raw competition data.
train_data = pd.read_csv("../data/house-prices-advanced-regression-techniques/train.csv")
test_data = pd.read_csv("../data/house-prices-advanced-regression-techniques/test.csv")
# Set the Ids and the target aside, then drop them from the features.
id_data = train_data["Id"]
test_id_data = test_data["Id"]
y_data = train_data["SalePrice"]
train_data.drop("Id", axis=1, inplace=True)
test_data.drop("Id", axis=1, inplace=True)
train_data.drop("SalePrice", axis=1, inplace=True)
# One-hot encode train and test together so both get the same columns.
all_data = pd.concat([train_data, test_data])
all_data.fillna(value=0, inplace=True)
all_data = pd.get_dummies(all_data, dummy_na=True)
all_data.fillna(value=0, inplace=True)
# all_data.describe
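Encoding train and test together matters: a category that shows up in only one split would otherwise create a dummy column the other split lacks. A tiny illustration (toy frames, not the competition data):

a = pd.DataFrame({"c": ["x", "y"]})
b = pd.DataFrame({"c": ["x", "z"]})
print(pd.get_dummies(a).columns.tolist())  # ['c_x', 'c_y']
print(pd.get_dummies(b).columns.tolist())  # ['c_x', 'c_z'] -- mismatched columns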
# Split the encoded matrix back into train and test parts.
train_data_dummy_x = all_data[:len(train_data)]
test_data_dummy_x = all_data[len(train_data):]
# Normalize the training features and the target.
train_data_dummy_x_mean = mean_norm(train_data_dummy_x)
train_data_dummy_x_mean.fillna(value=0, inplace=True)  # zero-variance columns divide by std=0 and give NaN
y_data_mean = mean_norm(pd.DataFrame(y_data))
train_data_dummy_x_mean.describe(), y_data_mean.describe()
(output truncated: the normalized feature matrix, 1460 rows x 354 columns, and the normalized SalePrice column, 1460 rows x 1)
train_data_dummy_x_mean.shape, y_data_mean[:100]
((1460, 354),
SalePrice
0 0.347154
1 0.007286
2 0.535970
3 -0.515105
4 0.869545
.. ...
95 0.051343
96 0.416387
97 -1.084699
98 -1.232605
99 -0.654199
[100 rows x 1 columns])
A bit of dropout to keep overfitting in check:
net = tf.keras.Sequential([
    tf.keras.layers.Dense(units=708, input_dim=354, activation=tf.keras.activations.tanh),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Dense(units=177, activation=tf.keras.activations.tanh),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Dense(10, activation=tf.keras.activations.sigmoid),
    tf.keras.layers.Dense(1)  # linear output for regression
])
opt = tf.keras.optimizers.SGD(learning_rate=0.001)
# 'accuracy' is not meaningful for regression -- see the note after the summary below.
net.compile(optimizer=opt, loss=tf.keras.losses.MeanSquaredError(), metrics=['accuracy'])
train_data_dummy_X = tf.constant(train_data_dummy_x_mean)
train_data_y = tf.constant(y_data_mean)
train_data_dummy_X.shape, train_data_y.shape
(TensorShape([1460, 354]), TensorShape([1460, 1]))
history = net.fit(train_data_dummy_X, train_data_y, batch_size=50, epochs=200, validation_split=0.2, callbacks=[], shuffle=True)
net.summary()
Epoch 1/200
24/24 [==============================] - 0s 10ms/step - loss: 2.6157 - accuracy: 0.0000e+00 - val_loss: 2.2767 - val_accuracy: 0.0000e+00
Epoch 2/200
24/24 [==============================] - 0s 7ms/step - loss: 1.9880 - accuracy: 0.0000e+00 - val_loss: 1.7652 - val_accuracy: 0.0000e+00
...
Epoch 200/200
24/24 [==============================] - 0s 5ms/step - loss: 0.1689 - accuracy: 0.0000e+00 - val_loss: 0.2754 - val_accuracy: 0.0000e+00
Model: "sequential_6"
_________________________________________________________________
Layer (type) Output Shape Param #
=================================================================
dense_24 (Dense) (None, 708) 251340
dropout_8 (Dropout) (None, 708) 0
dense_25 (Dense) (None, 177) 125493
dropout_9 (Dropout) (None, 177) 0
dense_26 (Dense) (None, 10) 1780
dense_27 (Dense) (None, 1) 11
=================================================================
Total params: 378,624
Trainable params: 378,624
Non-trainable params: 0
_________________________________________________________________
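Note the accuracy stuck at 0.0000e+00 in the log: accuracy is a classification metric and says nothing about a regression fit. A more informative choice would be mean absolute error, e.g. this hypothetical variant (not what was run here):

# Track MAE instead of accuracy for this regression task.
net.compile(optimizer=opt,
            loss=tf.keras.losses.MeanSquaredError(),
            metrics=[tf.keras.metrics.MeanAbsoluteError()])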
print(history.history.keys())
plt.plot(history.history["loss"], label="loss")
plt.plot(history.history["accuracy"], label="accuracy")
plt.plot(history.history["val_loss"], label="val_loss")
plt.plot(history.history["val_accuracy"], label="val_accuracy")
plt.legend()
plt.show()
dict_keys(['loss', 'accuracy', 'val_loss', 'val_accuracy'])
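The validation loss flattens out well before epoch 200, and fit already takes a callbacks list, so an EarlyStopping callback would be a natural addition. A sketch (not part of the original run):

# Stop once val_loss hasn't improved for 10 epochs, keeping the best weights.
early_stop = tf.keras.callbacks.EarlyStopping(
    monitor="val_loss", patience=10, restore_best_weights=True)
history = net.fit(train_data_dummy_X, train_data_y, batch_size=50, epochs=200,
                  validation_split=0.2, callbacks=[early_stop], shuffle=True)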
# Normalize the test features (note: with the test set's own statistics).
test_data_dummy_x_mean = mean_norm(test_data_dummy_x)
test_data_dummy_x_mean.fillna(0, inplace=True)
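One caveat: the test features are scaled with the test set's own mean/std, while the network was trained on features scaled with training statistics. Reusing the training statistics would keep both on the same scale; a sketch of that variant (test_scaled is a hypothetical name):

# Scale test features with the *training* set's statistics instead.
test_scaled = (test_data_dummy_x - train_data_dummy_x.mean()) / train_data_dummy_x.std()
test_scaled = test_scaled.fillna(0)  # zero-variance columns again give NaN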
py = net.predict(test_data_dummy_x_mean)
pp = de_mean_norm(py, y_data)  # map normalized predictions back to the price scale
pp[:100]
array([[120861.6  ],
       [179080.06 ],
       [197782.52 ],
       [ 82506.18 ],
       [129601.914],
       ...], dtype=float32)
with open("../data/house-prices-advanced-regression-techniques/test_submission.csv", "w") as f:
    f.write("Id,SalePrice\n")
    # Test Ids start at 1461 (train covers Ids 1-1460).
    for i, p in enumerate(pp):
        f.write("%d,%f\n" % (i + 1461, p[0]))
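The same file can be written more compactly with pandas, reusing the test_id_data saved at the start:

# Equivalent submission via pandas, using the saved test Ids.
submission = pd.DataFrame({"Id": test_id_data, "SalePrice": pp.flatten()})
submission.to_csv("../data/house-prices-advanced-regression-techniques/test_submission.csv", index=False)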