# https://www.kaggle.com/datasets/mirichoi0218/insurance
import numpy as np
import tensorflow as tf
import pandas as pd
def mean_norm(df_input): #@save
    """Standardise every column of ``df_input`` to zero mean and unit std.

    Each column has its own mean subtracted and is then divided by its own
    sample standard deviation (pandas' default ``ddof=1``).

    Args:
        df_input: A numeric ``pandas.DataFrame``.

    Returns:
        A new DataFrame with the same index and columns; ``df_input`` is not
        modified.
    """
    std = df_input.std()
    # A constant column has std == 0, and (x - mean) / 0 would turn the whole
    # column into NaN. Dividing by 1 instead leaves such a column at 0.
    safe_std = std.where(std != 0, 1.0)
    return (df_input - df_input.mean()) / safe_std
def de_mean_norm(result, df_input:pd.DataFrame): #@save
    """Undo mean/std normalisation: ``result * std + mean``.

    Args:
        result: Normalised value(s) to map back (scalar or array-like).
        df_input: The original, un-normalised data whose ``std()`` and
            ``mean()`` define the scale. Any pandas object exposing those two
            methods works, including a single-column ``Series``.

    Returns:
        ``result`` rescaled by the standard deviation and shifted by the mean
        of ``df_input``.
    """
    return result * df_input.std() + df_input.mean()
# Load the insurance dataset from the CSV file and preview its first 10 rows.
insurenceData = pd.read_csv(filepath_or_buffer="../data/insurance.csv")
insurenceData.head(n=10)
|   | age | sex | bmi | children | smoker | region | charges |
|---|---|---|---|---|---|---|---|
| 0 | 19 | female | 27.900 | 0 | yes | southwest | 16884.92400 |
| 1 | 18 | male | 33.770 | 1 | no | southeast | 1725.55230 |
| 2 | 28 | male | 33.000 | 3 | no | southeast | 4449.46200 |
| 3 | 33 | male | 22.705 | 0 | no | northwest | 21984.47061 |
| 4 | 32 | male | 28.880 | 0 | no | northwest | 3866.85520 |
| 5 | 31 | female | 25.740 | 0 | no | southeast | 3756.62160 |
| 6 | 46 | female | 33.440 | 1 | no | southeast | 8240.58960 |
| 7 | 37 | female | 27.740 | 3 | no | northwest | 7281.50560 |
| 8 | 37 | male | 29.830 | 2 | no | northeast | 6406.41070 |
| 9 | 60 | female | 25.840 | 0 | no | northwest | 28923.13692 |
# Encode the categorical 'sex' column as integers: female -> 0, male -> 1.
sex_mapping = dict(female=0, male=1)
insurenceData.sex = insurenceData.sex.map(sex_mapping)
insurenceData.head(n=10)
|   | age | sex | bmi | children | smoker | region | charges |
|---|---|---|---|---|---|---|---|
| 0 | 19 | 0 | 27.900 | 0 | yes | southwest | 16884.92400 |
| 1 | 18 | 1 | 33.770 | 1 | no | southeast | 1725.55230 |
| 2 | 28 | 1 | 33.000 | 3 | no | southeast | 4449.46200 |
| 3 | 33 | 1 | 22.705 | 0 | no | northwest | 21984.47061 |
| 4 | 32 | 1 | 28.880 | 0 | no | northwest | 3866.85520 |
| 5 | 31 | 0 | 25.740 | 0 | no | southeast | 3756.62160 |
| 6 | 46 | 0 | 33.440 | 1 | no | southeast | 8240.58960 |
| 7 | 37 | 0 | 27.740 | 3 | no | northwest | 7281.50560 |
| 8 | 37 | 1 | 29.830 | 2 | no | northeast | 6406.41070 |
| 9 | 60 | 0 | 25.840 | 0 | no | northwest | 28923.13692 |
# The distinct region labels can be listed with
# insurenceData['region'].value_counts().
# Encode the categorical 'region' column as integer codes 0-3.
region_mapping = dict(southeast=0, southwest=1, northwest=2, northeast=3)
insurenceData['region'] = insurenceData['region'].map(region_mapping)
insurenceData.head(n=10)
|   | age | sex | bmi | children | smoker | region | charges |
|---|---|---|---|---|---|---|---|
| 0 | 19 | 0 | 27.900 | 0 | yes | 1 | 16884.92400 |
| 1 | 18 | 1 | 33.770 | 1 | no | 0 | 1725.55230 |
| 2 | 28 | 1 | 33.000 | 3 | no | 0 | 4449.46200 |
| 3 | 33 | 1 | 22.705 | 0 | no | 2 | 21984.47061 |
| 4 | 32 | 1 | 28.880 | 0 | no | 2 | 3866.85520 |
| 5 | 31 | 0 | 25.740 | 0 | no | 0 | 3756.62160 |
| 6 | 46 | 0 | 33.440 | 1 | no | 0 | 8240.58960 |
| 7 | 37 | 0 | 27.740 | 3 | no | 2 | 7281.50560 |
| 8 | 37 | 1 | 29.830 | 2 | no | 3 | 6406.41070 |
| 9 | 60 | 0 | 25.840 | 0 | no | 2 | 28923.13692 |
# The distinct smoker labels can be listed with
# insurenceData['smoker'].value_counts().
# Encode the categorical 'smoker' column as integers: yes -> 1, no -> 0.
smoker_mapping = dict(yes=1, no=0)
insurenceData['smoker'] = insurenceData['smoker'].map(smoker_mapping)
insurenceData.head(n=100)
|   | age | sex | bmi | children | smoker | region | charges |
|---|---|---|---|---|---|---|---|
| 0 | 19 | 0 | 27.900 | 0 | 1 | 1 | 16884.92400 |
| 1 | 18 | 1 | 33.770 | 1 | 0 | 0 | 1725.55230 |
| 2 | 28 | 1 | 33.000 | 3 | 0 | 0 | 4449.46200 |
| 3 | 33 | 1 | 22.705 | 0 | 0 | 2 | 21984.47061 |
| 4 | 32 | 1 | 28.880 | 0 | 0 | 2 | 3866.85520 |
| ... | ... | ... | ... | ... | ... | ... | ... |
| 95 | 28 | 0 | 37.620 | 1 | 0 | 0 | 3766.88380 |
| 96 | 54 | 0 | 30.800 | 3 | 0 | 1 | 12105.32000 |
| 97 | 55 | 1 | 38.280 | 0 | 0 | 0 | 10226.28420 |
| 98 | 56 | 1 | 19.950 | 0 | 1 | 3 | 22412.64850 |
| 99 | 38 | 1 | 19.300 | 0 | 1 | 1 | 15820.69900 |
100 rows × 7 columns
# Standardise every column of the frame, the 'charges' target included.
rData = mean_norm(insurenceData)

# Target: the standardised 'charges' column.
Y = rData.charges
print('1charges: ', Y)

# Features: all remaining columns, converted to a tensor.
x = rData.drop('charges', axis=1)
X = tf.convert_to_tensor(x)

print('charges: ', Y)
y = tf.convert_to_tensor(Y)
print(X,y)
1charges: 0 0.298472
1 -0.953333
2 -0.728402
3 0.719574
4 -0.776512
...
1333 -0.220468
1334 -0.913661
1335 -0.961237
1336 -0.930014
1337 1.310563
Name: charges, Length: 1338, dtype: float64
charges: 0 0.298472
1 -0.953333
2 -0.728402
3 0.719574
4 -0.776512
...
1333 -0.220468
1334 -0.913661
1335 -0.961237
1336 -0.930014
1337 1.310563
Name: charges, Length: 1338, dtype: float64
tf.Tensor(
[[-1.4382265 -1.010141 -0.45315057 -0.90827406 1.9698501 -0.40272369]
[-1.50940108 0.98922092 0.50943062 -0.07873775 -0.50727343 -1.2875255 ]
[-0.7976553 0.98922092 0.38316358 1.58033487 -0.50727343 -1.2875255 ]
...
[-1.50940108 -1.010141 1.01449877 -0.90827406 -0.50727343 -1.2875255 ]
[-1.29587735 -1.010141 -0.79751522 -0.90827406 -0.50727343 -0.40272369]
[ 1.55110577 -1.010141 -0.26129026 -0.90827406 1.9698501 0.48207812]], shape=(1338, 6), dtype=float64) tf.Tensor(
[ 0.2984722 -0.95333272 -0.72840232 ... -0.96123683 -0.93001377
1.31056344], shape=(1338,), dtype=float64)
# Two stacked single-unit Dense layers with no activation argument, taking
# the 6 feature columns as input and producing one output value.
net = tf.keras.Sequential()
net.add(tf.keras.layers.Dense(units=1, input_dim=6))
initializer = tf.initializers.RandomNormal(stddev=0.1)
net.add(tf.keras.layers.Dense(1, kernel_initializer=initializer))
opt = tf.keras.optimizers.Adam(learning_rate=0.001)
# The target is a continuous value, so report mean absolute error alongside
# the MSE loss. A classification metric such as SparseCategoricalAccuracy is
# meaningless for regression (it stays at 0 for the whole run).
net.compile(
    optimizer=opt,
    loss=tf.keras.losses.MeanSquaredError(),
    metrics=[tf.keras.metrics.MeanAbsoluteError()],
)
# validation_split=0.2 holds out 20% of the samples for validation.
history = net.fit(X, y, batch_size=1000, epochs=500, validation_split=0.2, callbacks=[], shuffle=True)
net.summary()
Epoch 1/500
2/2 [==============================] - 0s 78ms/step - loss: 0.2575 - sparse_categorical_accuracy: 0.0000e+00 - val_loss: 0.2666 - val_sparse_categorical_accuracy: 0.0000e+00
Epoch 2/500
2/2 [==============================] - 0s 56ms/step - loss: 0.2573 - sparse_categorical_accuracy: 0.0000e+00 - val_loss: 0.2663 - val_sparse_categorical_accuracy: 0.0000e+00
Epoch 3/500
2/2 [==============================] - 0s 47ms/step - loss: 0.2570 - sparse_categorical_accuracy: 0.0000e+00 - val_loss: 0.2661 - val_sparse_categorical_accuracy: 0.0000e+00
Epoch 4/500
2/2 [==============================] - 0s 56ms/step - loss: 0.2568 - sparse_categorical_accuracy: 0.0000e+00 - val_loss: 0.2659 - val_sparse_categorical_accuracy: 0.0000e+00
Epoch 5/500
2/2 [==============================] - 0s 30ms/step - loss: 0.2566 - sparse_categorical_accuracy: 0.0000e+00 - val_loss: 0.2656 - val_sparse_categorical_accuracy: 0.0000e+00
Epoch 6/500
2/2 [==============================] - 0s 53ms/step - loss: 0.2565 - sparse_categorical_accuracy: 0.0000e+00 - val_loss: 0.2653 - val_sparse_categorical_accuracy: 0.0000e+00
... (training log for the intermediate epochs omitted) ...
2/2 [==============================] - 0s 53ms/step - loss: 0.2484 - sparse_categorical_accuracy: 0.0000e+00 - val_loss: 0.2521 - val_sparse_categorical_accuracy: 0.0000e+00
Epoch 499/500
2/2 [==============================] - 0s 25ms/step - loss: 0.2484 - sparse_categorical_accuracy: 0.0000e+00 - val_loss: 0.2520 - val_sparse_categorical_accuracy: 0.0000e+00
Epoch 500/500
2/2 [==============================] - 0s 54ms/step - loss: 0.2484 - sparse_categorical_accuracy: 0.0000e+00 - val_loss: 0.2520 - val_sparse_categorical_accuracy: 0.0000e+00
Model: "sequential"
_________________________________________________________________
Layer (type) Output Shape Param #
=================================================================
dense (Dense) (None, 1) 7
dense_1 (Dense) (None, 1) 2
=================================================================
Total params: 9
Trainable params: 9
Non-trainable params: 0
_________________________________________________________________
import matplotlib.pyplot as plt
# Show which series were recorded during training.
print(history.history.keys())

# Draw the training and validation loss curves on the same axes; any other
# series recorded in history.history are not plotted.
for key, label in (("loss", "Training Loss"), ("val_loss", "Validation Loss")):
    plt.plot(history.history[key], label=label)
plt.legend()
plt.show()
dict_keys(['loss', 'sparse_categorical_accuracy', 'val_loss', 'val_sparse_categorical_accuracy'])
# Predict for one sample given as (age, sex, bmi, children, smoker, region).
# The network was fitted on standardised features, so the raw values must be
# standardised with the same per-column mean/std before being fed to it;
# passing raw values yields a meaningless prediction.
_features = insurenceData.drop(columns='charges')
_sample = pd.DataFrame([[19, 0, 27.900, 0, 1, 1]], columns=_features.columns)
pY = net.predict(((_sample - _features.mean()) / _features.std()).to_numpy())
array([[11.344333]], dtype=float32)
# Un-normalised 'charges' column; its mean and std are what the
# de-normalisation step needs to map predictions back to the original scale.
oriY = insurenceData['charges']
count 1338.000000
mean 13270.422265
std 12110.011237
min 1121.873900
25% 4740.287150
50% 9382.033000
75% 16639.912515
max 63770.428010
Name: charges, dtype: float64
# De-normalise: map the prediction back to the original 'charges' scale
# using the mean and standard deviation of the raw charges.
de_mean_norm(pY, oriY)
array([[150650.42]], dtype=float32)