【调包侠的机器学习】医疗保险线性回归

# https://www.kaggle.com/datasets/mirichoi0218/insurance

import numpy as np 
import tensorflow as tf
import pandas as pd

def mean_norm(df_input): #@save
    """Standardize each column of a DataFrame to z-scores.

    Every column is shifted by its own mean and divided by its own standard
    deviation (as computed by pandas' ``Series.std``).

    Args:
        df_input: DataFrame expected to contain only numeric columns.

    Returns:
        A DataFrame with the same row and column labels holding the
        standardized values.
    """
    def _standardize(column):
        centered = column - column.mean()
        return centered / column.std()

    return df_input.apply(_standardize, axis=0)

def de_mean_norm(result, df_input:pd.DataFrame): #@save
    """Map standardized values back onto the scale of ``df_input``.

    Inverts a z-score transform: the value is multiplied by the standard
    deviation of ``df_input`` and the mean of ``df_input`` is added.

    Args:
        result: Standardized value(s), e.g. a model prediction.
        df_input: Un-normalized data supplying the statistics. Annotated as a
            DataFrame, but only its ``std()`` and ``mean()`` methods are used.

    Returns:
        ``result`` rescaled with the statistics of ``df_input``.
    """
    spread = df_input.std()
    centre = df_input.mean()
    return result * spread + centre

# Load the Kaggle insurance dataset from a CSV file; the path is relative to the working directory.
insurenceData = pd.read_csv("../data/insurance.csv")
# Preview the first 10 rows.
insurenceData.head(10)

	age	sex	bmi	children	smoker	region	charges
0	19	female	27.900	0	yes	southwest	16884.92400
1	18	male	33.770	1	no	southeast	1725.55230
2	28	male	33.000	3	no	southeast	4449.46200
3	33	male	22.705	0	no	northwest	21984.47061
4	32	male	28.880	0	no	northwest	3866.85520
5	31	female	25.740	0	no	southeast	3756.62160
6	46	female	33.440	1	no	southeast	8240.58960
7	37	female	27.740	3	no	northwest	7281.50560
8	37	male	29.830	2	no	northeast	6406.41070
9	60	female	25.840	0	no	northwest	28923.13692

# Encode the textual `sex` column as integers: female -> 0, male -> 1.
sex_mapping = dict(female=0, male=1)
insurenceData["sex"] = insurenceData["sex"].map(sex_mapping)

# Preview the first 10 rows after the encoding.
insurenceData.head(10)

	age	sex	bmi	children	smoker	region	charges
0	19	0	27.900	0	yes	southwest	16884.92400
1	18	1	33.770	1	no	southeast	1725.55230
2	28	1	33.000	3	no	southeast	4449.46200
3	33	1	22.705	0	no	northwest	21984.47061
4	32	1	28.880	0	no	northwest	3866.85520
5	31	0	25.740	0	no	southeast	3756.62160
6	46	0	33.440	1	no	southeast	8240.58960
7	37	0	27.740	3	no	northwest	7281.50560
8	37	1	29.830	2	no	northeast	6406.41070
9	60	0	25.840	0	no	northwest	28923.13692

# Encode the four region names as integer codes
# (insurenceData["region"].value_counts() lists the distinct values).
# NOTE(review): the codes are arbitrary labels, yet a linear model will read
# them as an ordered numeric scale.
region_mapping = dict(southeast=0, southwest=1, northwest=2, northeast=3)
insurenceData["region"] = insurenceData["region"].map(region_mapping)
insurenceData.head(10)

	age	sex	bmi	children	smoker	region	charges
0	19	0	27.900	0	yes	1	16884.92400
1	18	1	33.770	1	no	0	1725.55230
2	28	1	33.000	3	no	0	4449.46200
3	33	1	22.705	0	no	2	21984.47061
4	32	1	28.880	0	no	2	3866.85520
5	31	0	25.740	0	no	0	3756.62160
6	46	0	33.440	1	no	0	8240.58960
7	37	0	27.740	3	no	2	7281.50560
8	37	1	29.830	2	no	3	6406.41070
9	60	0	25.840	0	no	2	28923.13692

# Encode the smoker flag as an integer: yes -> 1, no -> 0
# (insurenceData["smoker"].value_counts() lists the distinct values).
smoker_mapping = dict(yes=1, no=0)
insurenceData["smoker"] = insurenceData["smoker"].map(smoker_mapping)
# Preview up to 100 rows of the fully encoded frame.
insurenceData.head(100)

	age	sex	bmi	children	smoker	region	charges
0	19	0	27.900	0	1	1	16884.92400
1	18	1	33.770	1	0	0	1725.55230
2	28	1	33.000	3	0	0	4449.46200
3	33	1	22.705	0	0	2	21984.47061
4	32	1	28.880	0	0	2	3866.85520
...	...	...	...	...	...	...	...
95	28	0	37.620	1	0	0	3766.88380
96	54	0	30.800	3	0	1	12105.32000
97	55	1	38.280	0	0	0	10226.28420
98	56	1	19.950	0	1	3	22412.64850
99	38	1	19.300	0	1	1	15820.69900

100 rows × 7 columns

# Standardize every column (features and target alike) with mean_norm.
rData = mean_norm(insurenceData)

# Split the standardized frame into the target column and the feature columns.
Y = rData["charges"]
x = rData.drop(columns="charges")

print('1charges: ', Y)
# Convert features and target into TensorFlow tensors for training.
X = tf.convert_to_tensor(x)
print('charges: ', Y)
y = tf.convert_to_tensor(Y)
print(X, y)

1charges:  0       0.298472
1      -0.953333
2      -0.728402
3       0.719574
4      -0.776512
          ...   
1333   -0.220468
1334   -0.913661
1335   -0.961237
1336   -0.930014
1337    1.310563
Name: charges, Length: 1338, dtype: float64
charges:  0       0.298472
1      -0.953333
2      -0.728402
3       0.719574
4      -0.776512
          ...   
1333   -0.220468
1334   -0.913661
1335   -0.961237
1336   -0.930014
1337    1.310563
Name: charges, Length: 1338, dtype: float64
tf.Tensor(
[[-1.4382265  -1.010141   -0.45315057 -0.90827406  1.9698501  -0.40272369]
 [-1.50940108  0.98922092  0.50943062 -0.07873775 -0.50727343 -1.2875255 ]
 [-0.7976553   0.98922092  0.38316358  1.58033487 -0.50727343 -1.2875255 ]
 ...
 [-1.50940108 -1.010141    1.01449877 -0.90827406 -0.50727343 -1.2875255 ]
 [-1.29587735 -1.010141   -0.79751522 -0.90827406 -0.50727343 -0.40272369]
 [ 1.55110577 -1.010141   -0.26129026 -0.90827406  1.9698501   0.48207812]], shape=(1338, 6), dtype=float64) tf.Tensor(
[ 0.2984722  -0.95333272 -0.72840232 ... -0.96123683 -0.93001377
  1.31056344], shape=(1338,), dtype=float64)

# Build the regression model: 6 input features -> 1 predicted value.
net = tf.keras.Sequential()
net.add(tf.keras.layers.Dense(units=1, input_dim=6))
# Second Dense(1) layer, also without activation; two stacked linear layers
# still form a linear map overall.
initializer = tf.initializers.RandomNormal(stddev=0.1)
net.add(tf.keras.layers.Dense(1, kernel_initializer=initializer))
opt = tf.keras.optimizers.Adam(learning_rate=0.001)
# The target is continuous, so report mean absolute error alongside the MSE
# loss. A categorical-accuracy metric compares class indices and carries no
# information for a regression target.
net.compile(
    optimizer=opt,
    loss=tf.keras.losses.MeanSquaredError(),
    metrics=[tf.keras.metrics.MeanAbsoluteError()],
)

# Train for 500 epochs with 20% of the samples held out for validation.
history = net.fit(X, y, batch_size=1000, epochs=500, validation_split=0.2, shuffle=True)
net.summary()

Epoch 1/500
2/2 [==============================] - 0s 78ms/step - loss: 0.2575 - sparse_categorical_accuracy: 0.0000e+00 - val_loss: 0.2666 - val_sparse_categorical_accuracy: 0.0000e+00
Epoch 2/500
2/2 [==============================] - 0s 56ms/step - loss: 0.2573 - sparse_categorical_accuracy: 0.0000e+00 - val_loss: 0.2663 - val_sparse_categorical_accuracy: 0.0000e+00
Epoch 3/500
2/2 [==============================] - 0s 47ms/step - loss: 0.2570 - sparse_categorical_accuracy: 0.0000e+00 - val_loss: 0.2661 - val_sparse_categorical_accuracy: 0.0000e+00
Epoch 4/500
2/2 [==============================] - 0s 56ms/step - loss: 0.2568 - sparse_categorical_accuracy: 0.0000e+00 - val_loss: 0.2659 - val_sparse_categorical_accuracy: 0.0000e+00
Epoch 5/500
2/2 [==============================] - 0s 30ms/step - loss: 0.2566 - sparse_categorical_accuracy: 0.0000e+00 - val_loss: 0.2656 - val_sparse_categorical_accuracy: 0.0000e+00
Epoch 6/500
2/2 [==============================] - 0s 53ms/step - loss: 0.2565 - sparse_categorical_accuracy: 0.0000e+00 - val_loss: 0.2653 - val_sparse_categorical_accuracy: 0.0000e+00

。。。。。。。。。。。。。。。。。。。。。

2/2 [==============================] - 0s 53ms/step - loss: 0.2484 - sparse_categorical_accuracy: 0.0000e+00 - val_loss: 0.2521 - val_sparse_categorical_accuracy: 0.0000e+00
Epoch 499/500
2/2 [==============================] - 0s 25ms/step - loss: 0.2484 - sparse_categorical_accuracy: 0.0000e+00 - val_loss: 0.2520 - val_sparse_categorical_accuracy: 0.0000e+00
Epoch 500/500
2/2 [==============================] - 0s 54ms/step - loss: 0.2484 - sparse_categorical_accuracy: 0.0000e+00 - val_loss: 0.2520 - val_sparse_categorical_accuracy: 0.0000e+00
Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
=================================================================
 dense (Dense)               (None, 1)                 7         
                                                                 
 dense_1 (Dense)             (None, 1)                 2         
                                                                 
=================================================================
Total params: 9
Trainable params: 9
Non-trainable params: 0
_________________________________________________________________

import matplotlib.pyplot as plt
# Show which series were recorded during training.
training_log = history.history
print(training_log.keys())

# Plot training and validation loss per epoch; any other key printed above
# can be plotted the same way.
for series_key, series_label in (("loss", "Training Loss"), ("val_loss", "Validation Loss")):
    plt.plot(training_log[series_key], label=series_label)
plt.legend()
plt.show()

dict_keys(['loss', 'sparse_categorical_accuracy', 'val_loss', 'val_sparse_categorical_accuracy'])

png

# The network was trained on z-scored features, so a raw sample has to be
# standardized with the same per-column mean/std before it is fed to the model.
# Feeding raw values would put the input far outside the training range.
feature_frame = insurenceData.drop(columns='charges')
# Column order: age, sex, bmi, children, smoker, region.
raw_sample = np.array([[19, 0, 27.900, 0, 1, 1]], dtype=np.float64)
scaled_sample = (raw_sample - feature_frame.mean().to_numpy()) / feature_frame.std().to_numpy()
# The prediction is still on the standardized charges scale.
pY = net.predict(scaled_sample)

pY

array([[11.344333]], dtype=float32)

# Keep the un-normalized target column; its mean/std are used to undo the standardization.
oriY = insurenceData['charges']

# Summary statistics of the original charges.
oriY.describe()

count     1338.000000
mean     13270.422265
std      12110.011237
min       1121.873900
25%       4740.287150
50%       9382.033000
75%      16639.912515
max      63770.428010
Name: charges, dtype: float64

# De-normalize: map the standardized prediction back to the original charges scale.
de_mean_norm(pY, oriY)

array([[150650.42]], dtype=float32)