banner
RustyNail

RustyNail

coder. 【blog】https://rustynail.me 【nostr】wss://ts.relays.world/ wss://relays.world/nostr

【调包侠的机器学习】医疗保险线性回归

# https://www.kaggle.com/datasets/mirichoi0218/insurance

import numpy as np 
import tensorflow as tf
import pandas as pd
def mean_norm(df_input): #@save
    """Standardize every column of ``df_input`` to a z-score.

    Each column is shifted by its own mean and then divided by its own
    standard deviation, as returned by the column's ``mean()`` and
    ``std()`` methods.

    Args:
        df_input: Table whose ``apply(func, axis=0)`` passes ``func`` one
            column at a time (e.g. a pandas DataFrame).

    Returns:
        Whatever ``df_input.apply`` builds from the normalized columns;
        ``df_input`` itself is not modified.
    """
    def _z_score(column):
        centered = column - column.mean()
        return centered / column.std()

    return df_input.apply(_z_score, axis=0)
def de_mean_norm(result, df_input:pd.DataFrame): #@save
    """Map a normalized value back onto the scale of ``df_input``.

    This is the inverse of the z-score transform: the value is multiplied
    by the standard deviation of ``df_input`` and its mean is added back.

    Args:
        result: Normalized value(s) to convert back (scalar or array-like
            that supports ``*`` and ``+`` with the statistics below).
        df_input: Original, un-normalized data providing ``std()`` and
            ``mean()``.

    Returns:
        ``result`` rescaled with the statistics of ``df_input``.
    """
    scale = df_input.std()
    offset = df_input.mean()
    return result * scale + offset
# Load the insurance dataset from a CSV file located relative to the working
# directory, then preview the first 10 rows.
insurenceData = pd.read_csv("../data/insurance.csv")
insurenceData.head(10)
agesexbmichildrensmokerregioncharges
019female27.9000yessouthwest16884.92400
118male33.7701nosoutheast1725.55230
228male33.0003nosoutheast4449.46200
333male22.7050nonorthwest21984.47061
432male28.8800nonorthwest3866.85520
531female25.7400nosoutheast3756.62160
646female33.4401nosoutheast8240.58960
737female27.7403nonorthwest7281.50560
837male29.8302nonortheast6406.41070
960female25.8400nonorthwest28923.13692
# Encode the categorical 'sex' column as integers (female -> 0, male -> 1).
# Series.map leaves NaN for any value that is not a key of the mapping.
sex_mapping = {'female':0, 'male':1}
insurenceData['sex'] =insurenceData['sex'].map(sex_mapping)
# Preview the first 10 rows to check the encoded column.
insurenceData.head(10)
agesexbmichildrensmokerregioncharges
019027.9000yessouthwest16884.92400
118133.7701nosoutheast1725.55230
228133.0003nosoutheast4449.46200
333122.7050nonorthwest21984.47061
432128.8800nonorthwest3866.85520
531025.7400nosoutheast3756.62160
646033.4401nosoutheast8240.58960
737027.7403nonorthwest7281.50560
837129.8302nonortheast6406.41070
960025.8400nonorthwest28923.13692
# Encode the categorical 'region' column as integer codes 0..3.
# NOTE(review): these codes are later used as an ordinary numeric feature,
# which implies an ordering between regions; one-hot encoding may be a
# better fit for a linear model -- confirm intent.
# insurenceData['region'].value_counts()
region_mapping = {'southeast':0, 'southwest':1, "northwest": 2, "northeast": 3}
insurenceData.region =insurenceData.region.map(region_mapping)
# Preview the first 10 rows to check the encoded column.
insurenceData.head(10)
agesexbmichildrensmokerregioncharges
019027.9000yes116884.92400
118133.7701no01725.55230
228133.0003no04449.46200
333122.7050no221984.47061
432128.8800no23866.85520
531025.7400no03756.62160
646033.4401no08240.58960
737027.7403no27281.50560
837129.8302no36406.41070
960025.8400no228923.13692
# Encode the categorical 'smoker' column as integers (yes -> 1, no -> 0).
# insurenceData['smoker'].value_counts()
smoker_mapping = {"yes": 1, "no": 0}
insurenceData.smoker =insurenceData.smoker.map(smoker_mapping)
# Preview the first 100 rows; every column is numeric from here on.
insurenceData.head(100)
agesexbmichildrensmokerregioncharges
019027.90001116884.92400
118133.7701001725.55230
228133.0003004449.46200
333122.70500221984.47061
432128.8800023866.85520
........................
9528037.6201003766.88380
9654030.80030112105.32000
9755138.28000010226.28420
9856119.95001322412.64850
9938119.30001115820.69900

100 rows × 7 columns

# Standardize every column (features and the 'charges' target alike) with
# mean_norm, i.e. subtract the column mean and divide by the column std.
rData = mean_norm(insurenceData)
# rData = insurenceData
# print(rData)
# Target: the standardized 'charges' column.
Y = rData['charges']

print('1charges: ', Y)
# Features: every standardized column except the target.
x = rData.drop(columns='charges')
X = tf.convert_to_tensor(x)
print('charges: ', Y)
y = tf.convert_to_tensor(Y)
# Show the feature tensor and the target tensor that are passed to training.
print(X,y)
1charges:  0       0.298472
1      -0.953333
2      -0.728402
3       0.719574
4      -0.776512
          ...   
1333   -0.220468
1334   -0.913661
1335   -0.961237
1336   -0.930014
1337    1.310563
Name: charges, Length: 1338, dtype: float64
charges:  0       0.298472
1      -0.953333
2      -0.728402
3       0.719574
4      -0.776512
          ...   
1333   -0.220468
1334   -0.913661
1335   -0.961237
1336   -0.930014
1337    1.310563
Name: charges, Length: 1338, dtype: float64
tf.Tensor(
[[-1.4382265  -1.010141   -0.45315057 -0.90827406  1.9698501  -0.40272369]
 [-1.50940108  0.98922092  0.50943062 -0.07873775 -0.50727343 -1.2875255 ]
 [-0.7976553   0.98922092  0.38316358  1.58033487 -0.50727343 -1.2875255 ]
 ...
 [-1.50940108 -1.010141    1.01449877 -0.90827406 -0.50727343 -1.2875255 ]
 [-1.29587735 -1.010141   -0.79751522 -0.90827406 -0.50727343 -0.40272369]
 [ 1.55110577 -1.010141   -0.26129026 -0.90827406  1.9698501   0.48207812]], shape=(1338, 6), dtype=float64) tf.Tensor(
[ 0.2984722  -0.95333272 -0.72840232 ... -0.96123683 -0.93001377
  1.31056344], shape=(1338,), dtype=float64)
# Build the regression network: 6 input features -> 1 unit -> 1 output unit.
net = tf.keras.Sequential()
net.add(tf.keras.layers.Dense(units=1, input_dim=6))
initializer = tf.initializers.RandomNormal(stddev=0.1)
# NOTE(review): neither Dense layer sets an activation, so the two layers
# together still represent a single linear map of the inputs.
net.add(tf.keras.layers.Dense(1, kernel_initializer=initializer))
opt = tf.keras.optimizers.Adam(learning_rate=0.001)
# The target is a continuous value, so a classification metric such as
# SparseCategoricalAccuracy is meaningless here (it is logged as 0 on every
# epoch). Track mean absolute error next to the MSE loss instead.
net.compile(optimizer=opt, loss=tf.keras.losses.MeanSquaredError(), metrics=[tf.keras.metrics.MeanAbsoluteError()])
# Hold out the last 20% of the samples for validation; shuffle the rest each epoch.
history = net.fit(X, y, batch_size=1000, epochs=500, validation_split=0.2, callbacks=[], shuffle=True)
net.summary()
Epoch 1/500
2/2 [==============================] - 0s 78ms/step - loss: 0.2575 - sparse_categorical_accuracy: 0.0000e+00 - val_loss: 0.2666 - val_sparse_categorical_accuracy: 0.0000e+00
Epoch 2/500
2/2 [==============================] - 0s 56ms/step - loss: 0.2573 - sparse_categorical_accuracy: 0.0000e+00 - val_loss: 0.2663 - val_sparse_categorical_accuracy: 0.0000e+00
Epoch 3/500
2/2 [==============================] - 0s 47ms/step - loss: 0.2570 - sparse_categorical_accuracy: 0.0000e+00 - val_loss: 0.2661 - val_sparse_categorical_accuracy: 0.0000e+00
Epoch 4/500
2/2 [==============================] - 0s 56ms/step - loss: 0.2568 - sparse_categorical_accuracy: 0.0000e+00 - val_loss: 0.2659 - val_sparse_categorical_accuracy: 0.0000e+00
Epoch 5/500
2/2 [==============================] - 0s 30ms/step - loss: 0.2566 - sparse_categorical_accuracy: 0.0000e+00 - val_loss: 0.2656 - val_sparse_categorical_accuracy: 0.0000e+00
Epoch 6/500
2/2 [==============================] - 0s 53ms/step - loss: 0.2565 - sparse_categorical_accuracy: 0.0000e+00 - val_loss: 0.2653 - val_sparse_categorical_accuracy: 0.0000e+00

。。。。。。。。。。。。。。。。。。。。。

2/2 [==============================] - 0s 53ms/step - loss: 0.2484 - sparse_categorical_accuracy: 0.0000e+00 - val_loss: 0.2521 - val_sparse_categorical_accuracy: 0.0000e+00
Epoch 499/500
2/2 [==============================] - 0s 25ms/step - loss: 0.2484 - sparse_categorical_accuracy: 0.0000e+00 - val_loss: 0.2520 - val_sparse_categorical_accuracy: 0.0000e+00
Epoch 500/500
2/2 [==============================] - 0s 54ms/step - loss: 0.2484 - sparse_categorical_accuracy: 0.0000e+00 - val_loss: 0.2520 - val_sparse_categorical_accuracy: 0.0000e+00
Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
=================================================================
 dense (Dense)               (None, 1)                 7         
                                                                 
 dense_1 (Dense)             (None, 1)                 2         
                                                                 
=================================================================
Total params: 9
Trainable params: 9
Non-trainable params: 0
_________________________________________________________________
import matplotlib.pyplot as plt

# List which series were recorded during training.
print(history.history.keys())
# Draw the training and validation loss curves on the same axes; any other
# series listed above can be plotted the same way.
for curve_key, curve_label in (("loss", "Training Loss"), ("val_loss", "Validation Loss")):
    plt.plot(history.history[curve_key], label=curve_label)
plt.legend()
plt.show()
dict_keys(['loss', 'sparse_categorical_accuracy', 'val_loss', 'val_sparse_categorical_accuracy'])



png

# The network was trained on standardized features, so a raw sample has to be
# standardized with the same per-column mean/std before it is passed to
# predict(); feeding raw values yields a prediction far outside the data range.
feature_frame = insurenceData.drop(columns='charges')
raw_sample = pd.DataFrame([[19, 0, 27.900, 0, 1, 1]], columns=feature_frame.columns)
norm_sample = (raw_sample - feature_frame.mean()) / feature_frame.std()
# pY is the prediction on the standardized 'charges' scale.
pY = net.predict(norm_sample.to_numpy())
pY
array([[11.344333]], dtype=float32)
# Take the original (un-normalized) 'charges' column and show its summary
# statistics; its mean and std are what de-normalization relies on.
oriY = insurenceData['charges']
oriY.describe()
count     1338.000000
mean     13270.422265
std      12110.011237
min       1121.873900
25%       4740.287150
50%       9382.033000
75%      16639.912515
max      63770.428010
Name: charges, dtype: float64
# Undo the normalization: convert the model output back to the original
# 'charges' scale using the mean and std of the raw charges column.
de_mean_norm(pY, oriY)
array([[150650.42]], dtype=float32)
Loading...
Ownership of this post data is guaranteed by blockchain and smart contracts to the creator alone.