A character-level text generator trained on chat records from group friends.
# !pip install pymysql
# !pip install matplotlib
import tensorflow as tf
import numpy as np
import pymysql
import pandas as pd
import matplotlib.pyplot as plt
SENT_LENGTH = 1024
# Connection parameters and the SQL query were redacted in the original;
# the values below are placeholders, not the real credentials or schema.
conn = pymysql.connect(host='localhost', user='root', password='***',
                       database='qqmsg', charset='utf8mb4')
cs = conn.cursor()
cs.execute("SELECT content FROM messages")  # hypothetical table/column names
all_data = cs.fetchall()
all_data = [a[0] for a in all_data]
all_data[0]
'This u speed is too slow'
def padding(origin, ending, max_len):
    """Truncate origin to max_len, or pad it with `ending` up to max_len."""
    if len(origin) >= max_len:
        return origin[:max_len]
    for _ in range(max_len - len(origin)):
        origin.append(ending)
    return origin
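A quick illustration of padding (not part of the original run):
padding(list('hi'), '<pad>', 5)
['h', 'i', '<pad>', '<pad>', '<pad>']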
!pip install jieba
Looking in indexes: https://pypi.tuna.tsinghua.edu.cn/simple
import jieba  # word segmentation; installed but unused, since the model below works at the character level
# Dictionary of all characters
all_char = set(''.join(all_data))
# encode to id
ids_from_chars = tf.keras.layers.StringLookup(vocabulary=list(all_char), mask_token=None)
# decode to char
chars_from_ids = tf.keras.layers.StringLookup(vocabulary=ids_from_chars.get_vocabulary(), invert=True, mask_token=None)
# encode the entire corpus to integer ids
all_data_num = ids_from_chars(list(''.join(all_data)))
all_data_num.shape, len(list(''.join(all_data)))
(TensorShape([147119]), 147119)
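As a quick round-trip check (illustrative, not part of the original run; '淦' is a character known to occur in the corpus), the two lookup layers invert each other:
sample_ids = ids_from_chars(['淦'])   # char -> integer id
chars_from_ids(sample_ids).numpy()    # -> bytes that decode back to '淦'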
## Convert the ids to a tf.data.Dataset
ids_dataset = tf.data.Dataset.from_tensor_slices(all_data_num)
for ids in ids_dataset.take(10):
    print(chars_from_ids(ids).numpy().decode('utf-8'))
This
u
speed
is
too
slow
淦
# length of each training sequence
seq_length = 100
sequences = ids_dataset.batch(seq_length+1, drop_remainder=True)
# for seq in sequences.take(1):
#     for c in chars_from_ids(seq).numpy():
#         print(c.decode('utf-8'))
# generate x,y
def split_input_target(sequence):
    input_text = sequence[:-1]
    target_text = sequence[1:]
    return input_text, target_text
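For example, mirroring the sanity check from the TensorFlow text-generation tutorial:
split_input_target(list("Tensorflow"))
(['T', 'e', 'n', 's', 'o', 'r', 'f', 'l', 'o'], ['e', 'n', 's', 'o', 'r', 'f', 'l', 'o', 'w'])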
dataset = sequences.map(split_input_target)
def text_from_ids(ids):
    # join the decoded characters back into a single string tensor
    return tf.strings.reduce_join(chars_from_ids(ids), axis=-1)
for input_example, target_example in dataset.take(1):
    print("Input :", text_from_ids(input_example).numpy().decode("utf-8"))
    print("Target:", text_from_ids(target_example).numpy().decode("utf-8"))
Input : This u speed is too slow 淦 I found sudo-prompt, it's okay. But this library is too garbage, it doesn't return that process object after executing the command. What are you doing? A client code electron
??? Let the barbecue guy write you a new library, my vscode is also carrying the barbershop
Target: u speed is too slow 淦 I found sudo-prompt, it's okay. But this library is too garbage, it doesn't return that process object after executing the command. What are you doing? A client code electron
??? Let the barbecue guy write you a new library, my vscode is also carrying the barbershop now
# Batch size
BATCH_SIZE = 256
# Buffer size to shuffle the dataset
# (TF data is designed to work with possibly infinite sequences,
# so it doesn't attempt to shuffle the entire sequence in memory. Instead,
# it maintains a buffer in which it shuffles elements).
BUFFER_SIZE = 1000
dataset = (
    dataset
    .shuffle(BUFFER_SIZE)
    .batch(BATCH_SIZE, drop_remainder=True)
    .prefetch(tf.data.experimental.AUTOTUNE))
dataset
<PrefetchDataset element_spec=(TensorSpec(shape=(256, 100), dtype=tf.int64, name=None), TensorSpec(shape=(256, 100), dtype=tf.int64, name=None))>
# Length of the vocabulary in StringLookup Layer
vocab_size = len(ids_from_chars.get_vocabulary())
# The embedding dimension
embedding_dim = 256
# Number of RNN units
rnn_units = 1024
# Model
class MyModel(tf.keras.Model):
def __init__(self, vocab_size, embedding_dim, rnn_units):
super().__init__(self)
# Dictionary size
self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
# GRU network
# self.gru = tf.keras.layers.GRU(rnn_units,
# return_sequences=True,
# return_state=True)
self.rnn = tf.keras.layers.SimpleRNN(rnn_units, return_sequences=True, return_state=True)
# Output is the size of the dictionary space
self.dense = tf.keras.layers.Dense(vocab_size)
def call(self, inputs, states=None, return_state=False, training=False):
x = inputs
x = self.embedding(x, training=training)
if states is None:
states = self.rnn.get_initial_state(x)
# states = self.gru.get_initial_state(x)
x, states = self.rnn(x, initial_state=states, training=training)
# x, states = self.gru(x, initial_state=states, training=training)
x = self.dense(x, training=training)
if return_state:
return x, states
else:
return x
model = MyModel(
    vocab_size=vocab_size,
    embedding_dim=embedding_dim,
    rnn_units=rnn_units)
for input_example_batch, target_example_batch in dataset.take(1):
    example_batch_predictions = model(input_example_batch)
    print(example_batch_predictions.shape, "# (batch_size, sequence_length, vocab_size)")
(256, 100, 2707) # (batch_size, sequence_length, vocab_size)
model.summary()
Model: "my_model_1"
_________________________________________________________________
Layer (type) Output Shape Param #
=================================================================
embedding_1 (Embedding) multiple 692992
simple_rnn_1 (SimpleRNN) multiple 1311744
dense_1 (Dense) multiple 2774675
=================================================================
Total params: 4,779,411
Trainable params: 4,779,411
Non-trainable params: 0
_________________________________________________________________
sampled_indices = tf.random.categorical(example_batch_predictions[0], num_samples=1)
sampled_indices = tf.squeeze(sampled_indices, axis=-1).numpy()
sampled_indices
array([ 196, 1635, 858, 1146, 2427, 2403, 1519, 1179, 1108, 2644, 220,
647, 320, 2198, 2584, 877, 2240, 465, 2452, 443, 368, 128,
617, 2263, 401, 2111, 1505, 1328, 2615, 1895, 31, 440, 315,
566, 2298, 2527, 1890, 2498, 2412, 1971, 296, 1594, 458, 2343,
948, 2544, 1103, 668, 1156, 289, 406, 2270, 1455, 1187, 2687,
873, 1899, 929, 2706, 2385, 1935, 160, 197, 258, 1187, 2703,
1585, 2018, 210, 451, 857, 97, 76, 1130, 2286, 549, 2618,
375, 735, 48, 1930, 897, 2428, 2261, 1117, 696, 300, 720,
1159, 2628, 569, 1215, 145, 537, 1668, 795, 205, 2141, 2254,
1568], dtype=int64)
print("Input:\n", text_from_ids(input_example_batch[0]).numpy().decode("utf-8"))
print()
print("Next Char Predictions:\n", text_from_ids(sampled_indices).numpy().decode("utf-8"))
Input:
There are some reviews to be modified, just go in and agree with the reviewers, it's uncomfortable for those of us who write code. Forget it, this is not a big deal, a small problem, a few hundred lines really haven't appeared much else, just follow your own habits. There are some specifications that I also find unsightly, there is one I borrowed and remember called
Next Char Predictions:
岗斧丑绿球估擅羊怪届Q跪丫-磁概⢴直慰淫滑[R宇咩烤接虫⠔悬婆绵鹿龙营浆扩赤铺领渍遍生园硫傅渠W踢☕走绷赌件久艳着娱真E界们撅镇件肤乃原晨万旅佬男澡5苗闯句鸣填画闻使结减禁瞧陪圭石豪背不碳爪隐猫助摊;
loss = tf.losses.SparseCategoricalCrossentropy(from_logits=True)
example_batch_mean_loss = loss(target_example_batch, example_batch_predictions)
print("Prediction shape: ", example_batch_predictions.shape, " # (batch_size, sequence_length, vocab_size)")
print("Mean loss: ", example_batch_mean_loss)
Prediction shape: (256, 100, 2707) # (batch_size, sequence_length, vocab_size)
Mean loss: tf.Tensor(7.906991, shape=(), dtype=float32)
tf.exp(example_batch_mean_loss).numpy()  # e^loss ≈ vocab_size (2707): the untrained model is roughly uniform over characters
2716.205
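Cross-check (not in the original notebook): a model that assigns uniform probability over the vocabulary has mean loss ln(vocab_size), which matches the value above:
np.log(vocab_size)  # ln(2707) ≈ 7.904, close to the measured 7.907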
opt = tf.keras.optimizers.Adam(0.001)
model.compile(optimizer=opt, loss=loss, metrics=['accuracy'])
import os
# Directory where the checkpoints will be saved
checkpoint_dir = './qqmsg_rnn_training_checkpoints'
# Name of the checkpoint files
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt_{epoch}")
checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_prefix,
    save_weights_only=True)
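The callback saves weights only, so restoring later requires the model object. Not run here, but resuming from the newest checkpoint should look like this (standard tf.train API; the model must have been built by calling it once before load_weights):
latest = tf.train.latest_checkpoint(checkpoint_dir)
model.load_weights(latest)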
EPOCHS = 200
history = model.fit(dataset, epochs=EPOCHS,
                    callbacks=[checkpoint_callback])
Epoch 1/200
5/5 [==============================] - 1s 150ms/step - loss: 4.4653 - accuracy: 0.2164
Epoch 2/200
5/5 [==============================] - 1s 147ms/step - loss: 4.4367 - accuracy: 0.2187
...
Epoch 199/200
5/5 [==============================] - 1s 145ms/step - loss: 0.5430 - accuracy: 0.9156
Epoch 200/200
5/5 [==============================] - 1s 152ms/step - loss: 0.5547 - accuracy: 0.9111
import matplotlib.pyplot as plt
print(history.history.keys())
plt.plot(history.history["loss"], label="Training Loss")
plt.plot(history.history["accuracy"], label="accuracy")
# plt.plot(history.history["val_loss"], label="val_loss")
# plt.plot(history.history["val_accuracy"], label="val_accuracy")
# plt.plot(history.history["sparse_categorical_accuracy"], label="sparse_categorical_accuracy")
# plt.plot(history.history["val_sparse_categorical_accuracy"], label="val_sparse_categorical_accuracy")
plt.legend()
plt.show()
dict_keys(['loss', 'accuracy'])
class OneStep(tf.keras.Model):
    def __init__(self, model, chars_from_ids, ids_from_chars, temperature=1.0):
        super().__init__()
        self.temperature = temperature
        self.model = model
        self.chars_from_ids = chars_from_ids
        self.ids_from_chars = ids_from_chars
        # Create a mask to prevent "[UNK]" from being generated.
        skip_ids = self.ids_from_chars(['[UNK]'])[:, None]
        sparse_mask = tf.SparseTensor(
            # Put a -inf at each bad index.
            values=[-float('inf')]*len(skip_ids),
            indices=skip_ids,
            # Match the shape to the vocabulary
            dense_shape=[len(ids_from_chars.get_vocabulary())])
        self.prediction_mask = tf.sparse.to_dense(sparse_mask)

    @tf.function
    def generate_one_step(self, inputs, states=None):
        # Convert strings to token IDs.
        input_chars = tf.strings.unicode_split(inputs, 'UTF-8')
        input_ids = self.ids_from_chars(input_chars).to_tensor()
        # Run the model.
        # predicted_logits.shape is [batch, char, next_char_logits]
        predicted_logits, states = self.model(inputs=input_ids, states=states,
                                              return_state=True)
        # Only use the last prediction.
        predicted_logits = predicted_logits[:, -1, :]
        predicted_logits = predicted_logits/self.temperature
        # Apply the prediction mask: prevent "[UNK]" from being generated.
        predicted_logits = predicted_logits + self.prediction_mask
        # Sample the output logits to generate token IDs.
        predicted_ids = tf.random.categorical(predicted_logits, num_samples=1)
        predicted_ids = tf.squeeze(predicted_ids, axis=-1)
        # Convert from token ids to characters
        predicted_chars = self.chars_from_ids(predicted_ids)
        # Return the characters and model state.
        return predicted_chars, states
one_step_model = OneStep(model, chars_from_ids, ids_from_chars)
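temperature=1.0 samples from the raw logits; lower values sharpen the distribution toward the most likely next character, higher values flatten it. A more conservative variant to try (illustrative, not from the original run):
cautious_model = OneStep(model, chars_from_ids, ids_from_chars, temperature=0.5)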
import time
start = time.time()
states = None
next_char = tf.constant(["Rich woman", "Customer service"])
result = [next_char]
for n in range(50):
    next_char, states = one_step_model.generate_one_step(next_char, states=states)
    result.append(next_char)
result = tf.strings.join(result)
end = time.time()
print(result[0].numpy().decode('utf-8'), '\n\n' + '_'*80)
print('\nRun time:', end - start)
Rich woman, want to touch you, send you more than 100 generated products received what actual products to do
Look at the work that has come, look at all the native d+10 China
________________________________________________________________________________
Run time: 0.16722774505615234
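To reuse the generator without retraining, the one-step model can be exported and reloaded via tf.saved_model; this is a sketch of the standard pattern, and the directory name is a placeholder:
tf.saved_model.save(one_step_model, 'qqmsg_one_step')
one_step_reloaded = tf.saved_model.load('qqmsg_one_step')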