【Package Swapper's Machine Learning】RNN Training Text Generation

Used chat records from group friends

# !pip install pymysql
# !pip install matplotlib
import tensorflow as tf
import numpy as np
import pymysql
import pandas as pd
from matplotlib.pyplot import plot

SENT_LENGTH = 1024

conn = pymysql.connect()
cs = conn.cursor()
cs.execute()
all_data = cs.fetchall()
all_data = [a[0] for a in all_data]
all_data[0]

'This u speed is too slow'

def padding(origin, endding, maxL):
    if len(origin) >= maxL:
        return origin[:maxL]
    for i in range(maxL - len(origin)):
        origin.append(endding)
    return origin

!pip install jieba

Looking in indexes: https://pypi.tuna.tsinghua.edu.cn/simple

import jieba

# Dictionary of all characters
all_char = set(''.join(all_data))
# encode to id 
ids_from_chars = tf.keras.layers.StringLookup(vocabulary=list(all_char), mask_token=None)
# decode to char
chars_from_ids = tf.keras.layers.StringLookup(vocabulary=ids_from_chars.get_vocabulary(), invert=True, mask_token=None)

# all char to all ints
all_data_num = ids_from_chars(list(''.join(all_data)))
all_data_num.shape, len(list(''.join(all_data)))

(TensorShape([147119]), 147119)

## to tensor
ids_dataset = tf.data.Dataset.from_tensor_slices(all_data_num)
for ids in ids_dataset.take(10):
    print(chars_from_ids(ids).numpy().decode('utf-8'))

This
u
speed
is
too
slow
淦

# max length
seq_length = 100

sequences = ids_dataset.batch(seq_length+1, drop_remainder=True)
# for seq in sequences.take(1):
#      for c in chars_from_ids(seq).numpy():
#             print(c.decode('utf-8'))

# generate x,y
def split_input_target(sequence):
    input_text = sequence[:-1]
    target_text = sequence[1:]
    return input_text, target_text

dataset = sequences.map(split_input_target)

def text_from_ids(ids):
    return tf.strings.reduce_join([x.numpy().decode('utf-8') for x in chars_from_ids(ids)], axis=-1)

for input_example, target_example in dataset.take(1):
    print("Input :", text_from_ids(input_example).numpy().decode("utf-8"))
    print("Target:", text_from_ids(target_example).numpy().decode("utf-8"))

Input : This u speed is too slow 淦 I found sudo-prompt, it's okay. But this library is too garbage, it doesn't return that process object after executing the command. What are you doing? A client code electron
??? Let the barbecue guy write you a new library, my vscode is also carrying the barbershop
Target: u speed is too slow 淦 I found sudo-prompt, it's okay. But this library is too garbage, it doesn't return that process object after executing the command. What are you doing? A client code electron
??? Let the barbecue guy write you a new library, my vscode is also carrying the barbershop now

# Batch size
BATCH_SIZE = 256

# Buffer size to shuffle the dataset
# (TF data is designed to work with possibly infinite sequences,
# so it doesn't attempt to shuffle the entire sequence in memory. Instead,
# it maintains a buffer in which it shuffles elements).
BUFFER_SIZE = 1000

dataset = (
    dataset
    .shuffle(BUFFER_SIZE)
    .batch(BATCH_SIZE, drop_remainder=True)
    .prefetch(tf.data.experimental.AUTOTUNE))

dataset

<PrefetchDataset element_spec=(TensorSpec(shape=(256, 100), dtype=tf.int64, name=None), TensorSpec(shape=(256, 100), dtype=tf.int64, name=None))>

# Length of the vocabulary in StringLookup Layer
vocab_size = len(ids_from_chars.get_vocabulary())

# The embedding dimension
embedding_dim = 256

# Number of RNN units
rnn_units = 1024

# Model
class MyModel(tf.keras.Model):
    def __init__(self, vocab_size, embedding_dim, rnn_units):
        super().__init__(self)
#      Dictionary size
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
#     GRU network
#         self.gru = tf.keras.layers.GRU(rnn_units,
#                                        return_sequences=True,
#                                        return_state=True)
        self.rnn = tf.keras.layers.SimpleRNN(rnn_units, return_sequences=True, return_state=True)
        # Output is the size of the dictionary space
        self.dense = tf.keras.layers.Dense(vocab_size)

    def call(self, inputs, states=None, return_state=False, training=False):
        x = inputs
        x = self.embedding(x, training=training)
        if states is None:
            states = self.rnn.get_initial_state(x)
#             states = self.gru.get_initial_state(x)
        x, states = self.rnn(x, initial_state=states, training=training)
#         x, states = self.gru(x, initial_state=states, training=training)
        x = self.dense(x, training=training)

        if return_state:
            return x, states
        else:
            return x

model = MyModel(
    vocab_size=vocab_size,
    embedding_dim=embedding_dim,
    rnn_units=rnn_units)

for input_example_batch, target_example_batch in dataset.take(1):
    example_batch_predictions = model(input_example_batch)
    print(example_batch_predictions.shape, "# (batch_size, sequence_length, vocab_size)")

(256, 100, 2707) # (batch_size, sequence_length, vocab_size)

model.summary()

Model: "my_model_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
=================================================================
 embedding_1 (Embedding)     multiple                  692992    
                                                                 
 simple_rnn_1 (SimpleRNN)    multiple                  1311744   
                                                                 
 dense_1 (Dense)             multiple                  2774675   
                                                                 
=================================================================
Total params: 4,779,411
Trainable params: 4,779,411
Non-trainable params: 0
_________________________________________________________________

sampled_indices = tf.random.categorical(example_batch_predictions[0], num_samples=1)
sampled_indices = tf.squeeze(sampled_indices, axis=-1).numpy()

sampled_indices

array([ 196, 1635,  858, 1146, 2427, 2403, 1519, 1179, 1108, 2644,  220,
        647,  320, 2198, 2584,  877, 2240,  465, 2452,  443,  368,  128,
        617, 2263,  401, 2111, 1505, 1328, 2615, 1895,   31,  440,  315,
        566, 2298, 2527, 1890, 2498, 2412, 1971,  296, 1594,  458, 2343,
        948, 2544, 1103,  668, 1156,  289,  406, 2270, 1455, 1187, 2687,
        873, 1899,  929, 2706, 2385, 1935,  160,  197,  258, 1187, 2703,
       1585, 2018,  210,  451,  857,   97,   76, 1130, 2286,  549, 2618,
        375,  735,   48, 1930,  897, 2428, 2261, 1117,  696,  300,  720,
       1159, 2628,  569, 1215,  145,  537, 1668,  795,  205, 2141, 2254,
       1568], dtype=int64)

print("Input:\n", text_from_ids(input_example_batch[0]).numpy().decode("utf-8"))
print()
print("Next Char Predictions:\n", text_from_ids(sampled_indices).numpy().decode("utf-8"))

Input:
 There are some reviews to be modified, just go in and agree with the reviewers, it's uncomfortable for those of us who write code. Forget it, this is not a big deal, a small problem, a few hundred lines really haven't appeared much else, just follow your own habits. There are some specifications that I also find unsightly, there is one I borrowed and remember called

Next Char Predictions:
 岗斧丑绿球估擅羊怪届Q跪丫－磁概⢴直慰淫滑[R宇咩烤接虫⠔悬婆绵鹿龙营浆扩赤铺领渍遍生园硫傅渠W踢☕走绷赌件久艳着娱真E界们撅镇件肤乃原晨万旅佬男澡5苗闯句鸣填画闻使结减禁瞧陪圭石豪背不碳爪隐猫助摊；

loss = tf.losses.SparseCategoricalCrossentropy(from_logits=True)

example_batch_mean_loss = loss(target_example_batch, example_batch_predictions)
print("Prediction shape: ", example_batch_predictions.shape, " # (batch_size, sequence_length, vocab_size)")
print("Mean loss:        ", example_batch_mean_loss)

Prediction shape:  (256, 100, 2707)  # (batch_size, sequence_length, vocab_size)
Mean loss:         tf.Tensor(7.906991, shape=(), dtype=float32)

tf.exp(example_batch_mean_loss).numpy()

2716.205

opt = tf.keras.optimizers.Adam(0.001)
model.compile(optimizer=opt, loss=loss, metrics=['accuracy'])

import os
# Directory where the checkpoints will be saved
checkpoint_dir = './qqmsg_rnn_training_checkpoints'
# Name of the checkpoint files
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt_{epoch}")

checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_prefix,
    save_weights_only=True)

EPOCHS = 200
history = model.fit(dataset, epochs=EPOCHS,
                    callbacks=[checkpoint_callback])

Epoch 1/200
5/5 [==============================] - 1s 150ms/step - loss: 4.4653 - accuracy: 0.2164
Epoch 2/200
5/5 [==============================] - 1s 147ms/step - loss: 4.4367 - accuracy: 0.2187

5/5 [==============================] - 1s 145ms/step - loss: 0.5430 - accuracy: 0.9156
Epoch 200/200
5/5 [==============================] - 1s 152ms/step - loss: 0.5547 - accuracy: 0.9111

import matplotlib.pyplot as plt
print(history.history.keys())
plt.plot(history.history["loss"], label="Training Loss")
plt.plot(history.history["accuracy"], label="accuracy")
# plt.plot(history.history["val_loss"], label="val_loss")
# plt.plot(history.history["val_accuracy"], label="val_accuracy")
# plt.plot(history.history["sparse_categorical_accuracy"], label="sparse_categorical_accuracy")
# plt.plot(history.history["val_sparse_categorical_accuracy"], label="val_sparse_categorical_accuracy")
plt.legend()
plt.show()

dict_keys(['loss', 'accuracy'])

png

class OneStep(tf.keras.Model):
  def __init__(self, model, chars_from_ids, ids_from_chars, temperature=1.0):
    super().__init__()
    self.temperature = temperature
    self.model = model
    self.chars_from_ids = chars_from_ids
    self.ids_from_chars = ids_from_chars

    # Create a mask to prevent "[UNK]" from being generated.
    skip_ids = self.ids_from_chars(['[UNK]'])[:, None]
    sparse_mask = tf.SparseTensor(
        # Put a -inf at each bad index.
        values=[-float('inf')]*len(skip_ids),
        indices=skip_ids,
        # Match the shape to the vocabulary
        dense_shape=[len(ids_from_chars.get_vocabulary())])
    self.prediction_mask = tf.sparse.to_dense(sparse_mask)

  @tf.function
  def generate_one_step(self, inputs, states=None):
    # Convert strings to token IDs.
    input_chars = tf.strings.unicode_split(inputs, 'UTF-8')
    input_ids = self.ids_from_chars(input_chars).to_tensor()

    # Run the model.
    # predicted_logits.shape is [batch, char, next_char_logits]
    predicted_logits, states = self.model(inputs=input_ids, states=states,
                                          return_state=True)
    # Only use the last prediction.
    predicted_logits = predicted_logits[:, -1, :]
    predicted_logits = predicted_logits/self.temperature
    # Apply the prediction mask: prevent "[UNK]" from being generated.
    predicted_logits = predicted_logits + self.prediction_mask

    # Sample the output logits to generate token IDs.
    predicted_ids = tf.random.categorical(predicted_logits, num_samples=1)
    predicted_ids = tf.squeeze(predicted_ids, axis=-1)

    # Convert from token ids to characters
    predicted_chars = self.chars_from_ids(predicted_ids)

    # Return the characters and model state.
    return predicted_chars, states

one_step_model = OneStep(model, chars_from_ids, ids_from_chars)

import time
start = time.time()
states = None
next_char = tf.constant(["Rich woman", "Customer service"])
result = [next_char]

for n in range(50):
    next_char, states = one_step_model.generate_one_step(next_char, states=states)
    result.append(next_char)

result = tf.strings.join(result)
end = time.time()
print(result[0].numpy().decode('utf-8'), '\n\n' + '_'*80)
print('\nRun time:', end - start)

Rich woman, want to touch you, send you more than 100 generated products received what actual products to do

Look at the work that has come, look at all the native d+10 China 

________________________________________________________________________________

Run time: 0.16722774505615234