## --- Randomly generate input data of titles and reading volumes, and use PyTorch to learn the relationship between them: input a title to predict its reading volume, save the model to LMDB, and visualize the training process
# To save the model to an LMDB database and visualize the training process, we need to integrate LMDB as a storage system and use a plotting library like `matplotlib` to track the loss during training. Here's how we can approach this:
# ### Steps:
# 1. **Save Model to LMDB**: LMDB (Lightning Memory-Mapped Database) is a fast, memory-mapped, key-value store. We'll save the model parameters into an LMDB database.
# 2. **Visualize Training Loss**: During training, we will log the loss at each epoch and then plot it.
# #### Updated Code with LMDB and Visualization
# First, install the `lmdb` and `matplotlib` packages if you haven't already:
# ```bash
# pip install lmdb matplotlib
# ```
# Now, let's update the code:
# ```python
import torch
import torch.nn as nn
import torch.optim as optim
import random
import string
import lmdb
import pickle
import matplotlib.pyplot as plt
# Step 1: Generate random data
def generate_random_title(length=5):
"""Generates a random title consisting of lowercase letters."""
return ''.join(random.choice(string.ascii_lowercase) for _ in range(length))
def generate_random_volume():
"""Generates a random reading volume between 100 and 10000."""
return random.randint(100, 10000)
# Create dataset
num_samples = 1000
titles = [generate_random_title() for _ in range(num_samples)]
volumes = [generate_random_volume() for _ in range(num_samples)]
# Step 2: Preprocess titles (character-level encoding)
vocab = string.ascii_lowercase # 26 letters
char_to_idx = {char: idx + 1 for idx, char in enumerate(vocab)} # Map 'a' to 1, 'b' to 2, ...; index 0 is reserved for padding
max_title_len = 10 # Assuming max title length is 10
def encode_title(title):
"""Encodes a title into a fixed-length sequence of indices."""
encoded = [char_to_idx[char] for char in title]
if len(encoded) < max_title_len:
encoded += [0] * (max_title_len - len(encoded)) # Padding
return encoded[:max_title_len] # Truncate if too long
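# e.g. encode_title("hello") -> [8, 5, 12, 12, 15, 0, 0, 0, 0, 0]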
encoded_titles = [encode_title(title) for title in titles]
# Convert data to tensors
X = torch.tensor(encoded_titles, dtype=torch.long)
y = torch.tensor(volumes, dtype=torch.float32)
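# Shapes: X is (num_samples, max_title_len) of dtype long; y is (num_samples,) of dtype float32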
# Step 3: Build the neural network
class TitleVolumePredictor(nn.Module):
def __init__(self, vocab_size, embed_dim, hidden_dim):
super(TitleVolumePredictor, self).__init__()
self.embedding = nn.Embedding(vocab_size, embed_dim)
self.fc1 = nn.Linear(embed_dim * max_title_len, hidden_dim)
self.fc2 = nn.Linear(hidden_dim, 1)
def forward(self, x):
x = self.embedding(x)
x = x.view(x.size(0), -1) # Flatten the embeddings
x = torch.relu(self.fc1(x))
x = self.fc2(x)
return x
# Hyperparameters
vocab_size = len(vocab) + 1 # +1 for padding
embed_dim = 10
hidden_dim = 64
model = TitleVolumePredictor(vocab_size, embed_dim, hidden_dim)
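# Optional sanity check (a small illustrative snippet; `_dummy_batch` is a
# throwaway name): a batch of two zero-padded titles should map to shape (2, 1)
_dummy_batch = torch.zeros(2, max_title_len, dtype=torch.long)
assert model(_dummy_batch).shape == (2, 1)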
# Step 4: Train the model
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)
# LMDB setup for saving the model
env = lmdb.open("model_db", map_size=10**9) # 1 GB
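# map_size is an upper bound on how large the database may grow; LMDB
# memory-maps the file, so on most platforms the full 1 GB is not
# allocated up front.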
# Training loop with loss tracking
num_epochs = 100
losses = []
for epoch in range(num_epochs):
model.train()
optimizer.zero_grad()
outputs = model(X).squeeze()
loss = criterion(outputs, y)
loss.backward()
optimizer.step()
losses.append(loss.item()) # Log the loss for visualization
if (epoch + 1) % 10 == 0:
print(f'Epoch [{epoch + 1}/{num_epochs}], Loss: {loss.item():.4f}')
# Step 5: Save the model to LMDB
with env.begin(write=True) as txn:
# Serialize model's state dict to a binary blob
model_blob = pickle.dumps(model.state_dict())
txn.put(b'model', model_blob)
# Step 6: Visualize training loss
plt.plot(range(num_epochs), losses)
plt.title("Training Loss Over Epochs")
plt.xlabel("Epoch")
plt.ylabel("Loss")
plt.show()
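# On a headless machine, call plt.savefig("training_loss.png") before
# plt.show(), or in place of it, to write the loss curve to disk.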
# Step 7: Predict the reading volume for a new title
def predict_reading_volume(title):
model.eval()
encoded_title = torch.tensor(encode_title(title)).unsqueeze(0) # Add batch dimension
with torch.no_grad():
predicted_volume = model(encoded_title).item()
return predicted_volume
# Example usage
new_title = "hello"
predicted_volume = predict_reading_volume(new_title)
print(f"Predicted reading volume for title '{new_title}': {predicted_volume:.2f}")
# Step 8: Retrieve the model from LMDB
with env.begin() as txn:
model_blob = txn.get(b'model')
model_state_dict = pickle.loads(model_blob)
# Load the state dict into a new model instance
model_loaded = TitleVolumePredictor(vocab_size, embed_dim, hidden_dim)
model_loaded.load_state_dict(model_state_dict)
model_loaded.eval()
# Predict with the loaded model (inside no_grad, as in the helper above)
with torch.no_grad():
    predicted_volume_loaded = model_loaded(torch.tensor(encode_title(new_title)).unsqueeze(0)).item()
print(f"Predicted volume with loaded model: {predicted_volume_loaded:.2f}")
# ```
# ### Breakdown:
# 1. **LMDB Integration**:
# - We initialize an LMDB environment (`env = lmdb.open("model_db", map_size=10**9)`).
# - After training, the model's state dictionary is serialized using `pickle` and saved to the LMDB database (`txn.put(b'model', model_blob)`).
# - To retrieve the model, the binary blob is deserialized from the LMDB store and loaded back into a PyTorch model instance.
# 2. **Training Loss Visualization**:
# - During training, we log the loss at each epoch in the `losses` list.
# - After training, `matplotlib` is used to plot the loss curve (`plt.plot()`), showing how the loss decreases over epochs.
# 3. **Prediction with Loaded Model**:
# - After saving and retrieving the model from LMDB, we make predictions using the loaded model to ensure it performs as expected.
# This approach stores the model compactly in LMDB and retrieves it reliably for further predictions, while the loss curve makes training progress easy to track. An alternative serialization using `torch.save` is sketched below.
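# As an alternative to `pickle`, the state dict can be serialized with
# PyTorch's own `torch.save` into an in-memory buffer and the raw bytes
# stored in LMDB. A minimal sketch (the key b'model_ts' and the variable
# names here are arbitrary choices, not part of the original code):
# ```python
import io
buf = io.BytesIO()
torch.save(model.state_dict(), buf)  # serialize the state dict to the buffer
with env.begin(write=True) as txn:
    txn.put(b'model_ts', buf.getvalue())
with env.begin() as txn:
    ts_state = torch.load(io.BytesIO(txn.get(b'model_ts')), map_location='cpu')
model_from_ts = TitleVolumePredictor(vocab_size, embed_dim, hidden_dim)
model_from_ts.load_state_dict(ts_state)
model_from_ts.eval()
# ```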
## --- run -----
# Epoch [10/100], Loss: 32760820.0000
# Epoch [20/100], Loss: 32745422.0000
# Epoch [30/100], Loss: 32723140.0000
# Epoch [40/100], Loss: 32691936.0000
# Epoch [50/100], Loss: 32650022.0000
# Epoch [60/100], Loss: 32595292.0000
# Epoch [70/100], Loss: 32525454.0000
# Epoch [80/100], Loss: 32438452.0000
# Epoch [90/100], Loss: 32332394.0000
# Epoch [100/100], Loss: 32205452.0000
# Predicted reading volume for title 'hello': 60.77
# Predicted volume with loaded model: 60.77
#
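# A side note on the numbers above: the MSE stays in the tens of millions
# because it is computed on raw volumes in the 100-10000 range (and the
# titles are random, so there is little real signal to learn). Scaling the
# targets usually gives a much better-behaved loss curve. A minimal sketch,
# with illustrative names such as `y_scaled`:
# ```python
y_scaled = y / 10000.0  # bring targets into roughly [0.01, 1.0]
model_scaled = TitleVolumePredictor(vocab_size, embed_dim, hidden_dim)
optimizer_scaled = optim.Adam(model_scaled.parameters(), lr=0.001)
for epoch in range(num_epochs):
    optimizer_scaled.zero_grad()
    loss = criterion(model_scaled(X).squeeze(), y_scaled)
    loss.backward()
    optimizer_scaled.step()
# Undo the scaling when reading off a prediction
with torch.no_grad():
    pred_scaled = model_scaled(torch.tensor(encode_title("hello")).unsqueeze(0)).item() * 10000.0
print(f"Prediction with scaled targets: {pred_scaled:.2f}")
# ```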