Molbap HF Staff committed on
Commit
8caaff8
·
1 Parent(s): 68997c6

better frags

Browse files
src/fragments/tp-plan.html ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ <pre><code class="language-python"># In the model's config (example: ERNIE 4.5-style decoder blocks)
3
+ base_model_tp_plan = {
4
+ "layers.*.self_attn.q_proj": "colwise",
5
+ "layers.*.self_attn.k_proj": "colwise",
6
+ "layers.*.self_attn.v_proj": "colwise",
7
+ "layers.*.self_attn.o_proj": "rowwise",
8
+ "layers.*.mlp.gate_proj": "colwise",
9
+ "layers.*.mlp.up_proj": "colwise",
10
+ "layers.*.mlp.down_proj": "rowwise",
11
+ }
12
+
13
+ # Runtime
14
+ import torch
15
+ from transformers import AutoModelForCausalLM, AutoTokenizer
16
+
17
+ model_id = "your/model-or-local-checkpoint"
18
+ model = AutoModelForCausalLM.from_pretrained(
19
+ model_id,
20
+ dtype=torch.bfloat16,
21
+ tp_plan=base_model_tp_plan, # <-- plan defined above
22
+ )
23
+ tok = AutoTokenizer.from_pretrained(model_id)
24
+ inputs = tok("Hello", return_tensors="pt").to(model.device)
25
+ out = model(**inputs)</code></pre>
26
+
27
+
src/fragments/warmup_demo.html ADDED
@@ -0,0 +1,421 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <!DOCTYPE html>
2
+ <html lang="en">
3
+ <head>
4
+ <meta charset="UTF-8">
5
+ <meta name="viewport" content="width=device-width, initial-scale=1.0">
6
+ <style>
7
+ body {
8
+ font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif;
9
+ margin: 0;
10
+ padding: 20px;
11
+ background-color: #f5f5f5;
12
+ }
13
+
14
+ .container {
15
+ max-width: 1200px;
16
+ margin: 0 auto;
17
+ background: white;
18
+ border-radius: 12px;
19
+ padding: 30px;
20
+ box-shadow: 0 4px 6px rgba(0, 0, 0, 0.1);
21
+ }
22
+
23
+ h1 {
24
+ text-align: center;
25
+ color: #333;
26
+ margin-bottom: 10px;
27
+ }
28
+
29
+ .subtitle {
30
+ text-align: center;
31
+ color: #666;
32
+ margin-bottom: 30px;
33
+ font-size: 16px;
34
+ }
35
+
36
+ .demo-container {
37
+ display: flex;
38
+ gap: 40px;
39
+ margin-bottom: 30px;
40
+ }
41
+
42
+ .side {
43
+ flex: 1;
44
+ border: 2px solid #ddd;
45
+ border-radius: 8px;
46
+ padding: 20px;
47
+ background: #fafafa;
48
+ }
49
+
50
+ .side h2 {
51
+ text-align: center;
52
+ margin-top: 0;
53
+ color: #333;
54
+ }
55
+
56
+ .no-warmup h2 {
57
+ color: #d63384;
58
+ }
59
+
60
+ .with-warmup h2 {
61
+ color: #198754;
62
+ }
63
+
64
+ .memory-area {
65
+ height: 400px;
66
+ border: 2px dashed #ccc;
67
+ border-radius: 6px;
68
+ padding: 10px;
69
+ margin: 20px 0;
70
+ background: #fff;
71
+ position: relative;
72
+ overflow: hidden;
73
+ }
74
+
75
+ .layer-box {
76
+ width: 80px;
77
+ height: 30px;
78
+ border: 2px solid #666;
79
+ border-radius: 4px;
80
+ margin: 3px;
81
+ display: inline-block;
82
+ position: relative;
83
+ background: #fff;
84
+ transition: all 0.3s ease;
85
+ }
86
+
87
+ .layer-box.allocating {
88
+ background: #e9ecef;
89
+ border-color: #adb5bd;
90
+ }
91
+
92
+ .layer-box.allocating::after {
93
+ content: "malloc";
94
+ position: absolute;
95
+ top: 50%;
96
+ left: 50%;
97
+ transform: translate(-50%, -50%);
98
+ font-size: 10px;
99
+ color: #666;
100
+ font-weight: bold;
101
+ }
102
+
103
+ .layer-box.loaded {
104
+ background: #d1e7dd;
105
+ border-color: #198754;
106
+ }
107
+
108
+ .layer-box.loaded::after {
109
+ content: "data";
110
+ position: absolute;
111
+ top: 50%;
112
+ left: 50%;
113
+ transform: translate(-50%, -50%);
114
+ font-size: 10px;
115
+ color: #198754;
116
+ font-weight: bold;
117
+ }
118
+
119
+ .warmup-container {
120
+ width: 100%;
121
+ height: 60px;
122
+ border: 3px solid #666;
123
+ border-radius: 6px;
124
+ margin-bottom: 20px;
125
+ background: #fff;
126
+ position: relative;
127
+ overflow: hidden;
128
+ }
129
+
130
+ .warmup-container.allocated {
131
+ border-color: #0d6efd;
132
+ background: #e7f1ff;
133
+ }
134
+
135
+ .warmup-container::before {
136
+ content: "Pre-allocated Memory Pool";
137
+ position: absolute;
138
+ top: 50%;
139
+ left: 50%;
140
+ transform: translate(-50%, -50%);
141
+ font-size: 14px;
142
+ color: #666;
143
+ font-weight: bold;
144
+ z-index: 1;
145
+ }
146
+
147
+ .warmup-container.allocated::before {
148
+ color: #0d6efd;
149
+ }
150
+
151
+ .warmup-fill {
152
+ height: 100%;
153
+ background: linear-gradient(90deg, #198754, #20c997);
154
+ width: 0%;
155
+ transition: width 0.5s ease;
156
+ border-radius: 3px;
157
+ position: relative;
158
+ z-index: 2;
159
+ }
160
+
161
+ .warmup-fill::after {
162
+ content: "Layer Data Loading";
163
+ position: absolute;
164
+ top: 50%;
165
+ left: 50%;
166
+ transform: translate(-50%, -50%);
167
+ font-size: 12px;
168
+ color: white;
169
+ font-weight: bold;
170
+ white-space: nowrap;
171
+ }
172
+
173
+ .timing {
174
+ text-align: center;
175
+ font-size: 24px;
176
+ font-weight: bold;
177
+ margin: 15px 0;
178
+ min-height: 30px;
179
+ }
180
+
181
+ .no-warmup .timing {
182
+ color: #d63384;
183
+ }
184
+
185
+ .with-warmup .timing {
186
+ color: #198754;
187
+ }
188
+
189
+ .controls {
190
+ text-align: center;
191
+ margin: 30px 0;
192
+ }
193
+
194
+ .btn {
195
+ background: #0d6efd;
196
+ color: white;
197
+ border: none;
198
+ padding: 12px 24px;
199
+ border-radius: 6px;
200
+ font-size: 16px;
201
+ cursor: pointer;
202
+ margin: 0 10px;
203
+ transition: background 0.3s ease;
204
+ }
205
+
206
+ .btn:hover {
207
+ background: #0b5ed7;
208
+ }
209
+
210
+ .btn:disabled {
211
+ background: #6c757d;
212
+ cursor: not-allowed;
213
+ }
214
+
215
+
216
+ .description {
217
+ background: #f8f9fa;
218
+ padding: 15px;
219
+ border-radius: 6px;
220
+ margin-top: 15px;
221
+ font-size: 14px;
222
+ line-height: 1.5;
223
+ }
224
+
225
+ .phase-indicator {
226
+ font-size: 14px;
227
+ color: #666;
228
+ text-align: center;
229
+ margin-top: 10px;
230
+ min-height: 20px;
231
+ }
232
+
233
+ .layer-counter {
234
+ text-align: center;
235
+ font-size: 16px;
236
+ color: #495057;
237
+ margin: 10px 0;
238
+ }
239
+ </style>
240
+ </head>
241
+ <body>
242
+ <div class="container">
243
+ <h1>CUDA Caching Allocator Warmup Visualization</h1>
244
+ <p class="subtitle">Demonstrating memory allocation patterns during model loading</p>
245
+
246
+ <div class="controls">
247
+ <button class="btn" id="startBtn" onclick="startDemo()">Start Animation</button>
248
+ <button class="btn" id="resetBtn" onclick="resetDemo()">Reset</button>
249
+ </div>
250
+
251
+ <div class="demo-container">
252
+ <div class="side no-warmup">
253
+ <h2 data-no-toc>❌ Without Warmup</h2>
254
+ <div class="timing" id="noWarmupTime">0.00s</div>
255
+ <div class="layer-counter" id="noWarmupCounter">Layers loaded: 0/10</div>
256
+ <div class="phase-indicator" id="noWarmupPhase"></div>
257
+ <div class="memory-area" id="noWarmupArea"></div>
258
+ <div class="description">
259
+ <strong>Individual Allocations:</strong><br>
260
+ Each model layer triggers a separate cudaMalloc() call, creating memory fragmentation and allocation overhead.
261
+ <br><br>
262
+ 📦 <strong>Grey "malloc"</strong> = Memory allocation overhead<br>
263
+ ✅ <strong>Green "data"</strong> = Actual layer data loading
264
+ </div>
265
+ </div>
266
+
267
+ <div class="side with-warmup">
268
+ <h2 data-no-toc>✅ With Warmup</h2>
269
+ <div class="timing" id="warmupTime">0.00s</div>
270
+ <div class="layer-counter" id="warmupCounter">Layers loaded: 0/10</div>
271
+ <div class="phase-indicator" id="warmupPhase"></div>
272
+ <div class="memory-area" id="warmupArea">
273
+ <div class="warmup-container" id="warmupContainer">
274
+ <div class="warmup-fill" id="warmupFill"></div>
275
+ </div>
276
+ <div id="warmupLayers"></div>
277
+ </div>
278
+ <div class="description">
279
+ <strong>Pre-allocated Pool:</strong><br>
280
+ The warmup function calculates total memory needed and makes ONE large allocation. Subsequent layers load directly into this pool, eliminating malloc overhead.
281
+ <br><br>
282
+ 🔵 <strong>Blue container</strong> = Single large malloc (warmup)<br>
283
+ 🟢 <strong>Green progress bar</strong> = Layer data loading (no malloc needed)
284
+ </div>
285
+ </div>
286
+ </div>
287
+ </div>
288
+
289
+ <script>
290
+ let animationSpeed = 1 / 2.4; // Slower animation
291
+ let isRunning = false;
292
+ const totalLayers = 10;
293
+
294
+ function startDemo() {
295
+ if (isRunning) return;
296
+ isRunning = true;
297
+
298
+ document.getElementById('startBtn').disabled = true;
299
+ document.getElementById('resetBtn').disabled = true;
300
+
301
+ // Start both animations simultaneously
302
+ Promise.all([
303
+ animateNoWarmup(),
304
+ animateWithWarmup()
305
+ ]).then(() => {
306
+ isRunning = false;
307
+ document.getElementById('startBtn').disabled = false;
308
+ document.getElementById('resetBtn').disabled = false;
309
+ });
310
+ }
311
+
312
+ function resetDemo() {
313
+ if (isRunning) return;
314
+
315
+ // Clear containers
316
+ document.getElementById('noWarmupArea').innerHTML = '';
317
+ document.getElementById('warmupLayers').innerHTML = '';
318
+ document.getElementById('warmupFill').style.width = '0%';
319
+ document.getElementById('warmupContainer').classList.remove('allocated');
320
+
321
+ // Reset timers
322
+ document.getElementById('noWarmupTime').textContent = '0.00s';
323
+ document.getElementById('warmupTime').textContent = '0.00s';
324
+
325
+ // Reset counters
326
+ document.getElementById('noWarmupCounter').textContent = 'Layers loaded: 0/10';
327
+ document.getElementById('warmupCounter').textContent = 'Layers loaded: 0/10';
328
+
329
+ // Reset phase indicators
330
+ document.getElementById('noWarmupPhase').textContent = '';
331
+ document.getElementById('warmupPhase').textContent = '';
332
+ }
333
+
334
+ async function animateNoWarmup() {
335
+ const container = document.getElementById('noWarmupArea');
336
+ const timeEl = document.getElementById('noWarmupTime');
337
+ const counterEl = document.getElementById('noWarmupCounter');
338
+ const phaseEl = document.getElementById('noWarmupPhase');
339
+
340
+ let currentTime = 0;
341
+ const baseDelay = 200 / animationSpeed; // Base delay between operations
342
+
343
+ phaseEl.textContent = 'Loading model layers...';
344
+
345
+ for (let i = 0; i < totalLayers; i++) {
346
+ // Create layer box
347
+ const layerBox = document.createElement('div');
348
+ layerBox.className = 'layer-box';
349
+ container.appendChild(layerBox);
350
+
351
+ // Malloc phase (grey)
352
+ await sleep(baseDelay * 0.3);
353
+ layerBox.classList.add('allocating');
354
+ currentTime += 0.08; // malloc overhead
355
+ timeEl.textContent = currentTime.toFixed(2) + 's';
356
+
357
+ // Data loading phase (green)
358
+ await sleep(baseDelay * 0.7);
359
+ layerBox.classList.remove('allocating');
360
+ layerBox.classList.add('loaded');
361
+ currentTime += 0.12; // data transfer time
362
+ timeEl.textContent = currentTime.toFixed(2) + 's';
363
+
364
+ // Update counter
365
+ counterEl.textContent = `Layers loaded: ${i + 1}/${totalLayers}`;
366
+ }
367
+
368
+ phaseEl.textContent = 'Complete!';
369
+ }
370
+
371
+ async function animateWithWarmup() {
372
+ const container = document.getElementById('warmupLayers');
373
+ const timeEl = document.getElementById('warmupTime');
374
+ const counterEl = document.getElementById('warmupCounter');
375
+ const phaseEl = document.getElementById('warmupPhase');
376
+ const warmupContainer = document.getElementById('warmupContainer');
377
+ const warmupFill = document.getElementById('warmupFill');
378
+
379
+ let currentTime = 0;
380
+ const baseDelay = 200 / animationSpeed;
381
+
382
+ // Warmup phase
383
+ phaseEl.textContent = 'Warming up allocator...';
384
+ await sleep(baseDelay * 2);
385
+ warmupContainer.classList.add('allocated');
386
+ currentTime += 0.3; // warmup overhead
387
+ timeEl.textContent = currentTime.toFixed(2) + 's';
388
+
389
+ phaseEl.textContent = 'Loading model layers...';
390
+
391
+ // Load layers directly into pre-allocated memory
392
+ for (let i = 0; i < totalLayers; i++) {
393
+ // Create layer indicator (much smaller since no malloc)
394
+ const layerBox = document.createElement('div');
395
+ layerBox.className = 'layer-box loaded';
396
+ layerBox.style.width = '40px';
397
+ layerBox.style.height = '20px';
398
+ container.appendChild(layerBox);
399
+
400
+ // Update progress bar
401
+ const progress = ((i + 1) / totalLayers) * 100;
402
+ warmupFill.style.width = progress + '%';
403
+
404
+ // Only data transfer time (no malloc overhead)
405
+ await sleep(baseDelay * 0.5);
406
+ currentTime += 0.08; // faster due to no malloc
407
+ timeEl.textContent = currentTime.toFixed(2) + 's';
408
+
409
+ // Update counter
410
+ counterEl.textContent = `Layers loaded: ${i + 1}/${totalLayers}`;
411
+ }
412
+
413
+ phaseEl.textContent = 'Complete!';
414
+ }
415
+
416
+ function sleep(ms) {
417
+ return new Promise(resolve => setTimeout(resolve, ms));
418
+ }
419
+ </script>
420
+ </body>
421
+ </html>