"""VRAM Calculator for HuggingFace Models"""
from __future__ import annotations
import gradio as gr
from huggingface_hub import HfApi, hf_hub_download
import json
from functools import lru_cache
api = HfApi()
# GPU name -> (VRAM in GB, approximate on-demand cloud price in $/hr; 0 = consumer/workstation card)
GPU_SPECS = {
    "RTX 3080": (10, 0),
    "RTX 3090": (24, 0),
    "RTX 4080": (16, 0),
    "RTX 4090": (24, 0),
    "RTX 5090": (32, 0),
    "M2 Ultra": (192, 0),
    "M3 Max": (128, 0),
    "M4 Max": (128, 0),
    "RTX A6000": (48, 0),
    "L40S": (48, 1.00),
    "A10G": (24, 1.00),
    "L4": (24, 0.70),
    "A100 40GB": (40, 3.00),
    "A100 80GB": (80, 5.00),
    "H100 80GB": (80, 8.00),
}
# Bytes per parameter for dtypes as reported in safetensors metadata or config files
DTYPE_BYTES = {
    "F32": 4, "float32": 4,
    "F16": 2, "float16": 2,
    "BF16": 2, "bfloat16": 2,
    "I8": 1, "int8": 1,
    "U8": 1, "uint8": 1,
}
# Serving framework -> rough runtime overhead multiplier applied to weights + KV cache
FRAMEWORKS = {
    "None (PyTorch)": 1.20,
    "vLLM": 1.10,
    "TGI": 1.15,
    "llama.cpp": 1.05,
    "Ollama": 1.08,
}
def bytes_to_gb(b):
    return b / (1024 ** 3)
@lru_cache(maxsize=50)
def fetch_model_info(model_id):
    try:
        return api.model_info(model_id, files_metadata=True)
    except Exception:
        return None
@lru_cache(maxsize=50)
def fetch_config(model_id):
    try:
        path = hf_hub_download(model_id, "config.json")
        with open(path) as f:
            return json.load(f)
    except Exception:
        return {}
def get_params(info):
    # Read the parameter count and the dominant dtype from safetensors metadata
    if info and hasattr(info, 'safetensors') and info.safetensors:
        params = info.safetensors.total
        dtypes = info.safetensors.parameters
        if dtypes:
            dtype = max(dtypes, key=dtypes.get)
            return params, dtype
    return 0, "F16"
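# Illustrative weight-memory arithmetic (assumed values, not fetched): an 8B-parameter model
# stored in BF16 needs about 8e9 * 2 bytes ~= 14.9 GB for the weights alone.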
def calculate(model_id, context, batch, mode, framework, num_gpus, lora_rank):
    """Main calculation function"""
    try:
        if not model_id or not model_id.strip():
            return "Enter a model ID (e.g., meta-llama/Llama-3.1-8B)"
        model_id = model_id.strip()
        if "/" not in model_id:
            return "Model ID format: organization/model-name"
        info = fetch_model_info(model_id)
        if not info:
            return "Could not fetch model: " + model_id
        config = fetch_config(model_id)
        params, dtype = get_params(info)
        if params == 0:
            return "Could not read parameters (model may use .bin format)"
        dtype_bytes = DTYPE_BYTES.get(dtype, 2)
        params_b = params / 1e9
        weights_gb = bytes_to_gb(params * dtype_bytes)
        layers = config.get("num_hidden_layers", config.get("n_layer", 32))
        kv_heads = config.get("num_key_value_heads", config.get("num_attention_heads", 32))
        head_dim = config.get("head_dim")
        if not head_dim:
            # Fall back to hidden_size / num_attention_heads when head_dim is not in the config
            hidden = config.get("hidden_size", 4096)
            heads = config.get("num_attention_heads", 32)
            head_dim = hidden // heads if heads else 128
        # KV cache: 2 (K and V) * layers * batch * context * kv_heads * head_dim * bytes per element
        kv_bytes = 2 * layers * batch * context * kv_heads * head_dim * dtype_bytes
        kv_gb = bytes_to_gb(kv_bytes)
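        # Illustrative sanity check (assumed Llama-3.1-8B-style values, not fetched here):
        # 32 layers, 8 KV heads, head_dim 128, context 4096, batch 1, 2-byte dtype gives
        # 2 * 32 * 1 * 4096 * 8 * 128 * 2 bytes ~= 0.5 GB of KV cache.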
        out = []
        out.append("## " + model_id)
        out.append("**" + str(round(params_b, 1)) + "B parameters** | " + dtype + " | " + str(layers) + " layers")
        out.append("")
        if mode == "Training (Full)":
            grad_gb = weights_gb
            opt_gb = bytes_to_gb(params * 8)  # Adam-style optimizer state: ~8 bytes/param (two fp32 moments)
            act_gb = weights_gb * 2 * batch  # rough activation heuristic
            total = weights_gb + grad_gb + opt_gb + act_gb
            out.append("### Training Memory")
            out.append("- Weights: " + str(round(weights_gb, 1)) + " GB")
            out.append("- Gradients: " + str(round(grad_gb, 1)) + " GB")
            out.append("- Optimizer: " + str(round(opt_gb, 1)) + " GB")
            out.append("- Activations: " + str(round(act_gb, 1)) + " GB")
        elif mode == "LoRA":
            base = weights_gb
            lora_params = int(params * lora_rank * 0.0001)  # rough adapter-size heuristic scaled by rank
            lora_gb = bytes_to_gb(lora_params * dtype_bytes)
            act_gb = base * 0.3
            total = base + lora_gb + act_gb
            out.append("### LoRA Memory")
            out.append("- Base (frozen): " + str(round(base, 1)) + " GB")
            out.append("- LoRA adapters: " + str(round(lora_gb, 2)) + " GB")
            out.append("- Activations: " + str(round(act_gb, 1)) + " GB")
        elif mode == "QLoRA":
            base = bytes_to_gb(params * 0.5)  # 4-bit base weights: ~0.5 bytes/param
            lora_params = int(params * lora_rank * 0.0001)
            lora_gb = bytes_to_gb(lora_params * dtype_bytes)
            act_gb = base * 0.3
            total = base + lora_gb + act_gb
            out.append("### QLoRA Memory")
            out.append("- Base (4-bit): " + str(round(base, 1)) + " GB")
            out.append("- LoRA adapters: " + str(round(lora_gb, 2)) + " GB")
            out.append("- Activations: " + str(round(act_gb, 1)) + " GB")
        else:
            overhead = FRAMEWORKS.get(framework, 1.15)
            extra = (weights_gb + kv_gb) * (overhead - 1)
            total = weights_gb + kv_gb + extra
            out.append("### Inference Memory")
            out.append("- Weights: " + str(round(weights_gb, 1)) + " GB")
            out.append("- KV Cache: " + str(round(kv_gb, 1)) + " GB")
            out.append("- Overhead (" + framework + "): " + str(round(extra, 1)) + " GB")
        if num_gpus > 1:
            per_gpu = total / num_gpus * 1.05  # ~5% extra per GPU for parallelism/communication buffers
            out.append("")
            out.append("**Multi-GPU (" + str(num_gpus) + "x):** " + str(round(per_gpu, 1)) + " GB/GPU")
            effective = per_gpu
        else:
            effective = total
        out.append("")
        out.append("## Total: " + str(round(total, 1)) + " GB")
        out.append("")
        out.append("### GPU Options")
        out.append("| GPU | VRAM | Fits | Headroom |")
        out.append("|-----|------|------|----------|")
        for gpu, (vram, cost) in GPU_SPECS.items():
            fits = "Yes" if vram >= effective else "No"
            hr = vram - effective
            sign = "+" if hr >= 0 else ""
            out.append("| " + gpu + " | " + str(vram) + "GB | " + fits + " | " + sign + str(round(hr, 1)) + "GB |")
        if effective > 24:
            out.append("")
            out.append("### Quantization to fit 24GB")
            out.append("| Method | Size |")
            out.append("|--------|------|")
            for name, mult in [("INT8", 1.0), ("4-bit", 0.5), ("3-bit", 0.375)]:
                size = bytes_to_gb(params * mult) * 1.1  # bytes/param at that precision plus ~10% overhead
                out.append("| " + name + " | " + str(round(size, 1)) + "GB |")
        costs = [(gpu, cost) for gpu, (vram, cost) in GPU_SPECS.items() if vram >= effective and cost > 0]
        if costs:
            costs.sort(key=lambda x: x[1])
            out.append("")
            out.append("### Cloud Costs (8hr/day)")
            out.append("| GPU | $/hr | $/month |")
            out.append("|-----|------|---------|")
            for gpu, cost in costs[:4]:
                # 176 hours/month = 8 hours/day * 22 working days
                out.append("| " + gpu + " | $" + str(round(cost, 2)) + " | $" + str(int(cost * 176)) + " |")
        return "\n".join(out)
    except Exception as e:
        return "Error: " + str(e)
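# Illustrative rule of thumb implied by the estimates above (assumed 8B-parameter model in a 2-byte dtype):
# full fine-tuning needs roughly 8e9 * (2 + 2 + 8) bytes ~= 89 GB for weights, gradients, and optimizer
# state before activations, which is why the LoRA and QLoRA modes exist in this calculator.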
def compare(models_text, context):
    """Compare multiple models"""
    try:
        if not models_text:
            return "Enter model IDs, one per line"
        models = [m.strip() for m in models_text.strip().split("\n") if m.strip()]
        if len(models) < 2:
            return "Need at least 2 models"
        out = []
        out.append("## Comparison")
        out.append("| Model | Params | Inference | Training | QLoRA |")
        out.append("|-------|--------|-----------|----------|-------|")
        for mid in models[:5]:
            try:
                info = fetch_model_info(mid)
                config = fetch_config(mid)
                params, dtype = get_params(info)
                if params == 0:
                    out.append("| " + mid + " | Error | - | - | - |")
                    continue
                db = DTYPE_BYTES.get(dtype, 2)
                w = bytes_to_gb(params * db)
                layers = config.get("num_hidden_layers", 32)
                kv_heads = config.get("num_key_value_heads", 32)
                kv = bytes_to_gb(2 * layers * context * kv_heads * 128 * db)  # batch 1, head_dim assumed 128
                inf = w + kv
                train = w * 4 + w * 2  # rough full fine-tune estimate: 4x weights for states + 2x for activations
                qlora = bytes_to_gb(params * 0.5) * 1.5  # 4-bit base plus ~50% for adapters/activations
                name = mid.split("/")[-1][:20]
                out.append("| " + name + " | " + str(round(params / 1e9, 1)) + "B | " + str(round(inf, 1)) + "GB | " + str(round(train, 1)) + "GB | " + str(round(qlora, 1)) + "GB |")
            except Exception:
                out.append("| " + mid + " | Error | - | - | - |")
        return "\n".join(out)
    except Exception as e:
        return "Error: " + str(e)
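# Illustrative 4-bit arithmetic behind the QLoRA column (assumed 8B-parameter model):
# 8e9 * 0.5 bytes ~= 3.7 GB of quantized base weights, so the 1.5x factor above yields about 5.6 GB.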
# Build the interface
with gr.Blocks(title="VRAM Calculator") as demo:
    gr.Markdown("# VRAM Calculator for LLMs")
    gr.Markdown("Estimate VRAM requirements for HuggingFace models")
    with gr.Tabs():
        with gr.TabItem("Calculator"):
            model_in = gr.Textbox(
                label="Model ID",
                placeholder="meta-llama/Llama-3.1-8B",
                info="Enter a HuggingFace model ID"
            )
            mode_in = gr.Radio(
                choices=["Inference", "Training (Full)", "LoRA", "QLoRA"],
                value="Inference",
                label="Mode"
            )
            with gr.Row():
                ctx_in = gr.Slider(
                    minimum=512,
                    maximum=131072,
                    value=4096,
                    step=512,
                    label="Context Length"
                )
                batch_in = gr.Slider(
                    minimum=1,
                    maximum=64,
                    value=1,
                    step=1,
                    label="Batch Size"
                )
            with gr.Accordion("Advanced Options", open=False):
                framework_in = gr.Dropdown(
                    choices=list(FRAMEWORKS.keys()),
                    value="vLLM",
                    label="Framework"
                )
                gpus_in = gr.Slider(
                    minimum=1,
                    maximum=8,
                    value=1,
                    step=1,
                    label="Number of GPUs"
                )
                lora_in = gr.Slider(
                    minimum=4,
                    maximum=128,
                    value=16,
                    step=4,
                    label="LoRA Rank"
                )
            calc_btn = gr.Button("Calculate", variant="primary")
            output = gr.Markdown()
            calc_btn.click(
                fn=calculate,
                inputs=[model_in, ctx_in, batch_in, mode_in, framework_in, gpus_in, lora_in],
                outputs=output
            )
            gr.Examples(
                examples=[
                    ["meta-llama/Llama-3.1-8B"],
                    ["meta-llama/Llama-3.1-70B"],
                    ["mistralai/Mistral-7B-v0.1"],
                ],
                inputs=[model_in],
                label="Example Models"
            )
        with gr.TabItem("Compare Models"):
            cmp_in = gr.Textbox(
                label="Models (one per line)",
                lines=4,
                placeholder="meta-llama/Llama-3.1-8B\nmistralai/Mistral-7B-v0.1"
            )
            cmp_ctx = gr.Slider(
                minimum=512,
                maximum=131072,
                value=4096,
                step=512,
                label="Context Length"
            )
            cmp_btn = gr.Button("Compare", variant="primary")
            cmp_out = gr.Markdown()
            cmp_btn.click(
                fn=compare,
                inputs=[cmp_in, cmp_ctx],
                outputs=cmp_out
            )
    gr.Markdown("---")
    gr.Markdown("*Estimates are approximate. Actual usage may vary.*")
if __name__ == "__main__":
    demo.launch()