import streamlit as st
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
import datetime
import gc
import os

# Reduce CUDA memory fragmentation if a GPU is ever used (harmless on CPU-only machines)
os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'max_split_size_mb:128'

# Set page configuration
st.set_page_config(
    page_title="Qwen2.5-Coder Chat",
    page_icon="💬",
    layout="wide",
)

# Initialize session state
if 'messages' not in st.session_state:
    st.session_state.messages = []
if 'model_loaded' not in st.session_state:
    st.session_state.model_loaded = False

@st.cache_resource(show_spinner=False)
def load_model_and_tokenizer():
    try:
        model_name = "Qwen/Qwen2.5-Coder-3B-Instruct"

        with st.spinner("🔄 Loading tokenizer..."):
            # Load tokenizer first
            tokenizer = AutoTokenizer.from_pretrained(
                model_name,
                trust_remote_code=True
            )

        with st.spinner("🔄 Loading model... (this may take a few minutes on CPU)"):
            # Load the model on CPU in float32. bitsandbytes 8-bit quantization
            # (load_in_8bit=True) requires a CUDA GPU, so it is not used here.
            model = AutoModelForCausalLM.from_pretrained(
                model_name,
                device_map={"": "cpu"},
                trust_remote_code=True,
                low_cpu_mem_usage=True,
                torch_dtype=torch.float32
            )

        # device_map already placed the model on CPU; switch to eval mode
        model = model.eval()

        # Clear memory after loading
        gc.collect()
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

        st.session_state.model_loaded = True
        return tokenizer, model
    except Exception as e:
        st.error(f"❌ Error loading model: {str(e)}")
        return None, None

def generate_response(prompt, model, tokenizer, max_length=256):
    try:
        # Clear memory before generation
        gc.collect()

        # Tokenize with a short maximum length to keep the context small
        inputs = tokenizer(
            prompt,
            return_tensors="pt",
            max_length=512,
            truncation=True
        ).to("cpu")

        # Generate with minimal parameters for CPU
        with torch.no_grad(), st.spinner("🤔 Thinking... (please be patient)"):
            outputs = model.generate(
                **inputs,
                max_new_tokens=max_length,
                temperature=0.7,
                top_p=0.9,
                do_sample=True,
                pad_token_id=tokenizer.pad_token_id,
                eos_token_id=tokenizer.eos_token_id,
                num_beams=1  # Greedy/sampled decoding; no beam search
            )

        # Clear memory after generation
        gc.collect()

        # Decode only the newly generated tokens (everything after the prompt)
        input_length = inputs["input_ids"].shape[1]
        response = tokenizer.decode(outputs[0][input_length:], skip_special_tokens=True)
        return response.strip()

    except torch.cuda.OutOfMemoryError:
        st.error("💾 Memory exceeded. Try reducing the maximum length.")
        return None
    except Exception as e:
        st.error(f"❌ Error: {str(e)}")
        return None

# Main UI
st.title("💬 Qwen2.5-Coder Chat")

# Sidebar with minimal settings
with st.sidebar:
    st.header("⚙️ Settings")

    max_length = st.slider(
        "Response Length 📏",
        min_value=64,
        max_value=512,
        value=256,
        step=64,
        help="Shorter lengths are recommended for CPU"
    )

    if st.button("🗑️ Clear Conversation"):
        st.session_state.messages = []
        st.rerun()

# Load model (cached, so repeated calls are cheap)
if not st.session_state.model_loaded:
    tokenizer, model = load_model_and_tokenizer()
    if model is None:
        st.stop()
else:
    tokenizer, model = load_model_and_tokenizer()

# Display conversation history
for message in st.session_state.messages:
    with st.chat_message(message["role"]):
        st.markdown(f"{message['content']}\n\n_{message['timestamp']}_")

# Chat input
if prompt := st.chat_input("💭 Ask me anything about coding..."):
    # Add user message
    timestamp = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    st.session_state.messages.append({
        "role": "user",
        "content": prompt,
        "timestamp": timestamp
    })

    # Display user message
    with st.chat_message("user"):
        st.markdown(f"{prompt}\n\n_{timestamp}_")

    # Generate and display response
    with st.chat_message("assistant"):
        # Keep only the latest message as context to reduce memory usage
        conversation = f"Human: {prompt}\nAssistant:"

        response = generate_response(
            conversation,
            model,
            tokenizer,
            max_length=max_length
        )

        if response:
            timestamp = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
            st.markdown(f"{response}\n\n_{timestamp}_")

            # Add response to chat history
            st.session_state.messages.append({
                "role": "assistant",
                "content": response,
                "timestamp": timestamp
            })
        else:
            st.error("❌ Failed to generate response. Please try again with a shorter length.")

        # Clear memory after response
        gc.collect()