# combined_pipeline.py
import cv2
import numpy as np
from PIL import Image, ImageDraw
import easyocr
import base64
import re
import torch
import yaml
from torchvision import transforms
from transformers import CLIPModel
from ultralytics import YOLO

# =======================
# LOAD MODELS ON STARTUP
# =======================

# 1️⃣ UI ELEMENT YOLO MODEL
ui_model = YOLO("UI_element.pt")

# 2️⃣ FOOD YOLO MODEL
food_model = YOLO("models/Yolos.pt")

# 3️⃣ OpenCV Face DNN
face_net = cv2.dnn.readNetFromCaffe(
    "deploy.prototxt",
    "res10_300x300_ssd_iter_140000.caffemodel"
)

# 4️⃣ OCR
reader = easyocr.Reader(["en"])

# Classes to keep from the UI model; everything else gets masked out.
KEEP_CLASSES = ["BackgroundImage", "Image", "Text"]
name_to_id = {v: k for k, v in ui_model.names.items()}
KEEP_IDS = [name_to_id[c] for c in KEEP_CLASSES if c in name_to_id]

# Regex patterns for personal info
EMAIL_PATTERN = re.compile(r"\S+@\S+\.\S+")
PHONE_PATTERN = re.compile(r"\b\d{6,15}\b")

# CLIP setup
device = "cuda" if torch.cuda.is_available() else "cpu"

with open("data/data.yaml", "r") as f:
    data_info = yaml.safe_load(f)
label_list = data_info["names"]


class CLIPMultiLabel(torch.nn.Module):
    """CLIP vision encoder with a linear multi-label head; encoder features are computed under no_grad."""

    def __init__(self, clip_model, num_labels):
        super().__init__()
        self.image_encoder = clip_model.vision_model
        self.proj = clip_model.visual_projection
        self.classifier = torch.nn.Linear(self.proj.out_features, num_labels)

    def forward(self, images):
        with torch.no_grad():
            feats = self.image_encoder(images).pooler_output
            feats = self.proj(feats)
        return self.classifier(feats)


clip_base = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
clip_model = CLIPMultiLabel(clip_base, len(label_list))
clip_model.load_state_dict(
    torch.load("models/clip_multilabel_checkpoint.pth", map_location=device)
)
clip_model.to(device)
clip_model.eval()

transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor()
])

clip_threshold = 0.5


def run_clip(pil_img, thresh=clip_threshold):
    """Return every label whose sigmoid probability exceeds the threshold."""
    img_tensor = transform(pil_img).unsqueeze(0).to(device)
    with torch.no_grad():
        logits = clip_model(img_tensor)
        probs = torch.sigmoid(logits)
    return [label_list[i] for i in range(len(label_list)) if probs[0, i] > thresh]


# =======================
# COMBINED PIPELINE
# =======================
def run_full_pipeline(image_bytes):
    # 1️⃣ Load image
    np_arr = np.frombuffer(image_bytes, np.uint8)
    img = cv2.imdecode(np_arr, cv2.IMREAD_COLOR)
    h, w = img.shape[:2]
    pil_img = Image.fromarray(cv2.cvtColor(img, cv2.COLOR_BGR2RGB))

    # =======================
    # 2️⃣ Mask UI elements
    # =======================
    ui_results = ui_model(img)[0]
    mask = np.zeros((h, w), dtype=np.uint8)
    for box, cls_id in zip(ui_results.boxes.xyxy, ui_results.boxes.cls):
        if int(cls_id) in KEEP_IDS:
            x1, y1, x2, y2 = map(int, box)
            cv2.rectangle(mask, (x1, y1), (x2, y2), 255, -1)
    masked_img = cv2.bitwise_and(img, img, mask=mask)

    # =======================
    # 3️⃣ Face blurring
    # =======================
    blob = cv2.dnn.blobFromImage(
        cv2.resize(masked_img, (300, 300)), 1.0, (300, 300), (104, 177, 123)
    )
    face_net.setInput(blob)
    detections = face_net.forward()
    for i in range(detections.shape[2]):
        conf = detections[0, 0, i, 2]
        if conf > 0.55:
            box = detections[0, 0, i, 3:7] * np.array([w, h, w, h])
            x1, y1, x2, y2 = box.astype(int)
            x1, y1 = max(0, x1), max(0, y1)
            x2, y2 = min(w, x2), min(h, y2)
            face_region = masked_img[y1:y2, x1:x2]
            if face_region.size > 0:
                blurred = cv2.GaussianBlur(face_region, (85, 85), 25)
                masked_img[y1:y2, x1:x2] = blurred

    # =======================
    # 4️⃣ OCR masking
    # =======================
    pil_masked = Image.fromarray(cv2.cvtColor(masked_img, cv2.COLOR_BGR2RGB))
    draw = ImageDraw.Draw(pil_masked)
    ocr_res = reader.readtext(masked_img)
    for bbox, text, conf in ocr_res:
        if EMAIL_PATTERN.search(text) or PHONE_PATTERN.search(text):
            (x1, y1), _, (x2, y2), _ = bbox
            draw.rectangle([x1, y1, x2, y2], fill="black")
    masked_img = cv2.cvtColor(np.array(pil_masked), cv2.COLOR_RGB2BGR)

    # =======================
    # 5️⃣ Food detection + CLIP
    # =======================
    food_out = food_model(masked_img)[0]
    boxes = food_out.boxes.xyxy.cpu().numpy()
    classes = food_out.boxes.cls.cpu().numpy().astype(int)
    final_labels = set()
    for i, box in enumerate(boxes):
        x1, y1, x2, y2 = map(int, box)
        crop = pil_masked.crop((x1, y1, x2, y2))
        clip_preds = run_clip(crop)
        final_labels.update(clip_preds)
        yolo_label = label_list[classes[i]]
        final_labels.add(yolo_label)
        cv2.rectangle(masked_img, (x1, y1), (x2, y2), (0, 255, 0), 2)
        cv2.putText(masked_img, f"{yolo_label}", (x1, y1 - 10),
                    cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0), 2)

    # Also run CLIP on whole image
    full_clip = run_clip(pil_masked, thresh=0.3)
    final_labels.update(full_clip)

    # =======================
    # 6️⃣ Convert to Base64
    # =======================
    _, buffer = cv2.imencode(".jpg", masked_img)
    img_b64 = base64.b64encode(buffer).decode()
    return img_b64, list(final_labels)
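
# -----------------------------------------------------------------------------
# Minimal local smoke test (a sketch, not part of the pipeline itself): reads a
# hypothetical "example.jpg" from disk, runs the full pipeline, prints the
# detected labels, and writes the redacted image to "output.jpg" so the
# masking/blurring can be inspected. Both file names are placeholders.
# -----------------------------------------------------------------------------
if __name__ == "__main__":
    with open("example.jpg", "rb") as f:  # placeholder input path
        image_bytes = f.read()

    encoded_img, labels = run_full_pipeline(image_bytes)
    print("Detected labels:", labels)

    # The pipeline returns the annotated image as a Base64-encoded JPEG string;
    # decode it back to raw bytes before writing to disk.
    with open("output.jpg", "wb") as f:  # placeholder output path
        f.write(base64.b64decode(encoded_img))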