# combined_pipeline.py
import cv2
import numpy as np
from PIL import Image, ImageDraw
import easyocr
import base64
import re
import torch
import yaml
from torchvision import transforms
from transformers import CLIPModel
from ultralytics import YOLO

# =======================
# LOAD MODELS ON STARTUP
# =======================

# 1️⃣ UI ELEMENT YOLO MODEL
ui_model = YOLO("UI_element.pt")

# 2️⃣ FOOD YOLO MODEL
food_model = YOLO("models/Yolos.pt")

# 3️⃣ OpenCV Face DNN
face_net = cv2.dnn.readNetFromCaffe(
    "deploy.prototxt",
    "res10_300x300_ssd_iter_140000.caffemodel"
)

# 4️⃣ OCR
reader = easyocr.Reader(["en"])

# Classes to keep from the UI model; everything else gets masked out.
KEEP_CLASSES = ["BackgroundImage", "Image", "Text"]
name_to_id = {v: k for k, v in ui_model.names.items()}
KEEP_IDS = [name_to_id[c] for c in KEEP_CLASSES if c in name_to_id]

# Regex patterns for personal info
EMAIL_PATTERN = re.compile(r"\S+@\S+\.\S+")
PHONE_PATTERN = re.compile(r"\b\d{6,15}\b")

# CLIP setup
device = "cuda" if torch.cuda.is_available() else "cpu"

with open("data/data.yaml", "r") as f:
    data_info = yaml.safe_load(f)
label_list = data_info["names"]


class CLIPMultiLabel(torch.nn.Module):
    """CLIP vision encoder with a linear multi-label head; encoder features are computed under no_grad."""

    def __init__(self, clip_model, num_labels):
        super().__init__()
        self.image_encoder = clip_model.vision_model
        self.proj = clip_model.visual_projection
        self.classifier = torch.nn.Linear(self.proj.out_features, num_labels)

    def forward(self, images):
        with torch.no_grad():
            feats = self.image_encoder(images).pooler_output
            feats = self.proj(feats)
        return self.classifier(feats)


clip_base = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
clip_model = CLIPMultiLabel(clip_base, len(label_list))
clip_model.load_state_dict(
    torch.load("models/clip_multilabel_checkpoint.pth", map_location=device)
)
clip_model.to(device)
clip_model.eval()

transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor()
])

clip_threshold = 0.5


def run_clip(pil_img, thresh=clip_threshold):
    """Return every label whose sigmoid probability exceeds the threshold."""
    img_tensor = transform(pil_img).unsqueeze(0).to(device)
    with torch.no_grad():
        logits = clip_model(img_tensor)
        probs = torch.sigmoid(logits)
    return [label_list[i] for i in range(len(label_list)) if probs[0, i] > thresh]


# =======================
# COMBINED PIPELINE
# =======================
def run_full_pipeline(image_bytes):
    # 1️⃣ Load image
    np_arr = np.frombuffer(image_bytes, np.uint8)
    img = cv2.imdecode(np_arr, cv2.IMREAD_COLOR)
    h, w = img.shape[:2]
    pil_img = Image.fromarray(cv2.cvtColor(img, cv2.COLOR_BGR2RGB))

    # =======================
    # 2️⃣ Mask UI elements
    # =======================
    ui_results = ui_model(img)[0]
    mask = np.zeros((h, w), dtype=np.uint8)
    for box, cls_id in zip(ui_results.boxes.xyxy, ui_results.boxes.cls):
        if int(cls_id) in KEEP_IDS:
            x1, y1, x2, y2 = map(int, box)
            cv2.rectangle(mask, (x1, y1), (x2, y2), 255, -1)
    masked_img = cv2.bitwise_and(img, img, mask=mask)

    # =======================
    # 3️⃣ Face blurring
    # =======================
    blob = cv2.dnn.blobFromImage(
        cv2.resize(masked_img, (300, 300)), 1.0, (300, 300), (104, 177, 123)
    )
    face_net.setInput(blob)
    detections = face_net.forward()
    for i in range(detections.shape[2]):
        conf = detections[0, 0, i, 2]
        if conf > 0.55:
            box = detections[0, 0, i, 3:7] * np.array([w, h, w, h])
            x1, y1, x2, y2 = box.astype(int)
            x1, y1 = max(0, x1), max(0, y1)
            x2, y2 = min(w, x2), min(h, y2)
            face_region = masked_img[y1:y2, x1:x2]
            if face_region.size > 0:
                blurred = cv2.GaussianBlur(face_region, (85, 85), 25)
                masked_img[y1:y2, x1:x2] = blurred

    # =======================
    # 4️⃣ OCR masking
    # =======================
    pil_masked = Image.fromarray(cv2.cvtColor(masked_img, cv2.COLOR_BGR2RGB))
    draw = ImageDraw.Draw(pil_masked)
    ocr_res = reader.readtext(masked_img)
    for bbox, text, conf in ocr_res:
        if EMAIL_PATTERN.search(text) or PHONE_PATTERN.search(text):
            (x1, y1), _, (x2, y2), _ = bbox
            draw.rectangle([x1, y1, x2, y2], fill="black")
    masked_img = cv2.cvtColor(np.array(pil_masked), cv2.COLOR_RGB2BGR)

    # =======================
    # 5️⃣ Food detection + CLIP
    # =======================
    food_out = food_model(masked_img)[0]
    boxes = food_out.boxes.xyxy.cpu().numpy()
    classes = food_out.boxes.cls.cpu().numpy().astype(int)
    final_labels = set()
    for i, box in enumerate(boxes):
        x1, y1, x2, y2 = map(int, box)
        crop = pil_masked.crop((x1, y1, x2, y2))
        clip_preds = run_clip(crop)
        final_labels.update(clip_preds)
        yolo_label = label_list[classes[i]]
        final_labels.add(yolo_label)
        cv2.rectangle(masked_img, (x1, y1), (x2, y2), (0, 255, 0), 2)
        cv2.putText(masked_img, f"{yolo_label}", (x1, y1 - 10),
                    cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0), 2)

    # Also run CLIP on whole image
    full_clip = run_clip(pil_masked, thresh=0.3)
    final_labels.update(full_clip)

    # =======================
    # 6️⃣ Convert to Base64
    # =======================
    _, buffer = cv2.imencode(".jpg", masked_img)
    img_b64 = base64.b64encode(buffer).decode()
    return img_b64, list(final_labels)
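
# -----------------------------------------------------------------------------
# Minimal local smoke test (a sketch, not part of the pipeline itself): reads a
# hypothetical "example.jpg" from disk, runs the full pipeline, prints the
# detected labels, and writes the redacted image to "output.jpg" so the
# masking/blurring can be inspected. Both file names are placeholders.
# -----------------------------------------------------------------------------
if __name__ == "__main__":
    with open("example.jpg", "rb") as f:  # placeholder input path
        image_bytes = f.read()

    encoded_img, labels = run_full_pipeline(image_bytes)
    print("Detected labels:", labels)

    # The pipeline returns the annotated image as a Base64-encoded JPEG string;
    # decode it back to raw bytes before writing to disk.
    with open("output.jpg", "wb") as f:  # placeholder output path
        f.write(base64.b64decode(encoded_img))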