anhkhoiphan committed
Commit a519263 · 1 Parent(s): 4e3aadb

Add data processing

Files changed (4)
  1. .gitignore +1 -0
  2. app.py +70 -3
  3. data_indexing.py +235 -156
  4. data_processing.py +697 -0
.gitignore CHANGED
@@ -1,2 +1,3 @@
  /venv
  .env
+ __pycache__/
app.py CHANGED
@@ -1,4 +1,71 @@
- from data_indexing import create_gradio_interface
-
- demo = create_gradio_interface()
- demo.launch()
+ import sys
+ import os
+
+ import gradio as gr
+
+ # Import your interfaces
+ from data_indexing import create_indexing_interface
+ from data_processing import create_processing_interface
+
+ def create_combined_app():
+     """Create combined app with two tabs"""
+
+     with gr.Blocks(
+         title="Rạng Đông Data Management System",
+         theme=gr.themes.Soft()
+     ) as app:
+
+         gr.Markdown("""
+         # 🏢 Rạng Đông Data Management System
+
+         Hệ thống quản lý dữ liệu sản phẩm và giải pháp của Rạng Đông
+         """)
+
+         with gr.Tabs() as tabs:
+             # Tab 1: Vector Indexing (MongoDB to Qdrant)
+             with gr.Tab("🗄️ Vector Indexing", id="indexing"):
+                 gr.Markdown("""
+                 ## Indexing dữ liệu từ MongoDB lên Qdrant
+
+                 Tạo vector embeddings và index dữ liệu từ MongoDB lên Qdrant Vector Database
+                 """)
+
+                 # Create indexing interface
+                 indexing_interface = create_indexing_interface()
+
+             # Tab 2: Data Processing (Excel to MongoDB)
+             with gr.Tab("📊 Data Processing", id="processing"):
+                 gr.Markdown("""
+                 ## Xử lý dữ liệu từ Excel lên MongoDB
+
+                 Upload file Excel, xử lý dữ liệu sản phẩm và đẩy lên MongoDB Atlas
+                 """)
+
+                 # Create processing interface
+                 processing_interface = create_processing_interface()
+
+         gr.Markdown("""
+         ---
+         ### 📖 Hướng dẫn sử dụng
+
+         **Bước 1: Data Processing**
+         1. Upload file Excel chứa dữ liệu sản phẩm (product_Metadata.xlsx)
+         2. Cấu hình MongoDB connection string, database name và test connection
+         3. Chọn loại sản phẩm hoặc xử lý tất cả
+
+         **Bước 2: Vector Indexing**
+         1. Chọn collection cần indexing
+         2. Hệ thống sẽ tạo embeddings và đẩy lên Qdrant
+         """)
+
+     return app
+
+
+ if __name__ == "__main__":
+     app = create_combined_app()
+
+     # Launch with appropriate settings for Hugging Face Spaces
+     app.launch()
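
Note: the combined app mounts two existing Gradio Blocks apps as tabs by calling their factory functions inside gr.Tab contexts. A minimal sketch of that pattern is below, assuming each factory returns a gr.Blocks; the builder names are hypothetical stand-ins for create_indexing_interface / create_processing_interface, and gr.Blocks.render() is shown as one way to attach an already-built Blocks to the enclosing app.

import gradio as gr

def build_indexing_tab() -> gr.Blocks:
    # Hypothetical stand-in for create_indexing_interface()
    with gr.Blocks() as block:
        gr.Markdown("Indexing controls go here")
    return block

def build_processing_tab() -> gr.Blocks:
    # Hypothetical stand-in for create_processing_interface()
    with gr.Blocks() as block:
        gr.Markdown("Processing controls go here")
    return block

with gr.Blocks(title="Combined demo") as app:
    with gr.Tabs():
        with gr.Tab("Indexing"):
            build_indexing_tab().render()   # render the prebuilt Blocks inside this tab
        with gr.Tab("Processing"):
            build_processing_tab().render()

if __name__ == "__main__":
    app.launch()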
data_indexing.py CHANGED
@@ -7,11 +7,6 @@ import sys
  from typing import List, Dict, Tuple, Any, Optional
  import uuid
 
- # Add project root to Python path
- project_root = os.path.abspath(os.path.join(os.path.dirname(__file__), '..', '..'))
- if project_root not in sys.path:
-     sys.path.insert(0, project_root)
-
  from PIL import Image
  from FlagEmbedding import BGEM3FlagModel
  import gradio as gr
@@ -22,6 +17,7 @@ from qdrant_client.http.models import Modifier, Distance, SparseVectorParams, Ve
  import torch
  from transformers import EfficientNetModel, AutoImageProcessor
  from pymongo import MongoClient
+ import contextlib
 
  from config import (
      QDRANT_COLLECTION_NAME_SPCHIEUSANG,
@@ -151,6 +147,26 @@ mongodb_solution_collections = {
      "nha_o_xa_hoi": "gp_nha_o_xa_hoi"
  }
 
+ class OutputCapture:
+     """Context manager to capture stdout and stderr"""
+     def __init__(self):
+         self.buffer = io.StringIO()
+         self.old_stdout = None
+         self.old_stderr = None
+
+     def __enter__(self):
+         self.old_stdout = sys.stdout
+         self.old_stderr = sys.stderr
+         sys.stdout = self.buffer
+         sys.stderr = self.buffer
+         return self.buffer
+
+     def __exit__(self, *args):
+         sys.stdout = self.old_stdout
+         sys.stderr = self.old_stderr
+
+     def getvalue(self):
+         return self.buffer.getvalue()
 
  """=================MONGODB CONNECTION========================"""
  class MongoDBConnection:
@@ -686,48 +702,72 @@ class ProductIndexing:
              reload: Whether to recreate collections
              hybrid_mode: Whether to use hybrid text embedding (BGEM3)
          """
-         if reload:
-             try:
-                 for collection in product_collections:
-                     self.client.recreate_collection(
-                         collection_name=collection,
-                         vectors_config=product_vectors_config,
-                         sparse_vectors_config=sparse_vectors_config
-                     )
-                 print("All product collections recreated.")
-             except Exception as e:
-                 print(f"Error while recreating collections: {e}")
-                 return
-
-         # Setup MongoDB connection
-         if not self.mongodb_conn:
-             if not self.setup_mongodb():
-                 print("❌ Failed to connect to MongoDB. Aborting indexing.")
-                 return
-
-         # Create embedding processor
-         embed_object = ProductEmbedding()
-
-         for collection, product_type in zip(product_collections, product_types):
-             print(f"\n🔄 Processing {product_type} data from MongoDB...")
-
-             # Generate embeddings for specific product type
-             embeddings = embed_object.run_embedding(
-                 product_type=product_type,
-                 mongodb_conn=self.mongodb_conn,
-                 hybrid_mode=hybrid_mode
-             )
-
-             # Index embeddings to specific collection
-             self.index(embeddings, collection)
-             self._create_payload_indexes_for_product_type(product_type, collection)
-
-         # Close MongoDB connection
-         if self.mongodb_conn:
-             self.mongodb_conn.close()
-             self.mongodb_conn = None
+         with OutputCapture() as output:
+             try:
+                 if reload:
+                     try:
+                         for collection in product_collections:
+                             self.client.recreate_collection(
+                                 collection_name=collection,
+                                 vectors_config=product_vectors_config,
+                                 sparse_vectors_config=sparse_vectors_config
+                             )
+                         print("✅ All product collections recreated.")
+                     except Exception as e:
+                         print(f"❌ Error while recreating collections: {e}")
+                         return output.getvalue()
+
+                 # Setup MongoDB connection
+                 if not self.mongodb_conn:
+                     if not self.setup_mongodb():
+                         print("❌ Failed to connect to MongoDB. Aborting indexing.")
+                         return output.getvalue()
+
+                 # Create embedding processor
+                 embed_object = ProductEmbedding()
+
+                 for collection, product_type in zip(product_collections, product_types):
+                     print(f"\n{'='*60}")
+                     print(f"🔄 Processing {product_type} data from MongoDB...")
+                     print(f"{'='*60}")
+
+                     try:
+                         # Generate embeddings for specific product type
+                         embeddings = embed_object.run_embedding(
+                             product_type=product_type,
+                             mongodb_conn=self.mongodb_conn,
+                             hybrid_mode=hybrid_mode
+                         )
+
+                         # Index embeddings to specific collection
+                         self.index(embeddings, collection)
+                         self._create_payload_indexes_for_product_type(product_type, collection)
+
+                         print(f"✅ Completed indexing for {product_type}")
+
+                     except Exception as e:
+                         print(f"❌ Error indexing {product_type}: {e}")
+                         import traceback
+                         print(traceback.format_exc())
+
+                 # Close MongoDB connection
+                 if self.mongodb_conn:
+                     self.mongodb_conn.close()
+                     self.mongodb_conn = None
+
+                 print(f"\n{'='*60}")
+                 print("🎉 All indexing completed!")
+                 print(f"{'='*60}")
+
+             except Exception as e:
+                 print(f" Fatal error during indexing: {e}")
+                 import traceback
+                 print(traceback.format_exc())
+
+             return output.getvalue()
 
-     def indexing_single_product_type(self, product_type: str, collection_name: str, hybrid_mode: bool = True) -> str:
+     def indexing_single_product_type(self, product_type: str, collection_name: str,
+                                      hybrid_mode: bool = True) -> str:
          """
          Indexing a single product group into its Qdrant collection from MongoDB
          Args:
@@ -735,46 +775,56 @@ class ProductIndexing:
              collection_name: Qdrant collection name
              hybrid_mode: Whether to use hybrid text embedding (BGEM3)
          """
-         buffer = io.StringIO()
-         sys.stdout = buffer
-
-         try:
-             self.client.recreate_collection(
-                 collection_name=collection_name,
-                 vectors_config=product_vectors_config,
-                 sparse_vectors_config=sparse_vectors_config
-             )
-             print(f"Collection {collection_name} created")
-
-             # Setup MongoDB connection
-             if not self.mongodb_conn:
-                 if not self.setup_mongodb():
-                     print("❌ Failed to connect to MongoDB")
-                     sys.stdout = sys.__stdout__
-                     return buffer.getvalue()
-
-             # Create embedding processor
-             embed_object = ProductEmbedding()
-
-             print(f"\n🔄 Processing {product_type} data from MongoDB...")
-             embeddings = embed_object.run_embedding(
-                 product_type=product_type,
-                 mongodb_conn=self.mongodb_conn,
-                 hybrid_mode=hybrid_mode
-             )
-             self.index(embeddings, collection_name)
-
-             # Close MongoDB connection
-             if self.mongodb_conn:
-                 self.mongodb_conn.close()
-                 self.mongodb_conn = None
-
-         except Exception as e:
-             print(f"Error while indexing product type {product_type}: {e}")
-
-         self._create_payload_indexes_for_product_type(product_type, collection_name)
-         sys.stdout = sys.__stdout__
-         return buffer.getvalue()
+         with OutputCapture() as output:
+             try:
+                 print(f"{'='*60}")
+                 print(f"🚀 Starting indexing for {product_type}")
+                 print(f"{'='*60}\n")
+
+                 self.client.recreate_collection(
+                     collection_name=collection_name,
+                     vectors_config=product_vectors_config,
+                     sparse_vectors_config=sparse_vectors_config
+                 )
+                 print(f"✅ Collection {collection_name} created\n")
+
+                 # Setup MongoDB connection
+                 if not self.mongodb_conn:
+                     if not self.setup_mongodb():
+                         print("❌ Failed to connect to MongoDB")
+                         return output.getvalue()
+
+                 # Create embedding processor
+                 embed_object = ProductEmbedding()
+
+                 print(f"🔄 Processing {product_type} data from MongoDB...")
+                 embeddings = embed_object.run_embedding(
+                     product_type=product_type,
+                     mongodb_conn=self.mongodb_conn,
+                     hybrid_mode=hybrid_mode
+                 )
+
+                 print(f"\n📊 Indexing to Qdrant...")
+                 self.index(embeddings, collection_name)
+
+                 print(f"\n🔍 Creating payload indexes...")
+                 self._create_payload_indexes_for_product_type(product_type, collection_name)
+
+                 # Close MongoDB connection
+                 if self.mongodb_conn:
+                     self.mongodb_conn.close()
+                     self.mongodb_conn = None
+
+                 print(f"\n{'='*60}")
+                 print(f"✅ Successfully completed indexing for {product_type}")
+                 print(f"{'='*60}")
+
+             except Exception as e:
+                 print(f"Error while indexing product type {product_type}: {e}")
+                 import traceback
+                 print(traceback.format_exc())
+
+             return output.getvalue()
 
      def _create_payload_indexes_for_product_type(self, product_type: str, collection_name: str):
          """Create payload indexes based on product type field schemas"""
@@ -1035,83 +1085,113 @@ class SolutionIndexing:
 
      def run_indexing(self, reload: bool = True):
          """Index all solution data from MongoDB into Qdrant collections."""
-         if reload:
-             try:
-                 for collection in solution_collections:
-                     self.client.recreate_collection(
-                         collection_name=collection,
-                         vectors_config=qdrant_client.http.models.VectorParams(
-                             size=768,
-                             distance=qdrant_client.http.models.Distance.COSINE,
-                         )
-                     )
-                 print("All solution collections recreated.")
-             except Exception as e:
-                 print(f"Error while recreating collections: {e}")
-                 return
-
-         # Setup MongoDB connection
-         if not self.mongodb_conn:
-             if not self.setup_mongodb():
-                 print("❌ Failed to connect to MongoDB. Aborting indexing.")
-                 return
-
-         # Create embedding processor
-         embed_object = SolutionEmbedding()
-
-         for collection, solution_type in zip(solution_collections, solution_types):
-             print(f"\n🔄 Processing {solution_type} data from MongoDB...")
-             embeddings = embed_object.run_embedding(solution_type, self.mongodb_conn)
-             self.index(embeddings, collection)
-
-         # Close MongoDB connection
-         if self.mongodb_conn:
-             self.mongodb_conn.close()
-             self.mongodb_conn = None
+         with OutputCapture() as output:
+             try:
+                 if reload:
+                     try:
+                         for collection in solution_collections:
+                             self.client.recreate_collection(
+                                 collection_name=collection,
+                                 vectors_config=qdrant_client.http.models.VectorParams(
+                                     size=768,
+                                     distance=qdrant_client.http.models.Distance.COSINE,
+                                 )
+                             )
+                         print("✅ All solution collections recreated.")
+                     except Exception as e:
+                         print(f"❌ Error while recreating collections: {e}")
+                         return output.getvalue()
+
+                 # Setup MongoDB connection
+                 if not self.mongodb_conn:
+                     if not self.setup_mongodb():
+                         print("❌ Failed to connect to MongoDB. Aborting indexing.")
+                         return output.getvalue()
+
+                 # Create embedding processor
+                 embed_object = SolutionEmbedding()
+
+                 for collection, solution_type in zip(solution_collections, solution_types):
+                     print(f"\n{'='*60}")
+                     print(f"🔄 Processing {solution_type} data from MongoDB...")
+                     print(f"{'='*60}")
+
+                     try:
+                         embeddings = embed_object.run_embedding(solution_type, self.mongodb_conn)
+                         self.index(embeddings, collection)
+                         print(f"✅ Completed indexing for {solution_type}")
+                     except Exception as e:
+                         print(f"❌ Error indexing {solution_type}: {e}")
+                         import traceback
+                         print(traceback.format_exc())
+
+                 # Close MongoDB connection
+                 if self.mongodb_conn:
+                     self.mongodb_conn.close()
+                     self.mongodb_conn = None
+
+                 print(f"\n{'='*60}")
+                 print("🎉 All solution indexing completed!")
+                 print(f"{'='*60}")
+
+             except Exception as e:
+                 print(f" Fatal error during indexing: {e}")
+                 import traceback
+                 print(traceback.format_exc())
+
+             return output.getvalue()
 
      def indexing_single_solution(self, solution: str, collection_name: str) -> str:
          """Indexing a single solution into its Qdrant collection from MongoDB"""
-         buffer = io.StringIO()
-         sys.stdout = buffer
-
-         try:
-             self.client.recreate_collection(
-                 collection_name=collection_name,
-                 vectors_config=qdrant_client.http.models.VectorParams(
-                     size=768,
-                     distance=qdrant_client.http.models.Distance.COSINE,
-                 )
-             )
-             print(f"Collection {collection_name} created")
-
-             # Setup MongoDB connection
-             if not self.mongodb_conn:
-                 if not self.setup_mongodb():
-                     print("❌ Failed to connect to MongoDB")
-                     sys.stdout = sys.__stdout__
-                     return buffer.getvalue()
-
-             # Create embedding processor
-             embed_object = SolutionEmbedding()
-
-             print(f"\n🔄 Processing {solution} data from MongoDB...")
-             embeddings = embed_object.run_embedding(solution, self.mongodb_conn)
-             self.index(embeddings, collection_name)
-
-             # Close MongoDB connection
-             if self.mongodb_conn:
-                 self.mongodb_conn.close()
-                 self.mongodb_conn = None
-
-         except Exception as e:
-             print(f"Error while recreating collection and indexing solution {solution}: {e}")
-
-         sys.stdout = sys.__stdout__
-         return buffer.getvalue()
+         with OutputCapture() as output:
+             try:
+                 print(f"{'='*60}")
+                 print(f"🚀 Starting indexing for {solution}")
+                 print(f"{'='*60}\n")
+
+                 self.client.recreate_collection(
+                     collection_name=collection_name,
+                     vectors_config=qdrant_client.http.models.VectorParams(
+                         size=768,
+                         distance=qdrant_client.http.models.Distance.COSINE,
+                     )
+                 )
+                 print(f"✅ Collection {collection_name} created\n")
+
+                 # Setup MongoDB connection
+                 if not self.mongodb_conn:
+                     if not self.setup_mongodb():
+                         print("❌ Failed to connect to MongoDB")
+                         return output.getvalue()
+
+                 # Create embedding processor
+                 embed_object = SolutionEmbedding()
+
+                 print(f"🔄 Processing {solution} data from MongoDB...")
+                 embeddings = embed_object.run_embedding(solution, self.mongodb_conn)
+
+                 print(f"\n📊 Indexing to Qdrant...")
+                 self.index(embeddings, collection_name)
+
+                 # Close MongoDB connection
+                 if self.mongodb_conn:
+                     self.mongodb_conn.close()
+                     self.mongodb_conn = None
+
+                 print(f"\n{'='*60}")
+                 print(f"✅ Successfully completed indexing for {solution}")
+                 print(f"{'='*60}")
+
+             except Exception as e:
+                 print(f"❌ Error while indexing solution {solution}: {e}")
+                 import traceback
+                 print(traceback.format_exc())
+
+             return output.getvalue()
 
 
  """=================GRADIO UI========================"""
- def create_gradio_interface():
+ def create_indexing_interface():
      """Create Gradio interface for indexing from MongoDB"""
      product_indexing = ProductIndexing()
      solution_indexing = SolutionIndexing()
@@ -1120,7 +1200,13 @@ def create_gradio_interface():
      gr.Markdown("# 🗄️ Qdrant Data Indexing System (MongoDB)")
      gr.Markdown("Recreate Qdrant Collections and Index Data from MongoDB Atlas")
 
-     output_box = gr.Textbox(lines=15, label="📋 Logs", interactive=False)
+     output_box = gr.Textbox(
+         lines=20,
+         label="📋 Logs",
+         interactive=False,
+         show_copy_button=True,
+         max_lines=30
+     )
 
      gr.Markdown("---")
      gr.Markdown("## 🏢 Giải pháp (Solutions)")
@@ -1223,20 +1309,13 @@ def create_gradio_interface():
          inputs=[gr.State("thiet_bi_dien"), gr.State(QDRANT_COLLECTION_NAME_SPTHIETBIDIEN), gr.State(True)],
          outputs=output_box)
 
-     def index_all_products():
-         buffer = io.StringIO()
-         sys.stdout = buffer
-         product_indexing.run_indexing(reload=True, hybrid_mode=True)
-         sys.stdout = sys.__stdout__
-         return buffer.getvalue()
-
      btn_all_products.click(
-         index_all_products,
+         product_indexing.run_indexing,
+         inputs=[gr.State(True), gr.State(True)],
          outputs=output_box)
 
      return demo
 
-
  if __name__ == "__main__":
-     demo = create_gradio_interface()
+     demo = create_indexing_interface()
      demo.launch()
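
Note: the indexing methods now return their logs as a string for the Gradio Textbox instead of printing to the console; OutputCapture does this by temporarily pointing sys.stdout and sys.stderr at an io.StringIO buffer. A minimal sketch of the same idea using only the standard library (contextlib.redirect_stdout / redirect_stderr; the diff imports contextlib but defines its own capture class) is:

import contextlib
import io

def run_with_captured_logs() -> str:
    # Everything printed inside the block ends up in the buffer,
    # which can then be returned as the value of a gr.Textbox output.
    buffer = io.StringIO()
    with contextlib.redirect_stdout(buffer), contextlib.redirect_stderr(buffer):
        print("🔄 Processing ...")
        print("✅ Done")
    return buffer.getvalue()

if __name__ == "__main__":
    print(run_with_captured_logs(), end="")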
data_processing.py ADDED
@@ -0,0 +1,697 @@
+ import sys
+ import os
+ from pathlib import Path
+
+ import requests
+ import re
+ import tempfile
+ import json
+ import math
+ import time
+ import warnings
+ from typing import Dict, List
+ from urllib3.exceptions import IncompleteRead
+ from datetime import datetime
+
+ import docling
+ from docling.document_converter import DocumentConverter, PdfFormatOption
+ from docling.datamodel.base_models import InputFormat
+ from docling.datamodel.pipeline_options import PdfPipelineOptions
+ from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
+ import pandas as pd
+ import gradio as gr
+ from pymongo import MongoClient, UpdateOne
+ from pymongo.errors import ConnectionFailure, OperationFailure
+
+ from data_helper import *
+ from config import MONGODB_URI
+
+ # Suppress PyTorch DataLoader pin_memory warning on MPS
+ warnings.filterwarnings("ignore", message=".*pin_memory.*not supported on MPS.*")
+
+ class MongoDBHandler:
+     """Handler for MongoDB operations"""
+
+     def __init__(self, connection_string: str = None, database_name: str = "product_database"):
+         """
+         Initialize MongoDB connection
+
+         Args:
+             connection_string: MongoDB connection string (default: localhost)
+             database_name: Name of the database to use
+         """
+         if connection_string is None:
+             connection_string = "mongodb://localhost:27017/"
+
+         self.connection_string = connection_string
+         self.database_name = database_name
+         self.client = None
+         self.db = None
+
+     def connect(self):
+         """Establish connection to MongoDB"""
+         try:
+             self.client = MongoClient(self.connection_string, serverSelectionTimeoutMS=5000)
+             # Test connection
+             self.client.admin.command('ping')
+             self.db = self.client[self.database_name]
+             print(f"✅ Connected to MongoDB database: {self.database_name}")
+             return True
+         except ConnectionFailure as e:
+             print(f"❌ Failed to connect to MongoDB: {e}")
+             return False
+         except Exception as e:
+             print(f"❌ Unexpected error connecting to MongoDB: {e}")
+             return False
+
+     def disconnect(self):
+         """Close MongoDB connection"""
+         if self.client is not None:
+             self.client.close()
+             print("🔌 Disconnected from MongoDB")
+
+     def get_collection_name(self, category: str) -> str:
+         """Map category name to collection name"""
+         collection_mapping = {
+             "Sản phẩm nhà thông minh": "sp_nha_thong_minh",
+             "Đèn LED": "sp_chieu_sang",
+             "Chiếu sáng chuyên dụng": "sp_chuyen_dung",
+             "Thiết bị điện": "sp_thiet_bi_dien",
+             "Phích nước": "sp_phich_nuoc",
+         }
+         return collection_mapping.get(category, "unknown_products")
+
+     def upload_data(self, data: List[Dict], collection_name: str, upsert: bool = True) -> Dict:
+         """
+         Upload data to MongoDB collection
+
+         Args:
+             data: List of product dictionaries
+             collection_name: Name of the collection
+             upsert: If True, update existing documents or insert new ones
+
+         Returns:
+             Dictionary with upload statistics
+         """
+         if self.db is None:
+             return {"success": False, "error": "Not connected to database"}
+
+         if not data:
+             return {"success": False, "error": "No data to upload"}
+
+         try:
+             collection = self.db[collection_name]
+
+             # Add metadata
+             timestamp = datetime.utcnow()
+             for item in data:
+                 item['_updated_at'] = timestamp
+                 if '_created_at' not in item:
+                     item['_created_at'] = timestamp
+
+             if upsert:
+                 # Use bulk write with upsert for better performance
+                 operations = []
+                 for item in data:
+                     product_id = item.get('Product_ID')
+                     if product_id:
+                         operations.append(
+                             UpdateOne(
+                                 {'Product_ID': product_id},
+                                 {'$set': item},
+                                 upsert=True
+                             )
+                         )
+
+                 if operations:
+                     result = collection.bulk_write(operations)
+                     return {
+                         "success": True,
+                         "collection": collection_name,
+                         "inserted": result.upserted_count,
+                         "modified": result.modified_count,
+                         "matched": result.matched_count,
+                         "total": len(data)
+                     }
+                 else:
+                     return {"success": False, "error": "No valid product IDs found"}
+             else:
+                 # Simple insert (may cause duplicates)
+                 result = collection.insert_many(data)
+                 return {
+                     "success": True,
+                     "collection": collection_name,
+                     "inserted": len(result.inserted_ids),
+                     "total": len(data)
+                 }
+
+         except OperationFailure as e:
+             return {"success": False, "error": f"MongoDB operation failed: {e}"}
+         except Exception as e:
+             return {"success": False, "error": f"Unexpected error: {e}"}
+
+     def test_connection(self) -> str:
+         """Test MongoDB connection and return status"""
+         try:
+             if self.connect():
+                 # Get database stats
+                 stats = self.db.command("dbstats")
+                 collections = self.db.list_collection_names()
+                 self.disconnect()
+                 return f"✅ Connected successfully!\n📊 Database: {self.database_name}\n📁 Collections: {len(collections)}\n💾 Size: {stats.get('dataSize', 0) / 1024 / 1024:.2f} MB"
+             else:
+                 return "❌ Connection failed"
+         except Exception as e:
+             return f"❌ Error: {str(e)}"
+
+
+ class DataProcessing:
+     def __init__(self):
+         pass
+
+     def get_data_from_excel_file(self, excel_path, key_match, collection_name,
+                                  processor_type="docling", mongo_handler=None):
+         """
+         Process Excel file and upload to MongoDB
+
+         Args:
+             excel_path: Path to Excel file
+             key_match: Category to match
+             collection_name: MongoDB collection name
+             processor_type: Type of PDF processor
+             mongo_handler: MongoDBHandler instance (required)
+         """
+         if not mongo_handler:
+             return "❌ MongoDB handler not provided"
+
+         all_sheets = pd.read_excel(excel_path, sheet_name=None, header=1)
+         sheet_names = list(all_sheets.keys())
+         sheets = {k: all_sheets[k] for k in sheet_names[2:]}
+
+         data = []
+
+         for sheet_name, df in sheets.items():
+             df.columns = df.columns.str.strip()
+             if "category 1" not in df.columns:
+                 df = pd.read_excel(excel_path, sheet_name=sheet_name, header=0)
+                 df.columns = df.columns.str.strip()
+
+             if "category 1" in df.columns:
+                 filtered = df[df["category 1"].astype(str).str.replace("\n", " ").str.strip() == key_match]
+                 data.append(filtered)
+
+         if data:
+             result_df = pd.concat(data, ignore_index=True)
+             result_df = result_df.where(pd.notnull(result_df), None)
+             result_df["HDSD"] = None
+
+             cols_to_drop = [col for col in result_df.columns if col.strip().lower().startswith("unnamed") or col.strip() == "a" or col == "STT"]
+             result_df = result_df.drop(columns=cols_to_drop, errors='ignore')
+
+             cols_to_replace = [col for col in result_df.columns if col not in ["Tóm tắt ưu điểm, tính năng", "Thông số kỹ thuật", "Nội dung Ưu điểm SP", "Ưu điểm"]]
+             result_df[cols_to_replace] = result_df[cols_to_replace].replace('\n', ' ', regex=True)
+
+             # Replace "none" values with None
+             result_df.loc[result_df["Thông số kỹ thuật"] == "none", "Thông số kỹ thuật"] = None
+             result_df.loc[result_df["Tóm tắt ưu điểm, tính năng"] == "none", "Tóm tắt ưu điểm, tính năng"] = None
+             result_df.loc[result_df["Tóm tắt TSKT"] == "none", "Tóm tắt TSKT"] = None
+             result_df.loc[result_df["Nội dung Ưu điểm SP"] == "none", "Nội dung Ưu điểm SP"] = None
+
+             result_df = result_df.map(lambda x: x.strip() if isinstance(x, str) else x)
+             result_df.drop_duplicates(subset=["Product_ID"], inplace=True)
+             result_df = self.data_normalization(result_df=result_df)
+             data = result_df.to_dict(orient="records")
+             data = self.convert_floats(data)
+             data = self.replace_nan_with_none(data)
+
+             # Process instructions based on processor type
+             if processor_type == "docling_with_ocr":
+                 data = self.process_instruction_with_tesseract(data)
+             else:
+                 data = self.process_instruction(data)
+
+             # Upload to MongoDB
+             if not mongo_handler.connect():
+                 return "❌ Failed to connect to MongoDB"
+
+             result = mongo_handler.upload_data(data, collection_name, upsert=True)
+             mongo_handler.disconnect()
+
+             if result.get("success"):
+                 return f"✅ Uploaded to MongoDB collection '{result['collection']}':\n" \
+                        f"  • Total records: {result['total']}\n" \
+                        f"  • Inserted: {result.get('inserted', 0)}\n" \
+                        f"  • Updated: {result.get('modified', 0)}"
+             else:
+                 return f"❌ MongoDB upload failed: {result.get('error', 'Unknown error')}"
+         else:
+             return f"❌ Data not found for key: {key_match}"
+
+     def convert_floats(self, obj):
+         if isinstance(obj, float) and obj.is_integer():
+             return int(obj)
+         elif isinstance(obj, list):
+             return [self.convert_floats(i) for i in obj]
+         elif isinstance(obj, dict):
+             return {k: self.convert_floats(v) for k, v in obj.items()}
+         else:
+             return obj
+
+     def strip_redundant_space(self, text):
+         cleaned_text = " ".join(text.strip().split())
+         return cleaned_text
+
+     def convert_tag_to_dict(self, tag_str: str) -> dict:
+         if not isinstance(tag_str, str) or not tag_str.strip().startswith("{"):
+             return {}
+
+         try:
+             fixed = re.sub(r'([{,]\s*)(\w+)\s*:', r'\1"\2":', tag_str)
+             raw_pairs = fixed.strip('{} ').split(',')
+             raw_pairs = [pair.strip() for pair in raw_pairs if pair.strip()]
+             result = {}
+
+             current_key = None
+             for pair in raw_pairs:
+                 if ':' in pair:
+                     key, value = pair.split(':', 1)
+                     key = key.strip().strip('"')
+                     value = value.strip()
+
+                     pattern = r',\s[A-Z]'
+                     match = re.search(pattern, value)
+                     if match:
+                         values = [v.strip() for v in value.split(',')]
+                     else:
+                         values = value
+                     result[key] = values
+                     current_key = key
+                 elif current_key:
+                     previous_value = result[current_key]
+                     if isinstance(previous_value, list):
+                         result[current_key].append(pair.strip())
+                     else:
+                         result[current_key] = [previous_value, pair.strip()]
+
+             return result
+
+         except Exception as e:
+             print(f"Error parse tag: {tag_str} -> {e}")
+             return {}
+
+     def convert_tags_to_numeric(self, tags_dict):
+         keys_to_convert = ["dung_tich", "cong_suat", "lo_khoet_tran", "so_cuc", "so_hat", "modules", "cuon_day", "kich_thuoc"]
+
+         new_tags = {}
+         for key, value in tags_dict.items():
+             if key in keys_to_convert:
+                 match = re.search(r'([\d.]+)', str(value))
+                 if match:
+                     num = float(match.group(1))
+                     new_tags[key] = int(num) if num.is_integer() else num
+                 else:
+                     new_tags[key] = value
+             else:
+                 new_tags[key] = value
+         return new_tags
+
+     def data_normalization(self, result_df):
+         if "Tags" in result_df.columns:
+             result_df["Tags"] = result_df["Tags"].astype(str).str.lower().apply(self.convert_tag_to_dict)
+             result_df["Tags"] = result_df["Tags"].apply(self.convert_tags_to_numeric)
+
+         if "Giá" in result_df.columns:
+             result_df["Giá"] = result_df["Giá"].apply(lambda x: "Liên hệ" if x == 0 else x)
+
+         if "Tên sản phẩm" in result_df.columns:
+             result_df["Tên sản phẩm"] = result_df["Tên sản phẩm"].apply(self.strip_redundant_space)
+
+         for col_name in result_df.columns:
+             if col_name in ["Tóm tắt TSKT", "Thông số kỹ thuật"]:
+                 result_df[col_name] = result_df[col_name].astype(str).str.lower().str.strip()
+
+         return result_df
+
+     def replace_nan_with_none(self, obj):
+         if isinstance(obj, float) and math.isnan(obj):
+             return None
+         elif isinstance(obj, dict):
+             return {k: self.replace_nan_with_none(v) for k, v in obj.items()}
+         elif isinstance(obj, list):
+             return [self.replace_nan_with_none(i) for i in obj]
+         else:
+             return obj
+
+     @staticmethod
+     def download_pdf_with_retry(url, max_retries=3, timeout=30):
+         """Download PDF with retry logic and better error handling"""
+         for attempt in range(max_retries):
+             try:
+                 print(f"Downloading PDF (attempt {attempt + 1}/{max_retries})...")
+
+                 session = requests.Session()
+                 session.headers.update({
+                     'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
+                 })
+
+                 response = session.get(url, stream=True, timeout=timeout)
+                 response.raise_for_status()
+
+                 content_length = response.headers.get('content-length')
+                 if content_length:
+                     print(f"Expected file size: {int(content_length):,} bytes")
+
+                 content = b''
+                 chunk_size = 8192
+                 downloaded = 0
+
+                 for chunk in response.iter_content(chunk_size=chunk_size):
+                     if chunk:
+                         content += chunk
+                         downloaded += len(chunk)
+
+                 print(f"\nDownload completed: {len(content):,} bytes")
+                 return content
+
+             except (requests.exceptions.RequestException, IncompleteRead, ConnectionError) as e:
+                 print(f"Download attempt {attempt + 1} failed: {e}")
+                 if attempt < max_retries - 1:
+                     wait_time = 2 ** attempt
+                     print(f"Waiting {wait_time} seconds before retry...")
+                     time.sleep(wait_time)
+                 else:
+                     print("All download attempts failed")
+                     raise e
+
+     @staticmethod
+     def process_pdf_with_docling(url):
+         """Process PDF from URL using Docling for better structure extraction"""
+         try:
+             pdf_content = DataProcessing.download_pdf_with_retry(url)
+
+             with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as tmp_file:
+                 tmp_file.write(pdf_content)
+                 tmp_path = tmp_file.name
+
+             print(f"PDF saved to temporary file: {tmp_path}")
+
+             pipeline_options = PdfPipelineOptions()
+             pipeline_options.do_ocr = False
+             pipeline_options.do_table_structure = False
+
+             converter = DocumentConverter(
+                 format_options={
+                     InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)
+                 }
+             )
+
+             print("Converting document with Docling...")
+             result = converter.convert(tmp_path)
+
+             os.unlink(tmp_path)
+             print("Temporary file cleaned up")
+
+             return result
+
+         except Exception as e:
+             print(f"Error processing PDF with Docling from URL {url}: {e}")
+             return None
+
+     @staticmethod
+     def extract_content_from_docling_result(docling_result):
+         """Extract content from Docling result in a more robust way"""
+         if not docling_result:
+             return None
+
+         try:
+             doc = docling_result.document
+
+             try:
+                 markdown_content = doc.export_to_markdown()
+                 return {'markdown': markdown_content}
+             except Exception as e:
+                 print(f"Markdown export failed: {e}")
+
+             if hasattr(doc, 'main_text'):
+                 return {'text': doc.main_text}
+
+             if hasattr(doc, 'body') and doc.body:
+                 content = []
+                 for element in doc.body:
+                     content.append(str(element))
+                 return {'text': '\n'.join(content)}
+
+             if hasattr(doc, 'elements') and doc.elements:
+                 content = []
+                 for element in doc.elements:
+                     content.append(str(element))
+                 return {'text': '\n'.join(content)}
+
+             return {'error': 'No accessible content found'}
+
+         except Exception as e:
+             return {'error': f"Error extracting content: {e}"}
+
+     def process_instruction(self, data):
+         """Extract user-manual (HDSD) content for each product"""
+         tmp_data = data[:]
+         for item in tmp_data:
+             instruction_url = item.get("Link file HDSD", None)
+             if not instruction_url:
+                 print("No instruction URL found, skipping...")
+                 item["HDSD"] = ""
+                 continue
+
+             if "https://" not in instruction_url and "http://" not in instruction_url:
+                 print("Wrong URL, but has instruction info")
+                 item["HDSD"] = instruction_url
+                 continue
+
+             if "hdsd" not in instruction_url or "Khong" in instruction_url:
+                 print("invalid instruction url/content")
+                 item["HDSD"] = ""
+                 continue
+
+             raw_result = DataProcessing.process_pdf_with_docling(instruction_url)
+             if raw_result:
+                 extract_result = DataProcessing.extract_content_from_docling_result(raw_result)
+                 if 'markdown' in extract_result.keys():
+                     item["HDSD"] = re.sub(r"<!--\s*image\s*-->", '', extract_result['markdown'], flags=re.IGNORECASE).strip()
+                 elif 'text' in extract_result.keys():
+                     item["HDSD"] = re.sub(r"<!--\s*image\s*-->", '', extract_result['text'], flags=re.IGNORECASE).strip()
+
+         return tmp_data
+
+
+ def process_single_category(excel_path, category_name, processor_type,
+                             mongo_connection, mongo_database,
+                             progress=gr.Progress()):
+     """Process a single product category and upload to MongoDB"""
+
+     if excel_path is None:
+         return "❌ Please upload an Excel file first"
+
+     # Category mapping
+     category_mapping = {
+         "Sản phẩm nhà thông minh": ("Sản phẩm nhà thông minh", "sp_nha_thong_minh"),
+         "Đèn LED": ("Đèn LED", "sp_chieu_sang"),
+         "Chiếu sáng chuyên dụng": ("Chiếu sáng chuyên dụng", "sp_chuyen_dung"),
+         "Thiết bị điện": ("Thiết bị điện", "sp_thiet_bi_dien"),
+         "Phích nước": ("Phích nước", "sp_phich_nuoc"),
+     }
+
+     if category_name not in category_mapping:
+         return f"❌ Unknown category: {category_name}"
+
+     key_match, collection_name = category_mapping[category_name]
+
+     try:
+         progress(0.1, desc="Initializing data processor...")
+         dp = DataProcessing()
+
+         # Initialize MongoDB handler
+         mongo_handler = MongoDBHandler(
+             connection_string=mongo_connection if mongo_connection else None,
+             database_name=mongo_database if mongo_database else "product_database"
+         )
+
+         progress(0.3, desc=f"Processing {category_name} with {processor_type}...")
+         result = dp.get_data_from_excel_file(
+             excel_path=excel_path,
+             key_match=key_match,
+             collection_name=collection_name,
+             processor_type=processor_type,
+             mongo_handler=mongo_handler
+         )
+
+         progress(1.0, desc="Processing completed!")
+         return result
+
+     except Exception as e:
+         return f"❌ Error processing {category_name}: {str(e)}"
+
+
+ def process_all_categories(excel_path, processor_type,
+                            mongo_connection, mongo_database, progress=gr.Progress()):
+     """Process all product categories and upload to MongoDB"""
+     if excel_path is None:
+         return "❌ Please upload an Excel file first"
+
+     categories = [
+         "Sản phẩm nhà thông minh",
+         "Đèn LED",
+         "Chiếu sáng chuyên dụng",
+         "Thiết bị điện",
+         "Phích nước"
+     ]
+
+     results = []
+     total_categories = len(categories)
+
+     for i, category in enumerate(categories):
+         progress((i + 1) / total_categories, desc=f"Processing {category}...")
+         result = process_single_category(
+             excel_path, category, processor_type,
+             mongo_connection, mongo_database
+         )
+         results.append(f"{category}: {result}")
+
+     return "\n".join(results)
+
+
+ def test_mongo_connection(connection_string, database_name):
+     """Test MongoDB connection"""
+     if not connection_string:
+         connection_string = "mongodb://localhost:27017/"
+     if not database_name:
+         database_name = "product_database"
+
+     handler = MongoDBHandler(connection_string, database_name)
+     return handler.test_connection()
+
+
+ def create_processing_interface():
+     """Create Gradio interface with MongoDB-only storage"""
+     with gr.Blocks(title="Data Processing - Product Metadata Extractor") as demo:
+         gr.Markdown("# 📊 Product Data Processing")
+         gr.Markdown("Extract and process product metadata from Excel files and upload to MongoDB")
+
+         with gr.Row():
+             with gr.Column(scale=1):
+                 gr.Markdown("### 📤 Upload Excel File")
+                 excel_upload = gr.File(
+                     label="Upload Excel File",
+                     file_types=[".xlsx", ".xls"],
+                     type="filepath"
+                 )
+
+                 gr.Markdown("### ⚙️ Processing Settings")
+                 processor_dropdown = gr.Dropdown(
+                     choices=["docling"],
+                     value="docling",
+                     label="PDF Processor Type",
+                     info="Using basic docling for fast processing"
+                 )
+
+                 category_dropdown = gr.Dropdown(
+                     choices=[
+                         "Sản phẩm nhà thông minh",
+                         "Đèn LED",
+                         "Chiếu sáng chuyên dụng",
+                         "Thiết bị điện",
+                         "Phích nước"
+                     ],
+                     value="Sản phẩm nhà thông minh",
+                     label="Product Category",
+                     info="Select which product category to process"
+                 )
+
+                 gr.Markdown("### 🗄️ MongoDB Configuration")
+                 mongo_connection = gr.Textbox(
+                     label="MongoDB Connection String",
+                     placeholder="mongodb+srv://<username>:<password>@cluster.mongodb.net/?retryWrites=true&w=majority",
+                     value=MONGODB_URI,
+                     info="MongoDB connection string"
+                 )
+                 mongo_database = gr.Textbox(
+                     label="Database Name",
+                     placeholder="product_database",
+                     value="product_database",
+                     info="Name of the MongoDB database"
+                 )
+                 test_connection_btn = gr.Button("🔌 Test Connection", size="sm")
+                 connection_status = gr.Textbox(
+                     label="Connection Status",
+                     interactive=False,
+                     lines=3
+                 )
+
+             with gr.Column(scale=2):
+                 output_box = gr.Textbox(
+                     lines=15,
+                     label="📋 Processing Log",
+                     placeholder="Processing results will appear here..."
+                 )
+
+                 gr.Markdown("### 🚀 Actions")
+                 with gr.Row():
+                     process_single_btn = gr.Button("🔄 Process Selected Category", variant="primary")
+                     process_all_btn = gr.Button("🔄 Process All Categories", variant="secondary")
+
+                 gr.Markdown("### 📖 Information")
+                 with gr.Accordion("MongoDB Collections", open=False):
+                     gr.Markdown("""
+                     **📦 Collections**:
+                     - `sp_nha_thong_minh` - Sản phẩm nhà thông minh
+                     - `sp_chieu_sang` - Đèn LED
+                     - `sp_chuyen_dung` - Chiếu sáng chuyên dụng
+                     - `sp_thiet_bi_dien` - Thiết bị điện
+                     - `sp_phich_nuoc` - Phích nước
+
+                     **🔄 Upsert Logic**:
+                     - Existing records are updated based on `Product_ID`
+                     - New records are inserted automatically
+                     - Timestamps `_created_at` and `_updated_at` are managed automatically
+                     """)
+
+                 with gr.Accordion("Processor Types", open=False):
+                     gr.Markdown("""
+                     **🔹 docling**: Basic PDF text extraction
+                     - Fast processing
+                     - Good for text-based PDFs
+                     - No OCR capabilities
+                     """)
+
+         # Event handlers
+         test_connection_btn.click(
+             fn=test_mongo_connection,
+             inputs=[mongo_connection, mongo_database],
+             outputs=[connection_status]
+         )
+
+         process_single_btn.click(
+             fn=process_single_category,
+             inputs=[
+                 excel_upload, category_dropdown, processor_dropdown,
+                 mongo_connection, mongo_database
+             ],
+             outputs=output_box,
+             show_progress=True
+         )
+
+         process_all_btn.click(
+             fn=process_all_categories,
+             inputs=[
+                 excel_upload, processor_dropdown,
+                 mongo_connection, mongo_database
+             ],
+             outputs=output_box,
+             show_progress=True
+         )
+
+     return demo
+
+
+ if __name__ == "__main__":
+     demo = create_processing_interface()
+     demo.launch(share=False, server_name="localhost", server_port=7860)
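
Note: MongoDBHandler.upload_data upserts records keyed on Product_ID with a single bulk_write of UpdateOne operations. A minimal standalone sketch of that upsert pattern (the URI, database and collection names here are placeholders) is:

from pymongo import MongoClient, UpdateOne

def upsert_products(uri: str, db_name: str, collection_name: str, records: list) -> dict:
    # One UpdateOne per record, keyed on Product_ID, applied in a single bulk_write.
    client = MongoClient(uri, serverSelectionTimeoutMS=5000)
    try:
        coll = client[db_name][collection_name]
        operations = [
            UpdateOne({"Product_ID": r["Product_ID"]}, {"$set": r}, upsert=True)
            for r in records
            if r.get("Product_ID")
        ]
        if not operations:
            return {"error": "No valid product IDs found"}
        result = coll.bulk_write(operations)
        return {
            "inserted": result.upserted_count,
            "modified": result.modified_count,
            "matched": result.matched_count,
        }
    finally:
        client.close()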