anhkhoiphan committed
Commit a519263 · 1 Parent(s): 4e3aadb

Add data processing

Files changed (4)
  1. .gitignore +1 -0
  2. app.py +70 -3
  3. data_indexing.py +235 -156
  4. data_processing.py +697 -0
.gitignore CHANGED
@@ -1,2 +1,3 @@
  /venv
  .env
+ __pycache__/
app.py CHANGED
@@ -1,4 +1,71 @@
- from data_indexing import create_gradio_interface
-
- demo = create_gradio_interface()
- demo.launch()
+ import sys
+ import os
+
+ import gradio as gr
+
+ # Import your interfaces
+ from data_indexing import create_indexing_interface
+ from data_processing import create_processing_interface
+
+ def create_combined_app():
+     """Create combined app with two tabs"""
+
+     with gr.Blocks(
+         title="Rạng Đông Data Management System",
+         theme=gr.themes.Soft()
+     ) as app:
+
+         gr.Markdown("""
+         # 🏢 Rạng Đông Data Management System
+
+         Hệ thống quản lý dữ liệu sản phẩm và giải pháp của Rạng Đông
+         """)
+
+         with gr.Tabs() as tabs:
+             # Tab 1: Vector Indexing (MongoDB to Qdrant)
+             with gr.Tab("🗄️ Vector Indexing", id="indexing"):
+                 gr.Markdown("""
+                 ## Indexing dữ liệu từ MongoDB lên Qdrant
+
+                 Tạo vector embeddings và index dữ liệu từ MongoDB lên Qdrant Vector Database
+                 """)
+
+                 # Create indexing interface
+                 indexing_interface = create_indexing_interface()
+
+             # Tab 2: Data Processing (Excel to MongoDB)
+             with gr.Tab("📊 Data Processing", id="processing"):
+                 gr.Markdown("""
+                 ## Xử lý dữ liệu từ Excel lên MongoDB
+
+                 Upload file Excel, xử lý dữ liệu sản phẩm và đẩy lên MongoDB Atlas
+                 """)
+
+                 # Create processing interface
+                 processing_interface = create_processing_interface()
+
+         gr.Markdown("""
+         ---
+         ### 📖 Hướng dẫn sử dụng
+
+         **Bước 1: Data Processing**
+         1. Upload file Excel chứa dữ liệu sản phẩm (product_Metadata.xlsx)
+         2. Cấu hình MongoDB connection string, database name và test connection
+         3. Chọn loại sản phẩm hoặc xử lý tất cả
+
+         **Bước 2: Vector Indexing**
+         1. Chọn collection cần indexing
+         2. Hệ thống sẽ tạo embeddings và đẩy lên Qdrant
+         """)
+
+     return app
+
+
+ if __name__ == "__main__":
+     app = create_combined_app()
+
+     # Launch with appropriate settings for Hugging Face Spaces
+     app.launch()
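
Note: the combined app mounts two existing Gradio Blocks apps as tabs by calling their factory functions inside gr.Tab contexts. A minimal sketch of that pattern is below, assuming each factory returns a gr.Blocks; the builder names are hypothetical stand-ins for create_indexing_interface / create_processing_interface, and gr.Blocks.render() is shown as one way to attach an already-built Blocks to the enclosing app.

import gradio as gr

def build_indexing_tab() -> gr.Blocks:
    # Hypothetical stand-in for create_indexing_interface()
    with gr.Blocks() as block:
        gr.Markdown("Indexing controls go here")
    return block

def build_processing_tab() -> gr.Blocks:
    # Hypothetical stand-in for create_processing_interface()
    with gr.Blocks() as block:
        gr.Markdown("Processing controls go here")
    return block

with gr.Blocks(title="Combined demo") as app:
    with gr.Tabs():
        with gr.Tab("Indexing"):
            build_indexing_tab().render()   # render the prebuilt Blocks inside this tab
        with gr.Tab("Processing"):
            build_processing_tab().render()

if __name__ == "__main__":
    app.launch()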
data_indexing.py CHANGED
@@ -7,11 +7,6 @@ import sys
  from typing import List, Dict, Tuple, Any, Optional
  import uuid
 
- # Add project root to Python path
- project_root = os.path.abspath(os.path.join(os.path.dirname(__file__), '..', '..'))
- if project_root not in sys.path:
-     sys.path.insert(0, project_root)
-
  from PIL import Image
  from FlagEmbedding import BGEM3FlagModel
  import gradio as gr
@@ -22,6 +17,7 @@ from qdrant_client.http.models import Modifier, Distance, SparseVectorParams, Ve
  import torch
  from transformers import EfficientNetModel, AutoImageProcessor
  from pymongo import MongoClient
+ import contextlib
 
  from config import (
      QDRANT_COLLECTION_NAME_SPCHIEUSANG,
@@ -151,6 +147,26 @@ mongodb_solution_collections = {
      "nha_o_xa_hoi": "gp_nha_o_xa_hoi"
  }
 
+ class OutputCapture:
+     """Context manager to capture stdout and stderr"""
+     def __init__(self):
+         self.buffer = io.StringIO()
+         self.old_stdout = None
+         self.old_stderr = None
+
+     def __enter__(self):
+         self.old_stdout = sys.stdout
+         self.old_stderr = sys.stderr
+         sys.stdout = self.buffer
+         sys.stderr = self.buffer
+         return self.buffer
+
+     def __exit__(self, *args):
+         sys.stdout = self.old_stdout
+         sys.stderr = self.old_stderr
+
+     def getvalue(self):
+         return self.buffer.getvalue()
 
  """=================MONGODB CONNECTION========================"""
  class MongoDBConnection:
@@ -686,48 +702,72 @@ class ProductIndexing:
              reload: Whether to recreate collections
              hybrid_mode: Whether to use hybrid text embedding (BGEM3)
          """
-         if reload:
-             try:
-                 for collection in product_collections:
-                     self.client.recreate_collection(
-                         collection_name=collection,
-                         vectors_config=product_vectors_config,
-                         sparse_vectors_config=sparse_vectors_config
-                     )
-                 print("All product collections recreated.")
-             except Exception as e:
-                 print(f"Error while recreating collections: {e}")
-                 return
-
-         # Setup MongoDB connection
-         if not self.mongodb_conn:
-             if not self.setup_mongodb():
-                 print("❌ Failed to connect to MongoDB. Aborting indexing.")
-                 return
-
-         # Create embedding processor
-         embed_object = ProductEmbedding()
-
-         for collection, product_type in zip(product_collections, product_types):
-             print(f"\n🔄 Processing {product_type} data from MongoDB...")
-
-             # Generate embeddings for specific product type
-             embeddings = embed_object.run_embedding(
-                 product_type=product_type,
-                 mongodb_conn=self.mongodb_conn,
-                 hybrid_mode=hybrid_mode
-             )
-
-             # Index embeddings to specific collection
-             self.index(embeddings, collection)
-             self._create_payload_indexes_for_product_type(product_type, collection)
-
-         # Close MongoDB connection
-         if self.mongodb_conn:
-             self.mongodb_conn.close()
-             self.mongodb_conn = None
+         with OutputCapture() as output:
+             try:
+                 if reload:
+                     try:
+                         for collection in product_collections:
+                             self.client.recreate_collection(
+                                 collection_name=collection,
+                                 vectors_config=product_vectors_config,
+                                 sparse_vectors_config=sparse_vectors_config
+                             )
+                         print("✅ All product collections recreated.")
+                     except Exception as e:
+                         print(f"❌ Error while recreating collections: {e}")
+                         return output.getvalue()
+
+                 # Setup MongoDB connection
+                 if not self.mongodb_conn:
+                     if not self.setup_mongodb():
+                         print("❌ Failed to connect to MongoDB. Aborting indexing.")
+                         return output.getvalue()
+
+                 # Create embedding processor
+                 embed_object = ProductEmbedding()
+
+                 for collection, product_type in zip(product_collections, product_types):
+                     print(f"\n{'='*60}")
+                     print(f"🔄 Processing {product_type} data from MongoDB...")
+                     print(f"{'='*60}")
+
+                     try:
+                         # Generate embeddings for specific product type
+                         embeddings = embed_object.run_embedding(
+                             product_type=product_type,
+                             mongodb_conn=self.mongodb_conn,
+                             hybrid_mode=hybrid_mode
+                         )
+
+                         # Index embeddings to specific collection
+                         self.index(embeddings, collection)
+                         self._create_payload_indexes_for_product_type(product_type, collection)
+
+                         print(f"✅ Completed indexing for {product_type}")
+
+                     except Exception as e:
+                         print(f"❌ Error indexing {product_type}: {e}")
+                         import traceback
+                         print(traceback.format_exc())
+
+                 # Close MongoDB connection
+                 if self.mongodb_conn:
+                     self.mongodb_conn.close()
+                     self.mongodb_conn = None
+
+                 print(f"\n{'='*60}")
+                 print("🎉 All indexing completed!")
+                 print(f"{'='*60}")
+
+             except Exception as e:
+                 print(f" Fatal error during indexing: {e}")
+                 import traceback
+                 print(traceback.format_exc())
+
+             return output.getvalue()
 
-     def indexing_single_product_type(self, product_type: str, collection_name: str, hybrid_mode: bool = True) -> str:
+     def indexing_single_product_type(self, product_type: str, collection_name: str,
+                                      hybrid_mode: bool = True) -> str:
          """
          Indexing a single product group into its Qdrant collection from MongoDB
          Args:
@@ -735,46 +775,56 @@ class ProductIndexing:
              collection_name: Qdrant collection name
              hybrid_mode: Whether to use hybrid text embedding (BGEM3)
          """
-         buffer = io.StringIO()
-         sys.stdout = buffer
-
-         try:
-             self.client.recreate_collection(
-                 collection_name=collection_name,
-                 vectors_config=product_vectors_config,
-                 sparse_vectors_config=sparse_vectors_config
-             )
-             print(f"Collection {collection_name} created")
-
-             # Setup MongoDB connection
-             if not self.mongodb_conn:
-                 if not self.setup_mongodb():
-                     print("❌ Failed to connect to MongoDB")
-                     sys.stdout = sys.__stdout__
-                     return buffer.getvalue()
-
-             # Create embedding processor
-             embed_object = ProductEmbedding()
-
-             print(f"\n🔄 Processing {product_type} data from MongoDB...")
-             embeddings = embed_object.run_embedding(
-                 product_type=product_type,
-                 mongodb_conn=self.mongodb_conn,
-                 hybrid_mode=hybrid_mode
-             )
-             self.index(embeddings, collection_name)
-
-             # Close MongoDB connection
-             if self.mongodb_conn:
-                 self.mongodb_conn.close()
-                 self.mongodb_conn = None
-
-         except Exception as e:
-             print(f"Error while indexing product type {product_type}: {e}")
-
-         self._create_payload_indexes_for_product_type(product_type, collection_name)
-         sys.stdout = sys.__stdout__
-         return buffer.getvalue()
+         with OutputCapture() as output:
+             try:
+                 print(f"{'='*60}")
+                 print(f"🚀 Starting indexing for {product_type}")
+                 print(f"{'='*60}\n")
+
+                 self.client.recreate_collection(
+                     collection_name=collection_name,
+                     vectors_config=product_vectors_config,
+                     sparse_vectors_config=sparse_vectors_config
+                 )
+                 print(f"✅ Collection {collection_name} created\n")
+
+                 # Setup MongoDB connection
+                 if not self.mongodb_conn:
+                     if not self.setup_mongodb():
+                         print("❌ Failed to connect to MongoDB")
+                         return output.getvalue()
+
+                 # Create embedding processor
+                 embed_object = ProductEmbedding()
+
+                 print(f"🔄 Processing {product_type} data from MongoDB...")
+                 embeddings = embed_object.run_embedding(
+                     product_type=product_type,
+                     mongodb_conn=self.mongodb_conn,
+                     hybrid_mode=hybrid_mode
+                 )
+
+                 print(f"\n📊 Indexing to Qdrant...")
+                 self.index(embeddings, collection_name)
+
+                 print(f"\n🔍 Creating payload indexes...")
+                 self._create_payload_indexes_for_product_type(product_type, collection_name)
+
+                 # Close MongoDB connection
+                 if self.mongodb_conn:
+                     self.mongodb_conn.close()
+                     self.mongodb_conn = None
+
+                 print(f"\n{'='*60}")
+                 print(f"✅ Successfully completed indexing for {product_type}")
+                 print(f"{'='*60}")
+
+             except Exception as e:
+                 print(f"Error while indexing product type {product_type}: {e}")
+                 import traceback
+                 print(traceback.format_exc())
+
+             return output.getvalue()
 
      def _create_payload_indexes_for_product_type(self, product_type: str, collection_name: str):
          """Create payload indexes based on product type field schemas"""
@@ -1035,83 +1085,113 @@ class SolutionIndexing:
 
      def run_indexing(self, reload: bool = True):
          """Index all solution data from MongoDB into Qdrant collections."""
-         if reload:
-             try:
-                 for collection in solution_collections:
-                     self.client.recreate_collection(
-                         collection_name=collection,
-                         vectors_config=qdrant_client.http.models.VectorParams(
-                             size=768,
-                             distance=qdrant_client.http.models.Distance.COSINE,
-                         )
-                     )
-                 print("All solution collections recreated.")
-             except Exception as e:
-                 print(f"Error while recreating collections: {e}")
-                 return
-
-         # Setup MongoDB connection
-         if not self.mongodb_conn:
-             if not self.setup_mongodb():
-                 print("❌ Failed to connect to MongoDB. Aborting indexing.")
-                 return
-
-         # Create embedding processor
-         embed_object = SolutionEmbedding()
-
-         for collection, solution_type in zip(solution_collections, solution_types):
-             print(f"\n🔄 Processing {solution_type} data from MongoDB...")
-             embeddings = embed_object.run_embedding(solution_type, self.mongodb_conn)
-             self.index(embeddings, collection)
-
-         # Close MongoDB connection
-         if self.mongodb_conn:
-             self.mongodb_conn.close()
-             self.mongodb_conn = None
+         with OutputCapture() as output:
+             try:
+                 if reload:
+                     try:
+                         for collection in solution_collections:
+                             self.client.recreate_collection(
+                                 collection_name=collection,
+                                 vectors_config=qdrant_client.http.models.VectorParams(
+                                     size=768,
+                                     distance=qdrant_client.http.models.Distance.COSINE,
+                                 )
+                             )
+                         print("✅ All solution collections recreated.")
+                     except Exception as e:
+                         print(f"❌ Error while recreating collections: {e}")
+                         return output.getvalue()
+
+                 # Setup MongoDB connection
+                 if not self.mongodb_conn:
+                     if not self.setup_mongodb():
+                         print("❌ Failed to connect to MongoDB. Aborting indexing.")
+                         return output.getvalue()
+
+                 # Create embedding processor
+                 embed_object = SolutionEmbedding()
+
+                 for collection, solution_type in zip(solution_collections, solution_types):
+                     print(f"\n{'='*60}")
+                     print(f"🔄 Processing {solution_type} data from MongoDB...")
+                     print(f"{'='*60}")
+
+                     try:
+                         embeddings = embed_object.run_embedding(solution_type, self.mongodb_conn)
+                         self.index(embeddings, collection)
+                         print(f"✅ Completed indexing for {solution_type}")
+                     except Exception as e:
+                         print(f"❌ Error indexing {solution_type}: {e}")
+                         import traceback
+                         print(traceback.format_exc())
+
+                 # Close MongoDB connection
+                 if self.mongodb_conn:
+                     self.mongodb_conn.close()
+                     self.mongodb_conn = None
+
+                 print(f"\n{'='*60}")
+                 print("🎉 All solution indexing completed!")
+                 print(f"{'='*60}")
+
+             except Exception as e:
+                 print(f" Fatal error during indexing: {e}")
+                 import traceback
+                 print(traceback.format_exc())
+
+             return output.getvalue()
 
      def indexing_single_solution(self, solution: str, collection_name: str) -> str:
          """Indexing a single solution into its Qdrant collection from MongoDB"""
-         buffer = io.StringIO()
-         sys.stdout = buffer
-
-         try:
-             self.client.recreate_collection(
-                 collection_name=collection_name,
-                 vectors_config=qdrant_client.http.models.VectorParams(
-                     size=768,
-                     distance=qdrant_client.http.models.Distance.COSINE,
-                 )
-             )
-             print(f"Collection {collection_name} created")
-
-             # Setup MongoDB connection
-             if not self.mongodb_conn:
-                 if not self.setup_mongodb():
-                     print("❌ Failed to connect to MongoDB")
-                     sys.stdout = sys.__stdout__
-                     return buffer.getvalue()
-
-             # Create embedding processor
-             embed_object = SolutionEmbedding()
-
-             print(f"\n🔄 Processing {solution} data from MongoDB...")
-             embeddings = embed_object.run_embedding(solution, self.mongodb_conn)
-             self.index(embeddings, collection_name)
-
-             # Close MongoDB connection
-             if self.mongodb_conn:
-                 self.mongodb_conn.close()
-                 self.mongodb_conn = None
-
-         except Exception as e:
-             print(f"Error while recreating collection and indexing solution {solution}: {e}")
-
-         sys.stdout = sys.__stdout__
-         return buffer.getvalue()
+         with OutputCapture() as output:
+             try:
+                 print(f"{'='*60}")
+                 print(f"🚀 Starting indexing for {solution}")
+                 print(f"{'='*60}\n")
+
+                 self.client.recreate_collection(
+                     collection_name=collection_name,
+                     vectors_config=qdrant_client.http.models.VectorParams(
+                         size=768,
+                         distance=qdrant_client.http.models.Distance.COSINE,
+                     )
+                 )
+                 print(f"✅ Collection {collection_name} created\n")
+
+                 # Setup MongoDB connection
+                 if not self.mongodb_conn:
+                     if not self.setup_mongodb():
+                         print("❌ Failed to connect to MongoDB")
+                         return output.getvalue()
+
+                 # Create embedding processor
+                 embed_object = SolutionEmbedding()
+
+                 print(f"🔄 Processing {solution} data from MongoDB...")
+                 embeddings = embed_object.run_embedding(solution, self.mongodb_conn)
+
+                 print(f"\n📊 Indexing to Qdrant...")
+                 self.index(embeddings, collection_name)
+
+                 # Close MongoDB connection
+                 if self.mongodb_conn:
+                     self.mongodb_conn.close()
+                     self.mongodb_conn = None
+
+                 print(f"\n{'='*60}")
+                 print(f"✅ Successfully completed indexing for {solution}")
+                 print(f"{'='*60}")
+
+             except Exception as e:
+                 print(f"❌ Error while indexing solution {solution}: {e}")
+                 import traceback
+                 print(traceback.format_exc())
+
+             return output.getvalue()
 
 
  """=================GRADIO UI========================"""
- def create_gradio_interface():
+ def create_indexing_interface():
      """Create Gradio interface for indexing from MongoDB"""
      product_indexing = ProductIndexing()
      solution_indexing = SolutionIndexing()
@@ -1120,7 +1200,13 @@ def create_gradio_interface():
      gr.Markdown("# 🗄️ Qdrant Data Indexing System (MongoDB)")
      gr.Markdown("Recreate Qdrant Collections and Index Data from MongoDB Atlas")
 
-     output_box = gr.Textbox(lines=15, label="📋 Logs", interactive=False)
+     output_box = gr.Textbox(
+         lines=20,
+         label="📋 Logs",
+         interactive=False,
+         show_copy_button=True,
+         max_lines=30
+     )
 
      gr.Markdown("---")
      gr.Markdown("## 🏢 Giải pháp (Solutions)")
@@ -1223,20 +1309,13 @@ def create_gradio_interface():
          inputs=[gr.State("thiet_bi_dien"), gr.State(QDRANT_COLLECTION_NAME_SPTHIETBIDIEN), gr.State(True)],
          outputs=output_box)
 
-     def index_all_products():
-         buffer = io.StringIO()
-         sys.stdout = buffer
-         product_indexing.run_indexing(reload=True, hybrid_mode=True)
-         sys.stdout = sys.__stdout__
-         return buffer.getvalue()
-
      btn_all_products.click(
-         index_all_products,
+         product_indexing.run_indexing,
+         inputs=[gr.State(True), gr.State(True)],
          outputs=output_box)
 
      return demo
 
-
  if __name__ == "__main__":
-     demo = create_gradio_interface()
+     demo = create_indexing_interface()
      demo.launch()
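
Note: the indexing methods now return their logs as a string for the Gradio Textbox instead of printing to the console; OutputCapture does this by temporarily pointing sys.stdout and sys.stderr at an io.StringIO buffer. A minimal sketch of the same idea using only the standard library (contextlib.redirect_stdout / redirect_stderr; the diff imports contextlib but defines its own capture class) is:

import contextlib
import io

def run_with_captured_logs() -> str:
    # Everything printed inside the block ends up in the buffer,
    # which can then be returned as the value of a gr.Textbox output.
    buffer = io.StringIO()
    with contextlib.redirect_stdout(buffer), contextlib.redirect_stderr(buffer):
        print("🔄 Processing ...")
        print("✅ Done")
    return buffer.getvalue()

if __name__ == "__main__":
    print(run_with_captured_logs(), end="")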
data_processing.py ADDED
@@ -0,0 +1,697 @@
+ import sys
+ import os
+ from pathlib import Path
+
+ import requests
+ import re
+ import tempfile
+ import json
+ import math
+ import time
+ import warnings
+ from typing import Dict, List
+ from urllib3.exceptions import IncompleteRead
+ from datetime import datetime
+
+ import docling
+ from docling.document_converter import DocumentConverter, PdfFormatOption
+ from docling.datamodel.base_models import InputFormat
+ from docling.datamodel.pipeline_options import PdfPipelineOptions
+ from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
+ import pandas as pd
+ import gradio as gr
+ from pymongo import MongoClient, UpdateOne
+ from pymongo.errors import ConnectionFailure, OperationFailure
+
+ from data_helper import *
+ from config import MONGODB_URI
+
+ # Suppress PyTorch DataLoader pin_memory warning on MPS
+ warnings.filterwarnings("ignore", message=".*pin_memory.*not supported on MPS.*")
+
+ class MongoDBHandler:
+     """Handler for MongoDB operations"""
+
+     def __init__(self, connection_string: str = None, database_name: str = "product_database"):
+         """
+         Initialize MongoDB connection
+
+         Args:
+             connection_string: MongoDB connection string (default: localhost)
+             database_name: Name of the database to use
+         """
+         if connection_string is None:
+             connection_string = "mongodb://localhost:27017/"
+
+         self.connection_string = connection_string
+         self.database_name = database_name
+         self.client = None
+         self.db = None
+
+     def connect(self):
+         """Establish connection to MongoDB"""
+         try:
+             self.client = MongoClient(self.connection_string, serverSelectionTimeoutMS=5000)
+             # Test connection
+             self.client.admin.command('ping')
+             self.db = self.client[self.database_name]
+             print(f"✅ Connected to MongoDB database: {self.database_name}")
+             return True
+         except ConnectionFailure as e:
+             print(f"❌ Failed to connect to MongoDB: {e}")
+             return False
+         except Exception as e:
+             print(f"❌ Unexpected error connecting to MongoDB: {e}")
+             return False
+
+     def disconnect(self):
+         """Close MongoDB connection"""
+         if self.client is not None:
+             self.client.close()
+             print("🔌 Disconnected from MongoDB")
+
+     def get_collection_name(self, category: str) -> str:
+         """Map category name to collection name"""
+         collection_mapping = {
+             "Sản phẩm nhà thông minh": "sp_nha_thong_minh",
+             "Đèn LED": "sp_chieu_sang",
+             "Chiếu sáng chuyên dụng": "sp_chuyen_dung",
+             "Thiết bị điện": "sp_thiet_bi_dien",
+             "Phích nước": "sp_phich_nuoc",
+         }
+         return collection_mapping.get(category, "unknown_products")
+
+     def upload_data(self, data: List[Dict], collection_name: str, upsert: bool = True) -> Dict:
+         """
+         Upload data to MongoDB collection
+
+         Args:
+             data: List of product dictionaries
+             collection_name: Name of the collection
+             upsert: If True, update existing documents or insert new ones
+
+         Returns:
+             Dictionary with upload statistics
+         """
+         if self.db is None:
+             return {"success": False, "error": "Not connected to database"}
+
+         if not data:
+             return {"success": False, "error": "No data to upload"}
+
+         try:
+             collection = self.db[collection_name]
+
+             # Add metadata
+             timestamp = datetime.utcnow()
+             for item in data:
+                 item['_updated_at'] = timestamp
+                 if '_created_at' not in item:
+                     item['_created_at'] = timestamp
+
+             if upsert:
+                 # Use bulk write with upsert for better performance
+                 operations = []
+                 for item in data:
+                     product_id = item.get('Product_ID')
+                     if product_id:
+                         operations.append(
+                             UpdateOne(
+                                 {'Product_ID': product_id},
+                                 {'$set': item},
+                                 upsert=True
+                             )
+                         )
+
+                 if operations:
+                     result = collection.bulk_write(operations)
+                     return {
+                         "success": True,
+                         "collection": collection_name,
+                         "inserted": result.upserted_count,
+                         "modified": result.modified_count,
+                         "matched": result.matched_count,
+                         "total": len(data)
+                     }
+                 else:
+                     return {"success": False, "error": "No valid product IDs found"}
+             else:
+                 # Simple insert (may cause duplicates)
+                 result = collection.insert_many(data)
+                 return {
+                     "success": True,
+                     "collection": collection_name,
+                     "inserted": len(result.inserted_ids),
+                     "total": len(data)
+                 }
+
+         except OperationFailure as e:
+             return {"success": False, "error": f"MongoDB operation failed: {e}"}
+         except Exception as e:
+             return {"success": False, "error": f"Unexpected error: {e}"}
+
+     def test_connection(self) -> str:
+         """Test MongoDB connection and return status"""
+         try:
+             if self.connect():
+                 # Get database stats
+                 stats = self.db.command("dbstats")
+                 collections = self.db.list_collection_names()
+                 self.disconnect()
+                 return f"✅ Connected successfully!\n📊 Database: {self.database_name}\n📁 Collections: {len(collections)}\n💾 Size: {stats.get('dataSize', 0) / 1024 / 1024:.2f} MB"
+             else:
+                 return "❌ Connection failed"
+         except Exception as e:
+             return f"❌ Error: {str(e)}"
+
+
+ class DataProcessing:
+     def __init__(self):
+         pass
+
+     def get_data_from_excel_file(self, excel_path, key_match, collection_name,
+                                  processor_type="docling", mongo_handler=None):
+         """
+         Process Excel file and upload to MongoDB
+
+         Args:
+             excel_path: Path to Excel file
+             key_match: Category to match
+             collection_name: MongoDB collection name
+             processor_type: Type of PDF processor
+             mongo_handler: MongoDBHandler instance (required)
+         """
+         if not mongo_handler:
+             return "❌ MongoDB handler not provided"
+
+         all_sheets = pd.read_excel(excel_path, sheet_name=None, header=1)
+         sheet_names = list(all_sheets.keys())
+         sheets = {k: all_sheets[k] for k in sheet_names[2:]}
+
+         data = []
+
+         for sheet_name, df in sheets.items():
+             df.columns = df.columns.str.strip()
+             if "category 1" not in df.columns:
+                 df = pd.read_excel(excel_path, sheet_name=sheet_name, header=0)
+                 df.columns = df.columns.str.strip()
+
+             if "category 1" in df.columns:
+                 filtered = df[df["category 1"].astype(str).str.replace("\n", " ").str.strip() == key_match]
+                 data.append(filtered)
+
+         if data:
+             result_df = pd.concat(data, ignore_index=True)
+             result_df = result_df.where(pd.notnull(result_df), None)
+             result_df["HDSD"] = None
+
+             cols_to_drop = [col for col in result_df.columns if col.strip().lower().startswith("unnamed") or col.strip() == "a" or col == "STT"]
+             result_df = result_df.drop(columns=cols_to_drop, errors='ignore')
+
+             cols_to_replace = [col for col in result_df.columns if col not in ["Tóm tắt ưu điểm, tính năng", "Thông số kỹ thuật", "Nội dung Ưu điểm SP", "Ưu điểm"]]
+             result_df[cols_to_replace] = result_df[cols_to_replace].replace('\n', ' ', regex=True)
+
+             # Replace "none" values with None
+             result_df.loc[result_df["Thông số kỹ thuật"] == "none", "Thông số kỹ thuật"] = None
+             result_df.loc[result_df["Tóm tắt ưu điểm, tính năng"] == "none", "Tóm tắt ưu điểm, tính năng"] = None
+             result_df.loc[result_df["Tóm tắt TSKT"] == "none", "Tóm tắt TSKT"] = None
+             result_df.loc[result_df["Nội dung Ưu điểm SP"] == "none", "Nội dung Ưu điểm SP"] = None
+
+             result_df = result_df.map(lambda x: x.strip() if isinstance(x, str) else x)
+             result_df.drop_duplicates(subset=["Product_ID"], inplace=True)
+             result_df = self.data_normalization(result_df=result_df)
+             data = result_df.to_dict(orient="records")
+             data = self.convert_floats(data)
+             data = self.replace_nan_with_none(data)
+
+             # Process instructions based on processor type
+             if processor_type == "docling_with_ocr":
+                 data = self.process_instruction_with_tesseract(data)
+             else:
+                 data = self.process_instruction(data)
+
+             # Upload to MongoDB
+             if not mongo_handler.connect():
+                 return "❌ Failed to connect to MongoDB"
+
+             result = mongo_handler.upload_data(data, collection_name, upsert=True)
+             mongo_handler.disconnect()
+
+             if result.get("success"):
+                 return f"✅ Uploaded to MongoDB collection '{result['collection']}':\n" \
+                        f"  • Total records: {result['total']}\n" \
+                        f"  • Inserted: {result.get('inserted', 0)}\n" \
+                        f"  • Updated: {result.get('modified', 0)}"
+             else:
+                 return f"❌ MongoDB upload failed: {result.get('error', 'Unknown error')}"
+         else:
+             return f"❌ Data not found for key: {key_match}"
+
+     def convert_floats(self, obj):
+         if isinstance(obj, float) and obj.is_integer():
+             return int(obj)
+         elif isinstance(obj, list):
+             return [self.convert_floats(i) for i in obj]
+         elif isinstance(obj, dict):
+             return {k: self.convert_floats(v) for k, v in obj.items()}
+         else:
+             return obj
+
+     def strip_redundant_space(self, text):
+         cleaned_text = " ".join(text.strip().split())
+         return cleaned_text
+
+     def convert_tag_to_dict(self, tag_str: str) -> dict:
+         if not isinstance(tag_str, str) or not tag_str.strip().startswith("{"):
+             return {}
+
+         try:
+             fixed = re.sub(r'([{,]\s*)(\w+)\s*:', r'\1"\2":', tag_str)
+             raw_pairs = fixed.strip('{} ').split(',')
+             raw_pairs = [pair.strip() for pair in raw_pairs if pair.strip()]
+             result = {}
+
+             current_key = None
+             for pair in raw_pairs:
+                 if ':' in pair:
+                     key, value = pair.split(':', 1)
+                     key = key.strip().strip('"')
+                     value = value.strip()
+
+                     pattern = r',\s[A-Z]'
+                     match = re.search(pattern, value)
+                     if match:
+                         values = [v.strip() for v in value.split(',')]
+                     else:
+                         values = value
+                     result[key] = values
+                     current_key = key
+                 elif current_key:
+                     previous_value = result[current_key]
+                     if isinstance(previous_value, list):
+                         result[current_key].append(pair.strip())
+                     else:
+                         result[current_key] = [previous_value, pair.strip()]
+
+             return result
+
+         except Exception as e:
+             print(f"Error parse tag: {tag_str} -> {e}")
+             return {}
+
+     def convert_tags_to_numeric(self, tags_dict):
+         keys_to_convert = ["dung_tich", "cong_suat", "lo_khoet_tran", "so_cuc", "so_hat", "modules", "cuon_day", "kich_thuoc"]
+
+         new_tags = {}
+         for key, value in tags_dict.items():
+             if key in keys_to_convert:
+                 match = re.search(r'([\d.]+)', str(value))
+                 if match:
+                     num = float(match.group(1))
+                     new_tags[key] = int(num) if num.is_integer() else num
+                 else:
+                     new_tags[key] = value
+             else:
+                 new_tags[key] = value
+         return new_tags
+
+     def data_normalization(self, result_df):
+         if "Tags" in result_df.columns:
+             result_df["Tags"] = result_df["Tags"].astype(str).str.lower().apply(self.convert_tag_to_dict)
+             result_df["Tags"] = result_df["Tags"].apply(self.convert_tags_to_numeric)
+
+         if "Giá" in result_df.columns:
+             result_df["Giá"] = result_df["Giá"].apply(lambda x: "Liên hệ" if x == 0 else x)
+
+         if "Tên sản phẩm" in result_df.columns:
+             result_df["Tên sản phẩm"] = result_df["Tên sản phẩm"].apply(self.strip_redundant_space)
+
+         for col_name in result_df.columns:
+             if col_name in ["Tóm tắt TSKT", "Thông số kỹ thuật"]:
+                 result_df[col_name] = result_df[col_name].astype(str).str.lower().str.strip()
+
+         return result_df
+
+     def replace_nan_with_none(self, obj):
+         if isinstance(obj, float) and math.isnan(obj):
+             return None
+         elif isinstance(obj, dict):
+             return {k: self.replace_nan_with_none(v) for k, v in obj.items()}
+         elif isinstance(obj, list):
+             return [self.replace_nan_with_none(i) for i in obj]
+         else:
+             return obj
+
+     @staticmethod
+     def download_pdf_with_retry(url, max_retries=3, timeout=30):
+         """Download PDF with retry logic and better error handling"""
+         for attempt in range(max_retries):
+             try:
+                 print(f"Downloading PDF (attempt {attempt + 1}/{max_retries})...")
+
+                 session = requests.Session()
+                 session.headers.update({
+                     'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
+                 })
+
+                 response = session.get(url, stream=True, timeout=timeout)
+                 response.raise_for_status()
+
+                 content_length = response.headers.get('content-length')
+                 if content_length:
+                     print(f"Expected file size: {int(content_length):,} bytes")
+
+                 content = b''
+                 chunk_size = 8192
+                 downloaded = 0
+
+                 for chunk in response.iter_content(chunk_size=chunk_size):
+                     if chunk:
+                         content += chunk
+                         downloaded += len(chunk)
+
+                 print(f"\nDownload completed: {len(content):,} bytes")
+                 return content
+
+             except (requests.exceptions.RequestException, IncompleteRead, ConnectionError) as e:
+                 print(f"Download attempt {attempt + 1} failed: {e}")
+                 if attempt < max_retries - 1:
+                     wait_time = 2 ** attempt
+                     print(f"Waiting {wait_time} seconds before retry...")
+                     time.sleep(wait_time)
+                 else:
+                     print("All download attempts failed")
+                     raise e
+
+     @staticmethod
+     def process_pdf_with_docling(url):
+         """Process PDF from URL using Docling for better structure extraction"""
+         try:
+             pdf_content = DataProcessing.download_pdf_with_retry(url)
+
+             with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as tmp_file:
+                 tmp_file.write(pdf_content)
+                 tmp_path = tmp_file.name
+
+             print(f"PDF saved to temporary file: {tmp_path}")
+
+             pipeline_options = PdfPipelineOptions()
+             pipeline_options.do_ocr = False
+             pipeline_options.do_table_structure = False
+
+             converter = DocumentConverter(
+                 format_options={
+                     InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)
+                 }
+             )
+
+             print("Converting document with Docling...")
+             result = converter.convert(tmp_path)
+
+             os.unlink(tmp_path)
+             print("Temporary file cleaned up")
+
+             return result
+
+         except Exception as e:
+             print(f"Error processing PDF with Docling from URL {url}: {e}")
+             return None
+
+     @staticmethod
+     def extract_content_from_docling_result(docling_result):
+         """Extract content from Docling result in a more robust way"""
+         if not docling_result:
+             return None
+
+         try:
+             doc = docling_result.document
+
+             try:
+                 markdown_content = doc.export_to_markdown()
+                 return {'markdown': markdown_content}
+             except Exception as e:
+                 print(f"Markdown export failed: {e}")
+
+             if hasattr(doc, 'main_text'):
+                 return {'text': doc.main_text}
+
+             if hasattr(doc, 'body') and doc.body:
+                 content = []
+                 for element in doc.body:
+                     content.append(str(element))
+                 return {'text': '\n'.join(content)}
+
+             if hasattr(doc, 'elements') and doc.elements:
+                 content = []
+                 for element in doc.elements:
+                     content.append(str(element))
+                 return {'text': '\n'.join(content)}
+
+             return {'error': 'No accessible content found'}
+
+         except Exception as e:
+             return {'error': f"Error extracting content: {e}"}
+
+     def process_instruction(self, data):
+         """Extract user-manual (HDSD) content for each product"""
+         tmp_data = data[:]
+         for item in tmp_data:
+             instruction_url = item.get("Link file HDSD", None)
+             if not instruction_url:
+                 print("No instruction URL found, skipping...")
+                 item["HDSD"] = ""
+                 continue
+
+             if "https://" not in instruction_url and "http://" not in instruction_url:
+                 print("Wrong URL, but has instruction info")
+                 item["HDSD"] = instruction_url
+                 continue
+
+             if "hdsd" not in instruction_url or "Khong" in instruction_url:
+                 print("invalid instruction url/content")
+                 item["HDSD"] = ""
+                 continue
+
+             raw_result = DataProcessing.process_pdf_with_docling(instruction_url)
+             if raw_result:
+                 extract_result = DataProcessing.extract_content_from_docling_result(raw_result)
+                 if 'markdown' in extract_result.keys():
+                     item["HDSD"] = re.sub(r"<!--\s*image\s*-->", '', extract_result['markdown'], flags=re.IGNORECASE).strip()
+                 elif 'text' in extract_result.keys():
+                     item["HDSD"] = re.sub(r"<!--\s*image\s*-->", '', extract_result['text'], flags=re.IGNORECASE).strip()
+
+         return tmp_data
+
+
+ def process_single_category(excel_path, category_name, processor_type,
+                             mongo_connection, mongo_database,
+                             progress=gr.Progress()):
+     """Process a single product category and upload to MongoDB"""
+
+     if excel_path is None:
+         return "❌ Please upload an Excel file first"
+
+     # Category mapping
+     category_mapping = {
+         "Sản phẩm nhà thông minh": ("Sản phẩm nhà thông minh", "sp_nha_thong_minh"),
+         "Đèn LED": ("Đèn LED", "sp_chieu_sang"),
+         "Chiếu sáng chuyên dụng": ("Chiếu sáng chuyên dụng", "sp_chuyen_dung"),
+         "Thiết bị điện": ("Thiết bị điện", "sp_thiet_bi_dien"),
+         "Phích nước": ("Phích nước", "sp_phich_nuoc"),
+     }
+
+     if category_name not in category_mapping:
+         return f"❌ Unknown category: {category_name}"
+
+     key_match, collection_name = category_mapping[category_name]
+
+     try:
+         progress(0.1, desc="Initializing data processor...")
+         dp = DataProcessing()
+
+         # Initialize MongoDB handler
+         mongo_handler = MongoDBHandler(
+             connection_string=mongo_connection if mongo_connection else None,
+             database_name=mongo_database if mongo_database else "product_database"
+         )
+
+         progress(0.3, desc=f"Processing {category_name} with {processor_type}...")
+         result = dp.get_data_from_excel_file(
+             excel_path=excel_path,
+             key_match=key_match,
+             collection_name=collection_name,
+             processor_type=processor_type,
+             mongo_handler=mongo_handler
+         )
+
+         progress(1.0, desc="Processing completed!")
+         return result
+
+     except Exception as e:
+         return f"❌ Error processing {category_name}: {str(e)}"
+
+
+ def process_all_categories(excel_path, processor_type,
+                            mongo_connection, mongo_database, progress=gr.Progress()):
+     """Process all product categories and upload to MongoDB"""
+     if excel_path is None:
+         return "❌ Please upload an Excel file first"
+
+     categories = [
+         "Sản phẩm nhà thông minh",
+         "Đèn LED",
+         "Chiếu sáng chuyên dụng",
+         "Thiết bị điện",
+         "Phích nước"
+     ]
+
+     results = []
+     total_categories = len(categories)
+
+     for i, category in enumerate(categories):
+         progress((i + 1) / total_categories, desc=f"Processing {category}...")
+         result = process_single_category(
+             excel_path, category, processor_type,
+             mongo_connection, mongo_database
+         )
+         results.append(f"{category}: {result}")
+
+     return "\n".join(results)
+
+
+ def test_mongo_connection(connection_string, database_name):
+     """Test MongoDB connection"""
+     if not connection_string:
+         connection_string = "mongodb://localhost:27017/"
+     if not database_name:
+         database_name = "product_database"
+
+     handler = MongoDBHandler(connection_string, database_name)
+     return handler.test_connection()
+
+
+ def create_processing_interface():
+     """Create Gradio interface with MongoDB-only storage"""
+     with gr.Blocks(title="Data Processing - Product Metadata Extractor") as demo:
+         gr.Markdown("# 📊 Product Data Processing")
+         gr.Markdown("Extract and process product metadata from Excel files and upload to MongoDB")
+
+         with gr.Row():
+             with gr.Column(scale=1):
+                 gr.Markdown("### 📤 Upload Excel File")
+                 excel_upload = gr.File(
+                     label="Upload Excel File",
+                     file_types=[".xlsx", ".xls"],
+                     type="filepath"
+                 )
+
+                 gr.Markdown("### ⚙️ Processing Settings")
+                 processor_dropdown = gr.Dropdown(
+                     choices=["docling"],
+                     value="docling",
+                     label="PDF Processor Type",
+                     info="Using basic docling for fast processing"
+                 )
+
+                 category_dropdown = gr.Dropdown(
+                     choices=[
+                         "Sản phẩm nhà thông minh",
+                         "Đèn LED",
+                         "Chiếu sáng chuyên dụng",
+                         "Thiết bị điện",
+                         "Phích nước"
+                     ],
+                     value="Sản phẩm nhà thông minh",
+                     label="Product Category",
+                     info="Select which product category to process"
+                 )
+
+                 gr.Markdown("### 🗄️ MongoDB Configuration")
+                 mongo_connection = gr.Textbox(
+                     label="MongoDB Connection String",
+                     placeholder="mongodb+srv://<username>:<password>@cluster.mongodb.net/?retryWrites=true&w=majority",
+                     value=MONGODB_URI,
+                     info="MongoDB connection string"
+                 )
+                 mongo_database = gr.Textbox(
+                     label="Database Name",
+                     placeholder="product_database",
+                     value="product_database",
+                     info="Name of the MongoDB database"
+                 )
+                 test_connection_btn = gr.Button("🔌 Test Connection", size="sm")
+                 connection_status = gr.Textbox(
+                     label="Connection Status",
+                     interactive=False,
+                     lines=3
+                 )
+
+             with gr.Column(scale=2):
+                 output_box = gr.Textbox(
+                     lines=15,
+                     label="📋 Processing Log",
+                     placeholder="Processing results will appear here..."
+                 )
+
+                 gr.Markdown("### 🚀 Actions")
+                 with gr.Row():
+                     process_single_btn = gr.Button("🔄 Process Selected Category", variant="primary")
+                     process_all_btn = gr.Button("🔄 Process All Categories", variant="secondary")
+
+                 gr.Markdown("### 📖 Information")
+                 with gr.Accordion("MongoDB Collections", open=False):
+                     gr.Markdown("""
+                     **📦 Collections**:
+                     - `sp_nha_thong_minh` - Sản phẩm nhà thông minh
+                     - `sp_chieu_sang` - Đèn LED
+                     - `sp_chuyen_dung` - Chiếu sáng chuyên dụng
+                     - `sp_thiet_bi_dien` - Thiết bị điện
+                     - `sp_phich_nuoc` - Phích nước
+
+                     **🔄 Upsert Logic**:
+                     - Existing records are updated based on `Product_ID`
+                     - New records are inserted automatically
+                     - Timestamps `_created_at` and `_updated_at` are managed automatically
+                     """)
+
+                 with gr.Accordion("Processor Types", open=False):
+                     gr.Markdown("""
+                     **🔹 docling**: Basic PDF text extraction
+                     - Fast processing
+                     - Good for text-based PDFs
+                     - No OCR capabilities
+                     """)
+
+         # Event handlers
+         test_connection_btn.click(
+             fn=test_mongo_connection,
+             inputs=[mongo_connection, mongo_database],
+             outputs=[connection_status]
+         )
+
+         process_single_btn.click(
+             fn=process_single_category,
+             inputs=[
+                 excel_upload, category_dropdown, processor_dropdown,
+                 mongo_connection, mongo_database
+             ],
+             outputs=output_box,
+             show_progress=True
+         )
+
+         process_all_btn.click(
+             fn=process_all_categories,
+             inputs=[
+                 excel_upload, processor_dropdown,
+                 mongo_connection, mongo_database
+             ],
+             outputs=output_box,
+             show_progress=True
+         )
+
+     return demo
+
+
+ if __name__ == "__main__":
+     demo = create_processing_interface()
+     demo.launch(share=False, server_name="localhost", server_port=7860)
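
Note: MongoDBHandler.upload_data upserts records keyed on Product_ID with a single bulk_write of UpdateOne operations. A minimal standalone sketch of that upsert pattern (the URI, database and collection names here are placeholders) is:

from pymongo import MongoClient, UpdateOne

def upsert_products(uri: str, db_name: str, collection_name: str, records: list) -> dict:
    # One UpdateOne per record, keyed on Product_ID, applied in a single bulk_write.
    client = MongoClient(uri, serverSelectionTimeoutMS=5000)
    try:
        coll = client[db_name][collection_name]
        operations = [
            UpdateOne({"Product_ID": r["Product_ID"]}, {"$set": r}, upsert=True)
            for r in records
            if r.get("Product_ID")
        ]
        if not operations:
            return {"error": "No valid product IDs found"}
        result = coll.bulk_write(operations)
        return {
            "inserted": result.upserted_count,
            "modified": result.modified_count,
            "matched": result.matched_count,
        }
    finally:
        client.close()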